
Scraping from arXiv for QEE

    Authored by Ross John Hunter

    A short Python script that scrapes arXiv in order to generate the weekly QEE summary. The resulting text file can be copy-pasted directly into an email.

    To use:

    • Follow the instructions on the QEE Convener Twiki to set up the filtered searches for hep-ex and hep-ph,
    • Replace the hepex and hepph variables with the updated URLs for your current search (if only the date range has changed, see the sketch just after this list),
    • Edit the preamble to change the opening of the email text if you like (or if your name isn't Ross),
    • Run pip install requests beautifulsoup4 if you don't already have those packages,
    • Run the script with python arxiv_web_scraping.py.
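
    If the search terms are unchanged from one week to the next, updating the URLs usually just means bumping the date-from_date value that appears in them. A minimal sketch (the helper and the dates here are only illustrative, not part of the script):

    def bump_from_date(url: str, old_date: str, new_date: str) -> str:
        # Swap the start date in last week's advanced-search URL for this week's
        return url.replace(f"date-from_date={old_date}", f"date-from_date={new_date}")

    # e.g. hepex = bump_from_date(hepex, "2025-02-24", "2025-03-03")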

    You should then find a new file called arxiv_summary.txt, which you can copy-paste directly into an email. Before sending, read the log for WARNINGs and quickly check that the output corresponds to what you can read directly on arXiv.

    I take no responsibility for people using this and not checking the results it gives.

    I may one day update this so that the filtered search URLs are autogenerated for you.
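
    In the meantime, here is a rough sketch of how that autogeneration could look, assuming the query terms stay fixed and only the archive and start date change week to week. The build_search_url helper is hypothetical; the parameter names are simply the ones that appear in the hand-built URLs in the script below.

    from urllib.parse import urlencode

    # (operator, term, field) triplets, copied from the hand-built URLs in the script
    TERMS = [
        ("AND", "boson", "title"),
        ("OR", "jet*", "title"),
        ("OR", "long*lived", "title"),
        ("OR", "top", "title"),
        ("NOT", "super", "abstract"),
        ("NOT", "supersymmetric", "abstract"),
        ("OR", "HNL", "abstract"),
        ("OR", "axion*", "title"),
        ("NOT", "talk", "comments"),
        ("NOT", "workshop", "comments"),
        ("OR", '"dark photon*"', "abstract"),
    ]

    def build_search_url(archive: str, from_date: str) -> str:
        # Assemble the advanced-search query string from the fixed terms,
        # the chosen archive (hep-ex or hep-ph) and the start date
        params = {"advanced": ""}
        for i, (op, term, field) in enumerate(TERMS):
            params[f"terms-{i}-operator"] = op
            params[f"terms-{i}-term"] = term
            params[f"terms-{i}-field"] = field
        params.update({
            "classification-physics": "y",
            "classification-physics_archives": archive,
            "classification-include_cross_list": "include",
            "date-year": "",
            "date-filter_by": "date_range",
            "date-from_date": from_date,
            "date-to_date": "",
            "date-date_type": "submitted_date_first",
            "abstracts": "show",
            "size": "200",
            "order": "-announced_date_first",
        })
        return "https://arxiv.org/search/advanced?" + urlencode(params)

    # e.g. hepex = build_search_url("hep-ex", "2025-02-24")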

    arxiv_web_scraping.py
    import requests
    from typing import List
    from bs4 import BeautifulSoup
    from pprint import pprint
    
    # In general you will need to edit these each week
    # If you feel like it you could also autogenerate these and fill in the date from the command line, but I haven't been 
    # bothered to do this yet.
    hepex = "https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term=boson&terms-0-field=title&terms-1-operator=OR&terms-1-term=jet*&terms-1-field=title&terms-2-operator=OR&terms-2-term=long*lived&terms-2-field=title&terms-3-operator=OR&terms-3-term=top&terms-3-field=title&terms-4-operator=NOT&terms-4-term=super&terms-4-field=abstract&terms-5-operator=NOT&terms-5-term=supersymmetric&terms-5-field=abstract&terms-6-operator=OR&terms-6-term=HNL&terms-6-field=abstract&terms-7-operator=OR&terms-7-term=axion*&terms-7-field=title&terms-8-operator=NOT&terms-8-term=talk&terms-8-field=comments&terms-9-operator=NOT&terms-9-term=workshop&terms-9-field=comments&terms-10-operator=OR&terms-10-term=%22dark+photon*%22&terms-10-field=abstract&classification-physics=y&classification-physics_archives=hep-ex&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2025-02-24&date-to_date=&date-date_type=submitted_date_first&abstracts=show&size=200&order=-announced_date_first"
    
    hepph = "https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term=boson&terms-0-field=title&terms-1-operator=OR&terms-1-term=jet*&terms-1-field=title&terms-2-operator=OR&terms-2-term=long*lived&terms-2-field=title&terms-3-operator=OR&terms-3-term=top&terms-3-field=title&terms-4-operator=NOT&terms-4-term=super&terms-4-field=abstract&terms-5-operator=NOT&terms-5-term=supersymmetric&terms-5-field=abstract&terms-6-operator=OR&terms-6-term=HNL&terms-6-field=abstract&terms-7-operator=OR&terms-7-term=axion*&terms-7-field=title&terms-8-operator=NOT&terms-8-term=talk&terms-8-field=comments&terms-9-operator=NOT&terms-9-term=workshop&terms-9-field=comments&terms-10-operator=OR&terms-10-term=%22dark+photon*%22&terms-10-field=abstract&classification-physics=y&classification-physics_archives=hep-ph&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2025-02-24&date-to_date=&date-date_type=submitted_date_first&abstracts=show&size=200&order=-announced_date_first"
    
    preamble = """
    Dear QEE colleagues, 
    
    Please find below a summary of arXiv contributions with topics related to QEE in the past ~week. Happy reading!
    
    Best,
    
    Ross, on behalf of the QEE (sub)Conveners 
    """
    
    
    def recursive_remove_double_wspace(line: str) -> str:
        if '  ' not in line:
            return line
        else:
            new_line = line.replace('  ', ' ') 
            return recursive_remove_double_wspace(new_line)
    
    
    def find_idx(entry: List[str], substr: str) -> int:
        indices = [i_l for i_l, l in enumerate(entry) if substr in l]
        if len(indices) != 1:
            pprint(entry)
            raise RuntimeError(f"Expected exactly one '{substr}' token, found {len(indices)}: {indices}")
        return indices[0]
    
    
    def format_entry(entry: List[str]) -> List[str]:
        # Remove whitespace lines
        entry = [l for l in entry if not (l.isspace() or l == '')]
    
        i_arxiv_no = 0
        entry = [entry[i_arxiv_no]] + entry[i_arxiv_no+2::] # Remove pdf link
    
        # The '▽ More' token marks the end of the truncated abstract preview - drop the token and the
        # preview line just before it, so only the full abstract remains
        i_more_token = find_idx(entry, '▽ More')
        entry = entry[0:i_more_token-1] + entry[i_more_token+1::]
    
        # Remove also all including and after the 'Less' token
        i_less_token = find_idx(entry, '△ Less')
        entry = entry[0:i_less_token+1]
    
        # Done trimming - now pick out the lines we want
        i_author = find_idx(entry, 'Author')
        i_title = i_author - 1
        arxiv_serv_l = " ".join([l for l in entry[0:i_title] if ('hep-ex' in l or 'hep-ph' in l or 'arXiv' in l)])
    
        bold = lambda txt: f"*{txt}*"
    
        i_abs = find_idx(entry, 'Abstract')
        author_ls = entry[i_author+1:i_abs]
        if len(author_ls) > 5:
            author_ls = author_ls[0:5] + ["et al."]
        author_l = " ".join([bold(entry[i_author])] + [l.lstrip().rstrip() for l in author_ls])
        abs_l = "".join([bold(entry[i_abs])] + entry[i_abs+1:-1])
        
        formatted = [
            arxiv_serv_l,
            entry[i_title].lstrip(),
            author_l,
            abs_l
        ]
        formatted = [recursive_remove_double_wspace(l) for l in formatted]
        return formatted
    
    
    def find_entry_starts(lines: List[str]) -> List[int]:
        starts = []
        for i_line, line in enumerate(lines):
            # An entry starts at an 'arXiv:' line immediately followed by the pdf/links line
            if 'arXiv:' in line and i_line + 1 < len(lines) and 'pdf,' in lines[i_line+1]:
                starts.append(i_line)
        return starts
    
    
    def main():
    
        formatted_entries = []
    
        for k, websearch in {'hep-ex': hepex, 'hep-ph': hepph}.items():
            web_resp = requests.get(websearch)
            web_text = BeautifulSoup(web_resp.text, 'html.parser').get_text()
            lines = web_text.split('\n')
    
            # Now find all the entry starts
            entry_start_lines = find_entry_starts(lines)
            n_entries = len(entry_start_lines)
            entry_end_lines = [i_l-1 for i_l in entry_start_lines[1::]] + [len(lines)]
            print(f"Found {len(entry_start_lines)} entries for {k}")
    
            naive_starts = [i_line for i_line, line in enumerate(lines) if "arXiv:" in line]
            naive_n_entries = len(naive_starts)
            if naive_n_entries != n_entries:
                print(f"WARNING:\t Found {n_entries} entries but naively expected {naive_n_entries}...")
                print("Naive guesses at:", naive_starts)
                print("More careful at:", entry_start_lines)
            
            for i_entry, entry_start in enumerate(entry_start_lines):
                entry = lines[entry_start:entry_end_lines[i_entry]] 
    
                formatted = "\n".join(format_entry(entry))
    
                if formatted not in formatted_entries:
                    formatted_entries.append(formatted)
    
        print(f"Got {len(formatted_entries)} formatted and non-overlapping entries")
        formatted_and_numbered = [f"{i}. {entry}" for i, entry in enumerate(formatted_entries, start=1)]
    
        border = "="*104
        formatted_and_numbered.insert(0, border)
        formatted_and_numbered.insert(0, preamble)
        formatted_and_numbered.append(border)
    
        # Concatenate into a single string and write it to the file
        outfile = 'arxiv_summary.txt'
        with open(outfile, 'w') as f:
            f.write("\n\n".join(formatted_and_numbered))
    
        print(f"Written output to {outfile}")
    
    if __name__ == "__main__":
        main()