
Scraping from arXiv for QEE

    Authored by Ross John Hunter

    A short Python script that scrapes arXiv in order to generate the weekly QEE summary. The resulting text file can be copy-pasted directly into an email.

    To use:

    • Follow the instructions on the QEE Convener Twiki to set up the filtered searches for hep-ex and hep-ph,
    • Replace the hepex and hepph variables with the updated URLs for your current search (if only the date range has changed, see the sketch just after this list),
    • Edit the preamble to change the opening of the email text if you like (or if your name isn't Ross),
    • Run pip install requests beautifulsoup4 if you don't already have those packages,
    • Run the script with python arxiv_web_scraping.py.
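
    If the search terms are unchanged from one week to the next, updating the URLs usually just means bumping the date-from_date value that appears in them. A minimal sketch (the helper and the dates here are only illustrative, not part of the script):

    def bump_from_date(url: str, old_date: str, new_date: str) -> str:
        # Swap the start date in last week's advanced-search URL for this week's
        return url.replace(f"date-from_date={old_date}", f"date-from_date={new_date}")

    # e.g. hepex = bump_from_date(hepex, "2025-02-24", "2025-03-03")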

    You should then find a new file called arxiv_summary.txt, which you can copy-paste directly into an email. Before sending, read the log for WARNINGs and quickly check that the output corresponds to what you can read directly on arXiv.

    I take no responsibility for people using this and not checking the results it gives.

    I may one day update this so that the filtered search URLs are autogenerated for you.
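
    In the meantime, here is a rough sketch of how that autogeneration could look, assuming the query terms stay fixed and only the archive and start date change week to week. The build_search_url helper is hypothetical; the parameter names are simply the ones that appear in the hand-built URLs in the script below.

    from urllib.parse import urlencode

    # (operator, term, field) triplets, copied from the hand-built URLs in the script
    TERMS = [
        ("AND", "boson", "title"),
        ("OR", "jet*", "title"),
        ("OR", "long*lived", "title"),
        ("OR", "top", "title"),
        ("NOT", "super", "abstract"),
        ("NOT", "supersymmetric", "abstract"),
        ("OR", "HNL", "abstract"),
        ("OR", "axion*", "title"),
        ("NOT", "talk", "comments"),
        ("NOT", "workshop", "comments"),
        ("OR", '"dark photon*"', "abstract"),
    ]

    def build_search_url(archive: str, from_date: str) -> str:
        # Assemble the advanced-search query string from the fixed terms,
        # the chosen archive (hep-ex or hep-ph) and the start date
        params = {"advanced": ""}
        for i, (op, term, field) in enumerate(TERMS):
            params[f"terms-{i}-operator"] = op
            params[f"terms-{i}-term"] = term
            params[f"terms-{i}-field"] = field
        params.update({
            "classification-physics": "y",
            "classification-physics_archives": archive,
            "classification-include_cross_list": "include",
            "date-year": "",
            "date-filter_by": "date_range",
            "date-from_date": from_date,
            "date-to_date": "",
            "date-date_type": "submitted_date_first",
            "abstracts": "show",
            "size": "200",
            "order": "-announced_date_first",
        })
        return "https://arxiv.org/search/advanced?" + urlencode(params)

    # e.g. hepex = build_search_url("hep-ex", "2025-02-24")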

    arxiv_web_scraping.py
    import requests
    from typing import List
    from bs4 import BeautifulSoup
    from pprint import pprint
    
    # In general you will need to edit these each week
    # If you feel like it you could also autogenerate these and fill in the date from the command line, but I haven't been 
    # bothered to do this yet.
    hepex = "https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term=boson&terms-0-field=title&terms-1-operator=OR&terms-1-term=jet*&terms-1-field=title&terms-2-operator=OR&terms-2-term=long*lived&terms-2-field=title&terms-3-operator=OR&terms-3-term=top&terms-3-field=title&terms-4-operator=NOT&terms-4-term=super&terms-4-field=abstract&terms-5-operator=NOT&terms-5-term=supersymmetric&terms-5-field=abstract&terms-6-operator=OR&terms-6-term=HNL&terms-6-field=abstract&terms-7-operator=OR&terms-7-term=axion*&terms-7-field=title&terms-8-operator=NOT&terms-8-term=talk&terms-8-field=comments&terms-9-operator=NOT&terms-9-term=workshop&terms-9-field=comments&terms-10-operator=OR&terms-10-term=%22dark+photon*%22&terms-10-field=abstract&classification-physics=y&classification-physics_archives=hep-ex&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2025-02-24&date-to_date=&date-date_type=submitted_date_first&abstracts=show&size=200&order=-announced_date_first"
    
    hepph = "https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term=boson&terms-0-field=title&terms-1-operator=OR&terms-1-term=jet*&terms-1-field=title&terms-2-operator=OR&terms-2-term=long*lived&terms-2-field=title&terms-3-operator=OR&terms-3-term=top&terms-3-field=title&terms-4-operator=NOT&terms-4-term=super&terms-4-field=abstract&terms-5-operator=NOT&terms-5-term=supersymmetric&terms-5-field=abstract&terms-6-operator=OR&terms-6-term=HNL&terms-6-field=abstract&terms-7-operator=OR&terms-7-term=axion*&terms-7-field=title&terms-8-operator=NOT&terms-8-term=talk&terms-8-field=comments&terms-9-operator=NOT&terms-9-term=workshop&terms-9-field=comments&terms-10-operator=OR&terms-10-term=%22dark+photon*%22&terms-10-field=abstract&classification-physics=y&classification-physics_archives=hep-ph&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2025-02-24&date-to_date=&date-date_type=submitted_date_first&abstracts=show&size=200&order=-announced_date_first"
    
    preamble = """
    Dear QEE colleagues, 
    
    Please find below a summary of arXiv contributions with topics related to QEE in the past ~week. Happy reading!
    
    Best,
    
    Ross, on behalf of the QEE (sub)Conveners 
    """
    
    
    def recursive_remove_double_wspace(line: str) -> str:
        if '  ' not in line:
            return line
        else:
            new_line = line.replace('  ', ' ') 
            return recursive_remove_double_wspace(new_line)
    
    
    def find_idx(entry: List[str], substr: str) -> int:
        indices = [i_l for i_l, l in enumerate(entry) if substr in l]
        if len(indices) != 1:
            pprint(entry)
            raise RuntimeError(f"Expected exactly one '{substr}' token, found {len(indices)}: {indices}")
        return indices[0]
    
    
    def format_entry(entry: List[str]) -> List[str]:
        # Remove whitespace lines
        entry = [l for l in entry if not (l.isspace() or l == '')]
    
        i_arxiv_no = 0
        entry = [entry[i_arxiv_no]] + entry[i_arxiv_no+2::] # Remove pdf link
    
        # The '▽ More' token marks the end of the truncated abstract preview - drop the token and the
        # preview line just before it, so only the full abstract remains
        i_more_token = find_idx(entry, '▽ More')
        entry = entry[0:i_more_token-1] + entry[i_more_token+1::]
    
        # Remove also all including and after the 'Less' token
        i_less_token = find_idx(entry, '△ Less')
        entry = entry[0:i_less_token+1]
    
        # Done trimming - now pick out the lines we want
        i_author = find_idx(entry, 'Author')
        i_title = i_author - 1
        arxiv_serv_l = " ".join([l for l in entry[0:i_title] if ('hep-ex' in l or 'hep-ph' in l or 'arXiv' in l)])
    
        bold = lambda txt: f"*{txt}*"
    
        i_abs = find_idx(entry, 'Abstract')
        author_ls = entry[i_author+1:i_abs]
        if len(author_ls) > 5:
            author_ls = author_ls[0:5] + ["et al."]
        author_l = " ".join([bold(entry[i_author])] + [l.lstrip().rstrip() for l in author_ls])
        abs_l = "".join([bold(entry[i_abs])] + entry[i_abs+1:-1])
        
        formatted = [
            arxiv_serv_l,
            entry[i_title].lstrip(),
            author_l,
            abs_l
        ]
        formatted = [recursive_remove_double_wspace(l) for l in formatted]
        return formatted
    
    
    def find_entry_starts(lines: List[str]) -> List[int]:
        starts = []
        for i_line, line in enumerate(lines):
            # An entry starts at an 'arXiv:' line immediately followed by the pdf/links line
            if 'arXiv:' in line and i_line + 1 < len(lines) and 'pdf,' in lines[i_line+1]:
                starts.append(i_line)
        return starts
    
    
    def main():
    
        formatted_entries = []
    
        for k, websearch in {'hep-ex': hepex, 'hep-ph': hepph}.items():
            web_resp = requests.get(websearch)
            web_text = BeautifulSoup(web_resp.text, 'html.parser').get_text()
            lines = web_text.split('\n')
    
            # Now find all the entry starts
            entry_start_lines = find_entry_starts(lines)
            n_entries = len(entry_start_lines)
            entry_end_lines = [i_l-1 for i_l in entry_start_lines[1::]] + [len(lines)]
            print(f"Found {len(entry_start_lines)} entries for {k}")
    
            naive_starts = [i_line for i_line, line in enumerate(lines) if "arXiv:" in line]
            naive_n_entries = len(naive_starts)
            if naive_n_entries != n_entries:
                print(f"WARNING:\t Found {n_entries} entries but naively expected {naive_n_entries}...")
                print("Naive guesses at:", naive_starts)
                print("More careful at:", entry_start_lines)
            
            for i_entry, entry_start in enumerate(entry_start_lines):
                entry = lines[entry_start:entry_end_lines[i_entry]] 
    
                formatted = "\n".join(format_entry(entry))
    
                if formatted not in formatted_entries:
                    formatted_entries.append(formatted)
    
        print(f"Got {len(formatted_entries)} formatted and non-overlapping entries")
        formatted_and_numbered = [f"{i}. {entry}" for i, entry in enumerate(formatted_entries, start=1)]
    
        border = "="*104
        formatted_and_numbered.insert(0, border)
        formatted_and_numbered.insert(0, preamble)
        formatted_and_numbered.append(border)
    
        # Concatenate into a single string and write it to the file
        outfile = 'arxiv_summary.txt'
        with open(outfile, 'w') as f:
            f.write("\n\n".join(formatted_and_numbered))
    
        print(f"Written output to {outfile}")
    
    if __name__ == "__main__":
        main()