Scraping from arXiv for QEE
The snippet can be accessed without any authentication.
Authored by
Ross John Hunter
A short python script to scrape from arXiv
in order to generate the weekly QEE summary. The resulting text file can be copy-pasted directly into an email.
To use:
- Follow the instructions on the QEE Convener Twiki to set up the filtered searches for hep-ex and hep-ph,
- Replace the hepex and hepph variables with the updated URLs for your current search,
- Edit the preamble to modify the beginning of your email text if you like / your name isn't Ross,
- Run `pip install requests beautifulsoup4` if you don't already have them,
- Execute this snippet with `python arxiv_web_scraping.py`.
You should then find a new file called arxiv_summary.txt
which you can copy-paste directly into an email. Before sending you should read the log for WARNINGs, and also have a quick check that the output you have corresponds to what you can read directly on arXiv.
I take no responsibility for people using this and not checking the results it gives.
I may one day update this so that the filtered search URLs are autogenerated for you.
arxiv_web_scraping.py 6.38 KiB
import requests
from typing import List
from bs4 import BeautifulSoup
from pprint import pprint
# In general need to edit these
# If you feel like it you could also autogenerate these and fill in the date from the command line, but I haven't been
# bothered to do this yet.
# Filtered advanced-search URL for hep-ex (built following the QEE Convener Twiki).
# Update date-from_date to the start of the week you want to summarise before each run.
hepex = "https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term=boson&terms-0-field=title&terms-1-operator=OR&terms-1-term=jet*&terms-1-field=title&terms-2-operator=OR&terms-2-term=long*lived&terms-2-field=title&terms-3-operator=OR&terms-3-term=top&terms-3-field=title&terms-4-operator=NOT&terms-4-term=super&terms-4-field=abstract&terms-5-operator=NOT&terms-5-term=supersymmetric&terms-5-field=abstract&terms-6-operator=OR&terms-6-term=HNL&terms-6-field=abstract&terms-7-operator=OR&terms-7-term=axion*&terms-7-field=title&terms-8-operator=NOT&terms-8-term=talk&terms-8-field=comments&terms-9-operator=NOT&terms-9-term=workshop&terms-9-field=comments&terms-10-operator=OR&terms-10-term=%22dark+photon*%22&terms-10-field=abstract&classification-physics=y&classification-physics_archives=hep-ex&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2025-02-24&date-to_date=&date-date_type=submitted_date_first&abstracts=show&size=200&order=-announced_date_first"
# Same query restricted to hep-ph instead of hep-ex (only classification-physics_archives differs).
hepph = "https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term=boson&terms-0-field=title&terms-1-operator=OR&terms-1-term=jet*&terms-1-field=title&terms-2-operator=OR&terms-2-term=long*lived&terms-2-field=title&terms-3-operator=OR&terms-3-term=top&terms-3-field=title&terms-4-operator=NOT&terms-4-term=super&terms-4-field=abstract&terms-5-operator=NOT&terms-5-term=supersymmetric&terms-5-field=abstract&terms-6-operator=OR&terms-6-term=HNL&terms-6-field=abstract&terms-7-operator=OR&terms-7-term=axion*&terms-7-field=title&terms-8-operator=NOT&terms-8-term=talk&terms-8-field=comments&terms-9-operator=NOT&terms-9-term=workshop&terms-9-field=comments&terms-10-operator=OR&terms-10-term=%22dark+photon*%22&terms-10-field=abstract&classification-physics=y&classification-physics_archives=hep-ph&classification-include_cross_list=include&date-year=&date-filter_by=date_range&date-from_date=2025-02-24&date-to_date=&date-date_type=submitted_date_first&abstracts=show&size=200&order=-announced_date_first"
# Text placed at the top of the generated email; edit the sign-off to taste.
preamble = """
Dear QEE colleagues,
Please find below a summary of arXiv contributions with topics related to QEE in the past ~week. Happy reading!
Best,
Ross, on behalf of the QEE (sub)Conveners
"""
def recursive_remove_double_wspace(line: str) -> str:
    """Collapse every run of consecutive spaces in *line* to a single space.

    Note: the snippet as pasted had the two-space literal collapsed to a
    single space by HTML rendering, which made ``line.replace(' ', ' ')`` a
    no-op and sent any space-containing line into infinite recursion. The
    intended double-space literal is restored here, and the self-recursion is
    replaced by an iterative loop (same fixed point, no recursion depth).
    """
    while '  ' in line:
        line = line.replace('  ', ' ')
    return line
def find_idx(entry: List[str], substr: str) -> int:
    """Return the index of the unique line in *entry* that contains *substr*.

    Args:
        entry: the text lines of one search-result entry.
        substr: marker token to search for (e.g. '▽ More', 'Abstract').

    Raises:
        RuntimeError: if *substr* occurs in zero lines or in more than one
            line; the entry is dumped via pprint first to aid debugging.
    """
    indices = [i_l for i_l, l in enumerate(entry) if substr in l ]
    if len(indices) != 1:
        pprint(entry)
        # The original message claimed "more than 1" even when the token was
        # missing entirely (zero matches also satisfy != 1); report the
        # actual count so the WARNING/log reads correctly in both cases.
        raise RuntimeError(f"Expected exactly 1 {substr} token but found {len(indices)}: {indices}")
    return indices[0]
def format_entry(entry: List[str]) -> List[str]:
    """Reduce the raw text lines of one search-result entry to four lines:
    arXiv id + subject classes, title, authors, and the short abstract.

    *entry* is the slice of page lines between one entry start and the next,
    as produced by BeautifulSoup's get_text() on an arXiv advanced-search
    results page — the positional logic below is tied to that layout.
    """
    # Remove whitespace lines so the positional slicing below holds
    entry = [l for l in entry if not (l.isspace() or l == '')]
    # Line 0 is the arXiv id; line 1 (the pdf link) is dropped
    i_arxiv_no = 0
    entry = [entry[i_arxiv_no]] + entry[i_arxiv_no+2::] # Remove pdf link
    # '▽ More' marks the end of the truncated abstract; remove that token and
    # the line immediately before it (presumably the truncation ellipsis —
    # TODO confirm against a live results page)
    i_more_token = find_idx(entry, '▽ More')
    entry = entry[0:i_more_token-1] + entry[i_more_token+1::]
    # Everything after the '△ Less' token is page boilerplate; keep up to it
    i_less_token = find_idx(entry, '△ Less')
    entry = entry[0:i_less_token+1]
    # Stop mutating now - now pick out the lines you want
    i_author = find_idx(entry, 'Author')
    # The title sits on the line just before the 'Authors:' marker
    i_title = i_author - 1
    # Join the subject-class / arXiv-id lines that precede the title
    arxiv_serv_l = " ".join([l for l in entry[0:i_title] if ('hep-ex' in l or 'hep-ph' in l or 'arXiv' in l)])
    bold = lambda txt: f"*{txt}*"  # bold markup for the email client
    i_abs = find_idx(entry, 'Abstract')
    # Author lines sit between the 'Authors:' and 'Abstract' markers;
    # long author lists are truncated to the first five plus "et. al."
    author_ls = entry[i_author+1:i_abs]
    if len(author_ls) > 5:
        author_ls = author_ls[0:5] + ["et. al."]
    author_l = " ".join([bold(entry[i_author])] + [l.lstrip().rstrip() for l in author_ls])
    # Abstract marker in bold, then the abstract body; the final line (the
    # '△ Less' token kept above) is excluded by the [:-1] slice
    abs_l = "".join([bold(entry[i_abs])] + entry[i_abs+1:-1])
    formatted = [
        arxiv_serv_l,
        entry[i_title].lstrip(),
        author_l,
        abs_l
    ]
    # Collapse any runs of spaces introduced by the joins above
    formatted = [recursive_remove_double_wspace(l) for l in formatted]
    return formatted
def find_entry_starts(lines: List[str]) -> List[int]:
    """Return the indices of lines that begin a search-result entry.

    A line starts an entry when it contains 'arXiv:' AND the following line
    contains 'pdf,' (the download-links line on arXiv search pages); the
    two-line test filters out in-text 'arXiv:' citations.
    """
    starts = []
    for i_line, line in enumerate(lines):
        # Bounds guard: 'arXiv:' on the very last line would otherwise raise
        # IndexError when peeking at lines[i_line + 1].
        if 'arXiv:' in line and i_line + 1 < len(lines) and 'pdf,' in lines[i_line+1]:
            starts.append(i_line)
    return starts
def main():
    """Scrape the hep-ex and hep-ph filtered searches, format every entry,
    de-duplicate across the two archives, and write the numbered summary
    to arxiv_summary.txt ready to paste into an email."""
    collected = []
    for archive, search_url in {'hep-ex': hepex, 'hep-ph': hepph}.items():
        response = requests.get(search_url)
        page_lines = BeautifulSoup(response.text, 'html.parser').get_text().split('\n')
        # Locate where each entry begins; an entry ends one line before the
        # next one starts (the final entry runs to the end of the page).
        starts = find_entry_starts(page_lines)
        ends = [s - 1 for s in starts[1:]] + [len(page_lines)]
        print(f"Found {len(starts)} entries for {archive}")
        # Sanity check: a naive count of every 'arXiv:' occurrence should
        # match the careful two-line detection; warn if it doesn't.
        naive_guesses = [i for i, text_line in enumerate(page_lines) if "arXiv:" in text_line]
        if len(naive_guesses) != len(starts):
            print(f"WARNING:\t Found {len(starts)} but naively expected {len(naive_guesses)}...")
            print("Naive guesses at:", naive_guesses)
            print("More careful at:", starts)
        for begin, end in zip(starts, ends):
            block = "\n".join(format_entry(page_lines[begin:end]))
            # De-duplicate cross-listed papers that appear in both archives
            if block not in collected:
                collected.append(block)
    print(f"Got {len(collected)} formatted and non-overlapping entries")
    numbered = [f"{i}. {text}" for i, text in enumerate(collected, start=1)]
    border = "=" * 104
    # Preamble first, then the entries fenced between two borders
    numbered = [preamble, border] + numbered + [border]
    # Concatenate into a single string and write it to the file
    outfile = 'arxiv_summary.txt'
    with open(outfile, 'w') as f:
        f.write("\n\n".join(numbered))
    print(f"Written output to {outfile}")


if __name__ == "__main__":
    main()
Please register or sign in to comment