forked from tatsu-lab/gpt_paper_assistant
-
Notifications
You must be signed in to change notification settings - Fork 0
/
arxiv_scraper.py
74 lines (63 loc) · 2.51 KB
/
arxiv_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import dataclasses
import json
import re
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from html import unescape
from typing import List, Optional

import feedparser
class EnhancedJSONEncoder(json.JSONEncoder):
    """JSON encoder that additionally knows how to serialize dataclasses.

    Dataclass instances are converted to plain dicts via
    ``dataclasses.asdict``; everything else falls through to the
    standard ``json.JSONEncoder`` behavior.
    """

    def default(self, o):
        # Guard clause: anything that is not a dataclass is handled
        # (or rejected) by the base encoder.
        if not dataclasses.is_dataclass(o):
            return super().default(o)
        return dataclasses.asdict(o)
@dataclass
class Paper:
    """A single arXiv paper as parsed from the RSS feed."""

    # paper class should track the list of authors, paper title, abstract, arxiv id
    authors: List[str]  # author names, HTML tags stripped by the scraper
    title: str  # title with the trailing "(arXiv:... [...])" suffix removed
    abstract: str  # summary text with HTML tags and newlines stripped
    arxiv_id: str  # last path segment of the feed entry id — TODO confirm exact format
    # add a hash function using arxiv_id
    # NOTE(review): hashing uses only arxiv_id while the dataclass-generated
    # __eq__ compares all fields; equal objects share an arxiv_id, so the
    # hash contract holds, but unequal papers with the same id collide.
    def __hash__(self):
        return hash(self.arxiv_id)
def get_papers_from_arxiv_rss(area: str, config: Optional[dict]) -> List[Paper]:
    """Fetch new papers for one arXiv area from its RSS feed.

    Sends a conditional request (modified-since one day ago) to
    http://export.arxiv.org/rss/<area> so an unchanged feed yields no
    entries. "UPDATED" and "CROSS LISTED" announcements are skipped,
    HTML tags are stripped from authors and abstracts, and the trailing
    "(arXiv:XXXX.XXXXXvN [area.XX])" suffix is removed from titles.

    Args:
        area: arXiv category string, e.g. "cs.CL" or "math.AC".
        config: optional configparser-style mapping; only
            config["OUTPUT"]["debug_messages"] is consulted.

    Returns:
        A list of Paper objects, possibly empty.
    """
    # Only ask for entries modified within the last day to avoid duplicates.
    updated = datetime.now(timezone.utc) - timedelta(days=1)
    # Format into the HTTP-date style 'Fri, 03 Nov 2023 00:30:00 GMT'.
    updated_string = updated.strftime("%a, %d %b %Y %H:%M:%S GMT")
    feed = feedparser.parse(
        f"http://export.arxiv.org/rss/{area}", modified=updated_string
    )
    # feedparser only sets .status when an HTTP response was received;
    # getattr avoids an AttributeError on network failure (entries is then
    # empty and we fall through to returning []).
    if getattr(feed, "status", None) == 304:
        if (config is not None) and config["OUTPUT"]["debug_messages"]:
            print("No new papers since " + updated_string + " for " + area)
        # If there are no new papers return an empty list.
        return []
    # Compile once outside the loop; raw strings so "\(" / "\[" are not
    # invalid escape sequences (SyntaxWarning on Python 3.12+).
    tag_re = re.compile(r"<[^<]+?>")  # crude HTML tag stripper
    suffix_re = re.compile(r"\(arXiv:[0-9]+\.[0-9]+v[0-9]+ \[.*\]\)$")
    paper_list = []
    for entry in feed.entries:
        # Ignore updated and cross-listed papers.
        if ("UPDATED" in entry.title) or ("CROSS LISTED" in entry.title):
            continue
        # Authors arrive as one comma-separated HTML string; strip the tags.
        authors = [
            unescape(tag_re.sub("", author)).strip()
            for author in entry.author.split(",")
        ]
        # Strip HTML tags and newlines from the abstract.
        summary = tag_re.sub("", entry.summary)
        summary = unescape(summary.replace("\n", " "))
        # Strip the last parenthetical "(arXiv:xxxx.xxxxx [area.XX])".
        title = suffix_re.sub("", entry.title)
        # The entry id is a URL; keep only its last path segment.
        # (Renamed from `id`, which shadowed the builtin.)
        arxiv_id = entry.id.split("/")[-1]
        paper_list.append(
            Paper(authors=authors, title=title, abstract=summary, arxiv_id=arxiv_id)
        )
    return paper_list
if __name__ == "__main__":
    # Smoke test: fetch one small area's feed and show what came back.
    papers = get_papers_from_arxiv_rss("math.AC", None)
    print(papers)
    print("success")