crawler.py
import json

import requests
import khayyam
from bs4 import BeautifulSoup

BASE_URL = 'https://www.isna.ir'

# khayyam.JalaliDate.today() stringifies as 'YYYY-MM-DD', so after the split
# date[0] is the year, date[1] the month and date[2] the day.
date = str(khayyam.JalaliDate.today()).split('-')

title = []
link = []
abstract = []
news = []
lis = []  # anchor tags of the archive entries
ps = []   # abstract paragraphs of the archive entries

"""
ISNA sorts news by their release time and the date should be declared in the url. This script is written in the way that
every time it runs, we get the news of that day and days before; but the data in data.json file is static.
"""
# Walk the first 19 pages of the archive and collect the entry blocks.
for i in range(1, 20):
    url = f'https://www.isna.ir/archive?pi={i}&ms=0&dy={date[2]}&mn={date[1]}&yr={date[0]}'
    res = requests.get(url=url)
    soup = BeautifulSoup(res.text, 'html.parser')
    divs = soup.find_all('div', attrs={'class': 'desc'})
    for div in divs:
        lis.append(div.findChildren('a', recursive=True)[0])
        ps.append(div.findChildren('p', recursive=True))
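
# The selectors above assume each archive entry roughly has this shape
# (inferred from the code, not verified against the live markup):
#
#   <div class="desc">
#       <a href="/news/...">headline</a>
#       <p>abstract</p>
#   </div>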

# The href attributes are site-relative, so prefix them with the base URL.
link = [BASE_URL + li.get('href') for li in lis]
title = [li.getText() for li in lis]
for p in ps:
    try:
        abstract.append(p[0].getText())
    except IndexError:
        # Some entries have no abstract paragraph; keep the lists aligned.
        abstract.append('')

# Visit every article page and pull out its body paragraphs and tags.
for l, t, a in zip(link, title, abstract):
    paragraphs = []
    tag = []
    res = requests.get(url=l)
    soup = BeautifulSoup(res.text, 'html.parser')
    div = soup.find('div', attrs={'class': 'item-text'})
    tags = soup.find_all('a', attrs={'rel': 'Tag'})
    if div is None:
        # Not a regular article page (e.g. photo or video items); skip it.
        continue
    ps = div.findChildren('p', recursive=False)
    for p in ps:
        paragraphs.append(p.getText())
    for tg in tags:
        tag.append(tg.getText())
    news.append({'title': t, 'link': l, 'abstract': a,
                 'paragraphs': paragraphs, 'tag': tag})

with open('data2.json', 'w', encoding='utf-8') as file:
    # ensure_ascii=False keeps the Persian text readable in the output file.
    json.dump(news, file, ensure_ascii=False)
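
# A minimal read-back sketch (assumes the script above has already produced
# data2.json in the working directory):
#
#   with open('data2.json', encoding='utf-8') as f:
#       items = json.load(f)
#   print(items[0]['title'], items[0]['link'])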