-
Notifications
You must be signed in to change notification settings - Fork 2
/
Article.py
82 lines (73 loc) · 2.38 KB
/
Article.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import html
from html.parser import HTMLParser
class Article:
    """Extract individual fields from the raw HTML markup of one news article.

    The markup is searched with plain substring matching against the class
    names and attribute prefixes hard-coded below; no HTML parsing is done,
    so the results are only as reliable as those markers.
    """

    # Marker that immediately precedes the headline text.
    _TITLE_MARKER = 'class="DY5T1d" >'
    # Marker that immediately precedes the description text.
    _DESC_MARKER = 'class="xBbh9">'
    # Class name carried by the element that wraps the author/source name.
    _AUTHOR_MARKER = 'wEwyrc'
    # Prefix of the relative article link and the site it is relative to.
    _LINK_MARKER = './articles/'
    _SITE = 'https://news.google.com'

    def __init__(self, article: str) -> None:
        """Store the article markup that every getter searches.

        Args:
            article: Raw HTML text of a single article.
        """
        self.article = article

    def _require_between(self, opening: str, closing: str) -> str:
        """Return the text between ``opening`` and the next ``closing``.

        Raises:
            ValueError: If ``opening`` is absent, or ``closing`` does not
                occur after it.
        """
        start = self.article.index(opening) + len(opening)
        end = self.article.index(closing, start)
        return self.article[start:end]

    def getTitle(self) -> str:
        """Return the headline text, still HTML-escaped.

        Returns:
            The text between the headline marker and the next ``</a>``.
            ``'NOT FOUND'`` when the marker is absent. When the marker is
            present but no ``</a>`` follows it, the remaining markup with
            ``'END NOT FOUND'`` appended.
        """
        _, marker, tail = self.article.partition(self._TITLE_MARKER)
        if not marker:
            return 'NOT FOUND'
        title, closing, _ = tail.partition('</a>')
        if not closing:
            return tail + 'END NOT FOUND'
        return title

    def getAuthor(self) -> str:
        """Return the first text node that follows the author marker.

        Starting at the author class name, tags are skipped until a ``>``
        is found that is not directly followed by ``<``; the text from there
        up to the next ``<`` is returned.

        Returns:
            The author text, or ``''`` when the marker, the text node, or
            the ``<`` that ends it cannot be found.
        """
        markup = self.article
        cursor = markup.find(self._AUTHOR_MARKER)
        if cursor == -1:
            return ''
        while True:
            tag_end = markup.find('>', cursor)
            if tag_end == -1 or tag_end + 1 >= len(markup):
                return ''
            if markup[tag_end + 1] != '<':
                break
            # '>' directly followed by '<' closes one tag and opens the next:
            # no text here, keep looking.
            cursor = tag_end + 1
        text_start = tag_end + 1
        text_end = markup.find('<', text_start)
        if text_end == -1:
            return ''
        return markup[text_start:text_end]

    def getClass1(self) -> str:
        """Return the headline text, still HTML-escaped.

        Unlike ``getTitle`` this raises instead of returning a sentinel.

        Raises:
            ValueError: If the headline marker or the following ``</a>`` is
                missing.
        """
        return self._require_between(self._TITLE_MARKER, '</a>')

    def getClass2(self) -> str:
        """Return the description text, still HTML-escaped.

        Raises:
            ValueError: If the description marker or the following
                ``</span`` is missing.
        """
        return self._require_between(self._DESC_MARKER, '</span')

    def getShortDesc(self) -> str:
        """Return ``"<headline> . <description>"`` with HTML entities decoded.

        Raises:
            ValueError: If the headline or description cannot be located
                (propagated from ``getClass1`` / ``getClass2``).
        """
        combined = self.getClass1() + ' . ' + self.getClass2()
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape()
        # is the supported replacement.
        return html.unescape(combined)

    def getImage(self) -> str:
        """Return the value of the first ``src="..."`` attribute.

        Returns:
            The attribute value, or ``''`` when there is no ``src="``
            attribute or its closing quote is missing.
        """
        _, marker, tail = self.article.partition('src="')
        if not marker:
            return ''
        value, quote, _ = tail.partition('"')
        return value if quote else ''

    def getUrl(self) -> str:
        """Return the absolute article URL, wrapped in double quotes.

        The first ``./articles/...`` link is made absolute by dropping its
        leading ``.`` and prefixing the site address. The surrounding double
        quotes are part of the returned string.

        Returns:
            ``'"https://news.google.com/articles/..."'``, or ``''`` when no
            such link, or no quote terminating it, is found.
        """
        start = self.article.find(self._LINK_MARKER)
        if start == -1:
            return ''
        end = self.article.find('"', start)
        if end == -1:
            return ''
        # start + 1 skips the leading '.' of the relative link.
        return '"' + self._SITE + self.article[start + 1:end] + '"'

    def getPublishedAt(self) -> str:
        """Return the text content of the element carrying ``datetime="``.

        This is the text shown between the ``>`` that closes that tag and
        the next ``<``, not the value of the ``datetime`` attribute itself.

        Returns:
            The text, or ``''`` when the attribute, the closing ``>`` or the
            following ``<`` is missing.
        """
        _, marker, tail = self.article.partition('datetime="')
        if not marker:
            return ''
        _, tag_end, content = tail.partition('>')
        if not tag_end:
            return ''
        text, next_tag, _ = content.partition('<')
        return text if next_tag else ''