-
Notifications
You must be signed in to change notification settings - Fork 1
/
crawler_chinese_classic_novels.py
141 lines (111 loc) · 4.11 KB
/
crawler_chinese_classic_novels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""
==============================================
Chinese classic novels crawler (中国古典小说爬虫)
Data from http://www.shicimingju.com
==============================================
Usage
------
positional arguments:
outpath Download files path
optional arguments:
-h, --help show this help message and exit
--skip SKIP How many books you need to skip? default 0
--stop STOP How many books you need to download? default 0
--wait WAIT No of seconds to wait between two books
Example
------
# Download all books in the catalog, and keep the data in the current dir
$ python crawler_chinese_classic_novels.py .
# Skip 1 book, and download 1 book, and keep the data in the current dir
$ python crawler_chinese_classic_novels.py . --stop 1 --skip 1
# Skip 1 book, and download 5 books, and insert a 5 sec sleep step between each book,
# and keep the data in the current dir
$ python crawler_chinese_classic_novels.py . --stop 5 --skip 1 --wait 5
==============================================
"""
from bs4 import BeautifulSoup
import requests
import logging
import argparse
import time
import sys
import os
# Site root; relative hrefs scraped from pages are appended to this.
domain = 'http://www.shicimingju.com'
# Catalog page requested first to obtain the list of book links.
from_url = 'http://www.shicimingju.com/book/'

# Log line layout: "[LEVEL]: message".
FORMAT = '[%(levelname)s]: %(message)s'

# Module logger; INFO is enabled so progress messages are emitted.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logging.basicConfig(format=FORMAT)
def download_chapter(name, link, out, timeout=30):
    """Fetch one chapter page and write its title and text to ``out``.

    Args:
        name: Chapter title; written to ``out`` ahead of the chapter text.
        link: URL of the chapter page to request.
        out: Writable text stream that receives the chapter.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        None. If the request fails (network error or non-200 status) an
        error is logged and nothing is written to ``out``.
    """
    logger.info('Requesting chapter {}: {}...'.format(name, link))
    try:
        # Without an explicit timeout a stalled server blocks the crawl forever.
        resp = requests.get(link, timeout=timeout)
    except requests.RequestException as err:
        # Treat transport failures like HTTP failures: log and skip the chapter.
        logger.error('Failed to download chapter: {}, err: {}'.format(name, err))
        return
    if resp.status_code != 200:
        logger.error('Failed to download chapter: {}, code: {}, err: {}'.format(name, resp.status_code, resp.text))
        return

    html = BeautifulSoup(resp.text, 'html.parser')
    # Prefer the <p> elements under .chapter_content; when there are none,
    # fall back to the .chapter_content element(s) themselves.
    paragraphs = html.select('.chapter_content p') or html.select('.chapter_content')

    out.write(name)
    out.write('\n\n')
    for paragraph in paragraphs:
        out.write(paragraph.text)
        out.write('\n\n')
def download_book(book_name, link, outdir, timeout=30):
    """Fetch a book's table of contents and save each chapter to its own file.

    Each chapter is written to ``<outdir>/<book_name>-<chapter title>.txt``,
    with spaces removed from the chapter title.

    Args:
        book_name: Book title; used as the filename prefix.
        link: URL of the book's table-of-contents page.
        outdir: Directory in which the chapter files are created.
        timeout: Seconds to wait on the server for the contents page.

    Returns:
        None. If the contents page cannot be fetched (network error or
        non-200 status) an error is logged and no files are written.
    """
    logger.info('Requesting book {}: {}...'.format(book_name, link))
    try:
        # Without an explicit timeout a stalled server blocks the crawl forever.
        resp = requests.get(link, timeout=timeout)
    except requests.RequestException as err:
        # Treat transport failures like HTTP failures: log and skip the book.
        logger.error('Failed to download book: {}, err: {}'.format(book_name, err))
        return
    if resp.status_code != 200:
        logger.error('Failed to download book: {}, code: {}, err: {}'.format(book_name, resp.status_code, resp.text))
        return

    html = BeautifulSoup(resp.text, 'html.parser')
    for chapter in html.select('.book-mulu li a'):
        chapter_name = chapter.text
        # Chapter hrefs are appended to the site root to form the full URL.
        chapter_url = '{}{}'.format(domain, chapter['href'])
        filename = os.path.join(
            outdir,
            '{}-{}.txt'.format(book_name, chapter_name.replace(' ', '')),
        )
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent the Chinese text and would raise UnicodeEncodeError.
        with open(filename, 'w', encoding='utf-8') as out:
            download_chapter(chapter_name, chapter_url, out)
if __name__ == '__main__':
    # Command-line entry point: fetch the catalog page, then download books
    # in catalog order, honouring --skip, --stop and --wait.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'outpath',
        type=str,
        help='Download files path'
    )
    parser.add_argument(
        '--skip',
        type=int,
        default=0,
        help='How many books you need to skip? default 0'
    )
    parser.add_argument(
        '--stop',
        type=int,
        default=0,
        help='How many books you need to download? default 0'
    )
    parser.add_argument(
        '--wait',
        type=int,
        default=0,
        help='No of seconds to wait between two books'
    )
    args = parser.parse_args(sys.argv[1:])

    try:
        # Without an explicit timeout a stalled server blocks the script forever.
        resp = requests.get(from_url, timeout=30)
    except requests.RequestException as err:
        logger.error('Failed to request catalog {}: {}'.format(from_url, err))
        sys.exit(1)
    if resp.status_code != 200:
        # Report the failure instead of exiting silently.
        logger.error('Failed to request catalog {}, code: {}'.format(from_url, resp.status_code))
        sys.exit(1)

    # Chapter files are opened for writing inside this directory, so make
    # sure it exists before any book is processed.
    os.makedirs(args.outpath, exist_ok=True)

    logger.info('Requesting classic novels...')
    html = BeautifulSoup(resp.text, 'html.parser')
    book_hrefs = html.select('.bookmark-list li a')

    # Counts books handed to download_book; that function reports failures
    # only through the log, so this is the number of books attempted.
    success = 0
    for i, href in enumerate(book_hrefs):
        book_name = href.text
        if i < args.skip:
            logger.info('Skip book {}'.format(book_name))
            continue

        book_link = '{}{}'.format(domain, href['href'])
        download_book(book_name, book_link, args.outpath)
        success += 1

        # --stop 0 means "no limit"; otherwise stop once enough books are done.
        if args.stop > 0 and success >= args.stop:
            break
        if args.wait > 0:
            time.sleep(args.wait)

    logger.info('Download finished. Total book downloaded: {}. Books in {}'.format(success, args.outpath))