spider.py
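
"""Scrape every chapter of the novel Doupo Cangqiong from
www.doupocangqiong.org and save each one in a randomly chosen format
(Word, PDF, plain text, or Markdown)."""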
import os
import random
import re
import time

import requests
from lxml import etree
from docx import Document
from reportlab.lib.pagesizes import letter
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas
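
# Third-party packages used above; if any are missing, install them with:
#   pip install requests lxml reportlab python-docx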
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}


def get_title_url():
    """Fetch all chapter titles and URLs from the index page."""
    url = 'https://www.doupocangqiong.org/doupocangqiong/'
    response = requests.get(url, headers=headers)
    response.encoding = 'utf8'
    html = etree.HTML(response.text)
    title_list = html.xpath('//*[@id="play_0"]/ul/li')
    titles = []
    links = []
    for title in title_list:
        t = title.xpath('./a/text()')[0]
        link = title.xpath('./a/@href')[0]
        link = 'https://www.doupocangqiong.org' + link
        titles.append(t)
        links.append(link)
    return titles, links


def get_text(url):
    """Scrape the body text of the given chapter page."""
    response = requests.get(url, headers=headers)
    response.encoding = 'utf8'
    # The chapter body sits between the first and last <br /> tags on the page
    text = re.findall('<br />(.*)<br />', response.text, re.S)
    text = text[0].replace(' ', '').replace('<br />', '\n')
    print(text[:100])
    return text


def save_to_word(t, text):
    """Save the chapter as a Word document."""
    # Create a new Word document
    doc = Document()
    # Add the title (level 0 is the largest heading)
    doc.add_heading(t, 0)
    # Add the body text, one paragraph per line
    for line in text.split('\n'):
        doc.add_paragraph(line)
    # Save the Word document
    doc.save(f'data/word/{t}.docx')


def save_to_pdf(t, content):
    """Save the chapter as a PDF."""
    # The PDF text object does not wrap lines automatically, so overlong lines
    # get cut off; break the content into 43-character lines first
    new_content = ''
    for i in range(0, len(content), 43):
        new_content += content[i:i + 43] + '\n'
    # Register a Chinese font (the .ttc file must exist at this path)
    pdfmetrics.registerFont(TTFont('msyh', 'data/msyh.ttc'))
    # Create the PDF file
    pdf_filename = f'data/pdf/{t}.pdf'
    c = canvas.Canvas(pdf_filename, pagesize=letter)
    width, height = letter  # page width and height
    width += 20
    # Add the title
    c.setFont("msyh", 16)
    c.drawCentredString(width / 2, height - 60, t)
    # Add the pre-wrapped body text
    c.setFont("msyh", 12)
    text = c.beginText(40, height - 100)
    text.setFont("msyh", 12)
    text.textLines(new_content)
    text.setWordSpace(20)  # widen word spacing to avoid awkward breaks
    text.setCharSpace(20)  # widen character spacing as well
    c.drawText(text)
    # Save the PDF
    c.save()


def save_to_txt(t, text):
    """Save the chapter as a plain-text file."""
    with open(f'data/txt/{t}.txt', 'w', encoding='utf-8') as file:
        file.write(t + '\n')
        file.write(text)


def save_to_md(t, text):
    """Save the chapter as a Markdown file."""
    with open(f'data/markdown/{t}.md', 'w', encoding='utf-8') as file:
        file.write('# ' + t + '\n')
        file.write(text)


if __name__ == '__main__':
    # Make sure the output directories exist before any chapter is saved
    for d in ('data/word', 'data/pdf', 'data/txt', 'data/markdown'):
        os.makedirs(d, exist_ok=True)
    # Fetch all chapter titles and URLs
    titles, links = get_title_url()
    # Crawl each chapter in turn
    for t, u in zip(titles, links):
        print(f'Scraping chapter {t}')
        try:
            text = get_text(u)
            # Save to a randomly chosen file type
            file_type = ['word', 'pdf', 'txt', 'md']
            type_ = random.choice(file_type)
            if type_ == 'word':
                save_to_word(t, text)
            elif type_ == 'pdf':
                save_to_pdf(t, text)
            elif type_ == 'txt':
                save_to_txt(t, text)
            else:
                save_to_md(t, text)
            print('Saved successfully!')
            print('========================================')
        except Exception as e:
            # Report the failure instead of swallowing it silently
            print(f'Failed to scrape {t}: {e}')
        time.sleep(1)
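
# Expected data/ layout: the word/, pdf/, txt/ and markdown/ output directories
# are created at startup if missing, but data/msyh.ttc (a TrueType font with
# CJK glyphs, loaded by save_to_pdf) must be supplied manually.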