This post was last updated on October 26, 2023.
A couple of weeks ago, while organizing my notes, I came across some philosophical lines that reminded me of a novel I read back in 2017. I wanted to reread it, but every page was plastered with ads and wading through them wasted too much time. So I spent a few days writing a crawler to pull the chapters down, which makes for far more comfortable reading.
The code is below. Please use it responsibly and never for anything illegal.
import json
import random
import re
import time

import requests
from requests.exceptions import RequestException

# A pool of User-Agent strings to rotate through on each request.
user_agent = [
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
]

def get_one_page(url):
    # Request headers: pick a random User-Agent from the pool above.
    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    # }
    headers = {'User-Agent': random.choice(user_agent)}
    # Print the current time so every fetch is logged.
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    # response = requests.get(url, proxies=http, timeout=3)  # earlier attempt through a proxy
    print(response.status_code)
    # print(response.text)
    if response.status_code == 200:
        return response.content.decode('utf-8')
        # Alternatives that were needed on some sites:
        # return response.content.decode('utf-8', 'ignore')
        # return response.content.decode('GBK')
        # return response.content.decode('GBK', 'ignore')
        # return response.content
    return None
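# RequestException is imported above but never actually used. If you want a
# fetch to survive the occasional timeout or connection reset, a small retry
# wrapper (a hypothetical helper, not part of the original script) could
# look like this:
def get_one_page_with_retry(url, retries=3):
    for attempt in range(retries):
        try:
            return get_one_page(url)
        except RequestException as e:
            print('request failed (attempt %d): %s' % (attempt + 1, e))
            time.sleep(2)
    return None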
# Find the text of each chapter page.
# chcp 65001  -- run this first on Windows to switch the console to UTF-8.

def parse_one_page(html):
    # Regular expression for the chapter body.
    # pattern = re.compile('<div id="book_text">(.*?)</div>', re.S)
    pattern = re.compile('<div id="content" name="content">(.*?)</div>', re.S)
    # Filter the body text out of the page.
    items = re.findall(pattern, html)
    # print(items)
    for item in items:
        # Strip the HTML line breaks and padding around each paragraph.
        item = item.replace('\n', '')
        item = item.replace('&nbsp;', '')
        item = item.replace('<br />', '')
        item = item.replace(' ', '')
        # print(item)
        yield item
        # yield {'index': item}
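# Quick sanity check of the cleanup above (the snippet is illustrative,
# not copied from the site):
#   >>> sample = '<div id="content" name="content">&nbsp;&nbsp;第一段<br />&nbsp;&nbsp;第二段</div>'
#   >>> list(parse_one_page(sample))
#   ['第一段第二段']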
# Pull out the chapter title.
def parse_one_page_head(html):
    # print(html)
    # Regular expression for the title.
    pattern = re.compile('<h1>(.*)</h1>', re.S)
    # Filter the title out of the page.
    items = re.findall(pattern, html)
    for item in items:
        # print(item)
        # yield {'index': item}
        yield item
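# Note: (.*) is greedy, so combined with re.S it will match from the first
# <h1> all the way to the last </h1> if a page ever contains more than one.
# The non-greedy form is the safer choice:
# pattern = re.compile('<h1>(.*?)</h1>', re.S)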
# Append the results to a text file.
def write_to_file(content):
    with open('liuyue(142).txt', 'a', encoding='utf-8') as f:
        # f.write(json.dumps(content, ensure_ascii=False) + '\n')
        f.write(content + '\n\n')
        # f.write('sfgdffks1\nddffd2\r\ndfjgf3\ndkjk4\r\n5')  # leftover newline test
def main(offset):
    # Earlier sources that stopped working or were swapped out:
    # url = 'http://www.bequgew.com/99647/' + str(offset) + '.html'
    # url = 'https://www.rzlib.net/b/25/25339/' + str(offset) + '.html'
    # url = 'http://www.dingdianxs.la/29/29228/' + str(offset) + '.html'
    url = 'http://www.dingdianxs.la/29/29228/%s.html' % offset
    html = get_one_page(url)
    # print(html)
    # First pull out the chapter title...
    for item in parse_one_page_head(html):
        print(item)
        write_to_file(item)
    # ...then the paragraphs.
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    # Walk the page ids from the first chapter to the last.
    # Ids seen on other mirrors: 12639965, 12640047 (chapter 83), 12640167 (chapter 202)
    for i in range(27661700, 27661730):
        print(i)
        try:
            main(i)
        except Exception as e:
            print('Error detail:', e.__class__.__name__, e)
            continue
        # Wait a few seconds between requests.
        time_waite = random.randint(3, 7)
        time.sleep(time_waite)
A note of caution: don't hammer the site with requests, or you'll burden the server and may well get your own IP blocked.
I ran into plenty of problems while writing this; many of the commented-out statements are earlier approaches that stopped working in some situation, so I switched to something else.
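Much of that switching was about page encoding: some of the mirror sites serve UTF-8 while others serve GBK, which is why get_one_page carries several commented-out decode lines. One way to stop editing the code by hand (just a sketch, not what the script above does) is to try one encoding and fall back to the other:

def decode_page(raw):
    # Try UTF-8 first and fall back to GBK; both encodings
    # appear among the sites this script has pointed at.
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        return raw.decode('gbk', 'ignore')

get_one_page could then end with return decode_page(response.content) instead of a hard-coded decode.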
If anything here is unclear, you can find me on the Weiyoun official account (id: WeiyounMimi).