Scraping a Novel with Python's BeautifulSoup Library

The novel I've been reading lately (on a pirate site) has way too many ads, and I still wanted to read it for free, so I wrote a scraper, emmm (okay, honestly I just didn't feel like writing my lab report).

The code is written against the pirate novel site https://www.26ksw.com/, but pirate sites are all structured pretty much the same way anyway...

Then I noticed this site seems to be missing two chapters of the book, emmm.

The idea is simple: fetch the table-of-contents page, pull out every chapter link, then fetch each chapter and extract its text. I'm just not very familiar with bs4, so it took me a while...

Look at the page structure under the Elements tab of the browser's inspector and work out the selectors from there...
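
To make the selector logic concrete, here is a minimal, self-contained sketch of how the structure seen in the inspector maps onto soup.select(). The sample HTML below is an assumed simplification of the real table-of-contents page, not a copy of it:

from bs4 import BeautifulSoup

# Assumed, simplified layout of the catalog page (made up for illustration)
sample = '''
<div id="wrapper">
  <div id="main">
    <div class="box_con">
      <div id="list">
        <a href="/book/84288/1.html">chapter 1</a>
        <a href="/book/84288/2.html">chapter 2</a>
      </div>
    </div>
  </div>
</div>
'''

soup = BeautifulSoup(sample, 'html.parser')
# The same nested CSS selector the full script uses to grab the chapter list
box = soup.select('#wrapper #main .box_con #list')[0]
print([(a.string, a['href']) for a in box.find_all('a')])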

# coding:utf-8
import requests
from bs4 import BeautifulSoup
import bs4


req_header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
              'application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Cookie': 'PHPSESSID=hmgtb7vir38sc6p80vtc644c44; bookid=84288; bgcolor=; font=; size=; fontcolor=; width=; '
              'chapterid=37162336; chaptername=chapter17',
    'Host': 'www.26ksw.com',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/78.0.3904.108 Safari/537.36',
}

title = []  # chapter titles
urls = []   # chapter URL paths


def get_html(url):
    # Fetch a page with the copied browser headers and return its parsed tree
    r = requests.get(url, headers=req_header)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup


def get_catalog(url):
    # Collect chapter titles and URL paths from the table-of-contents page
    soup = get_html(url)
    title_list = soup.select('#wrapper #main .box_con #list')
    i = 0
    for item in title_list[0].find_all('a'):
        i += 1
        if i <= 12:
            # skip the first 12 links (presumably the "latest chapters" block above the full list)
            continue
        title.append(item.string)
        urls.append(item.attrs['href'])


def get_content(id, url):
    # Extract the plain text of a single chapter
    soup = get_html(url)
    content = soup.select('#wrapper #main .content_read #content')[0].contents
    ret = 'chapter' + str(id) + '\r\n'
    for item in content:
        # keep only bare text nodes, dropping <br> and other tags
        if isinstance(item, bs4.element.NavigableString):
            ret += item + '\r\n'
    return ret


if __name__ == '__main__':
    main_url = 'https://www.26ksw.com/book/84288/'
    get_catalog(main_url)
    with open('book.txt', 'a', encoding='utf-8') as f:
        for i in range(59):
            print(i + 1)
            url = 'https://www.26ksw.com' + urls[i]
            f.write(get_content(i + 1, url))
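
The 59 in the main block is the hard-coded chapter count for this particular book. If you'd rather not hard-code it, a variant of the main block that simply walks everything get_catalog() collected, plus a short pause so the site isn't hammered, could look like this (the 0.5 s delay is only an assumption, not something the site requires):

import time

if __name__ == '__main__':
    main_url = 'https://www.26ksw.com/book/84288/'
    get_catalog(main_url)
    with open('book.txt', 'a', encoding='utf-8') as f:
        # iterate over every chapter link collected from the catalog page
        for i, path in enumerate(urls, start=1):
            print(i, title[i - 1])
            f.write(get_content(i, 'https://www.26ksw.com' + path))
            time.sleep(0.5)  # assumed polite delay between chapter requests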