Scraping a Novel with Python's BeautifulSoup Library

The novel I've been reading lately (on a pirate site) has way too many ads, and I still wanted to read it for free, so I wrote a scraper, emmm (okay, honestly I just didn't feel like writing my lab report).

The code is written against the pirate novel site https://www.26ksw.com/, but pirate sites are all structured pretty much the same way anyway...

Then I noticed this site seems to be missing two chapters of the book, emmm.

The idea is simple: fetch the table-of-contents page, pull out every chapter link, then fetch each chapter and extract its text. I'm just not very familiar with bs4, so it took me a while...

Look at the page structure under the Elements tab of the browser's inspector and work out the selectors from there...
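
To make the selector logic concrete, here is a minimal, self-contained sketch of how the structure seen in the inspector maps onto soup.select(). The sample HTML below is an assumed simplification of the real table-of-contents page, not a copy of it:

from bs4 import BeautifulSoup

# Assumed, simplified layout of the catalog page (made up for illustration)
sample = '''
<div id="wrapper">
  <div id="main">
    <div class="box_con">
      <div id="list">
        <a href="/book/84288/1.html">chapter 1</a>
        <a href="/book/84288/2.html">chapter 2</a>
      </div>
    </div>
  </div>
</div>
'''

soup = BeautifulSoup(sample, 'html.parser')
# The same nested CSS selector the full script uses to grab the chapter list
box = soup.select('#wrapper #main .box_con #list')[0]
print([(a.string, a['href']) for a in box.find_all('a')])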

# coding:utf-8
import requests
from bs4 import BeautifulSoup
import bs4


req_header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
              'application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Cookie': 'PHPSESSID=hmgtb7vir38sc6p80vtc644c44; bookid=84288; bgcolor=; font=; size=; fontcolor=; width=; '
              'chapterid=37162336; chaptername=chapter17',
    'Host': 'www.26ksw.com',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/78.0.3904.108 Safari/537.36',
}

title = []  # chapter titles
urls = []   # chapter URL paths


def get_html(url):
    # Fetch a page with the copied browser headers and return its parsed tree
    r = requests.get(url, headers=req_header)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup


def get_catalog(url):
    # Collect chapter titles and URL paths from the table-of-contents page
    soup = get_html(url)
    title_list = soup.select('#wrapper #main .box_con #list')
    i = 0
    for item in title_list[0].find_all('a'):
        i += 1
        if i <= 12:
            # skip the first 12 links (presumably the "latest chapters" block above the full list)
            continue
        title.append(item.string)
        urls.append(item.attrs['href'])


def get_content(id, url):
    # Extract the plain text of a single chapter
    soup = get_html(url)
    content = soup.select('#wrapper #main .content_read #content')[0].contents
    ret = 'chapter' + str(id) + '\r\n'
    for item in content:
        # keep only bare text nodes, dropping <br> and other tags
        if isinstance(item, bs4.element.NavigableString):
            ret += item + '\r\n'
    return ret


if __name__ == '__main__':
    main_url = 'https://www.26ksw.com/book/84288/'
    get_catalog(main_url)
    with open('book.txt', 'a', encoding='utf-8') as f:
        for i in range(59):
            print(i + 1)
            url = 'https://www.26ksw.com' + urls[i]
            f.write(get_content(i + 1, url))
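
The 59 in the main block is the hard-coded chapter count for this particular book. If you'd rather not hard-code it, a variant of the main block that simply walks everything get_catalog() collected, plus a short pause so the site isn't hammered, could look like this (the 0.5 s delay is only an assumption, not something the site requires):

import time

if __name__ == '__main__':
    main_url = 'https://www.26ksw.com/book/84288/'
    get_catalog(main_url)
    with open('book.txt', 'a', encoding='utf-8') as f:
        # iterate over every chapter link collected from the catalog page
        for i, path in enumerate(urls, start=1):
            print(i, title[i - 1])
            f.write(get_content(i, 'https://www.26ksw.com' + path))
            time.sleep(0.5)  # assumed polite delay between chapter requests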