Crawling the Novel Site Quanshuwang (全书网) with Multiple Threads
1. It can crawl books from every category on the site.
2. It grabs each novel's cover, author, introduction, and the content of every chapter; in other words, the whole site's catalog structure gets pulled down.
3. The original plan was to download everything, but that turned out to be naive: one book's content takes 3 to 4 MB of database space, which only dawned on me after 300-odd books. The site hosts at least 100,000-plus books, so at roughly 3.5 MB each the full text alone would run on the order of 500 GB, far more than my little server can hold. I switched to crawling only each novel's cover, author, title, introduction, and link; with chapter contents skipped, the final count came to just over 153,000 books.
The full code is as follows:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import pymysql
from bs4 import BeautifulSoup
import time
import threading


# Fetch a book's introduction and update its database record
def getIntroduce(novel_href, id):
    header = {
        'Host': 'www.quanshuwang.com',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    time.sleep(0.2)  # light throttle between requests
    novellist = requests.get(novel_href, headers=header, timeout=20)
    novellist.encoding = 'gbk'  # the site serves GBK-encoded pages
    soup = BeautifulSoup(novellist.text, 'lxml')
    res = soup.select("#waa")
    if len(res) > 0:
        # Book introduction
        introduce = res[0].get_text()
        # Link to the reader page that lists the chapters
        chapterHref = soup.select(".reader")[0].get("href")
        print(introduce)
        sql = "UPDATE novel_info SET novel_introduce='%s' WHERE novel_href='%s'" % (introduce, novel_href)
        # Crawl the chapters on a worker thread
        te = threading.Thread(target=getChapterList, args=(chapterHref, id, sql))
        te.start()
        # getChapterList(chapterHref, id, sql)
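A note on the UPDATE built above: the introduction text is spliced into the SQL with %-formatting, so a single quote inside a book's blurb breaks the statement outright (and is an injection hole). pymysql can bind the values itself; here is a minimal sketch, assuming the same novel_info schema, with updateIntroduce as a made-up helper name:

# Sketch only: parameter binding with pymysql (updateIntroduce is hypothetical).
def updateIntroduce(db, novel_href, introduce):
    # %s placeholders are filled in by the driver, which escapes quotes itself
    sql = "UPDATE novel_info SET novel_introduce=%s WHERE novel_href=%s"
    with db.cursor() as cursor:
        cursor.execute(sql, (introduce, novel_href))
    db.commit()

The same applies to the chapter INSERTs below; chapter text is even more likely than a blurb to contain quotes.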
# Fetch a book's chapter list, then crawl and store every chapter
def getChapterList(h, id, sql):
    db = pymysql.connect(host="localhost", user="root", password="123456", database="wx_app")
    db.ping(True)
    time.sleep(0.2)
    novellist = requests.get(h, timeout=20)
    novellist.encoding = 'gbk'
    soup = BeautifulSoup(novellist.text, 'lxml')
    chapters = soup.select(".dirconone > li")
    i = 1
    print("Start writing -> book ID: %d" % id)
    # Flush the pending introduction UPDATE for this book first
    insertNovelInfo(sql, db)
    for chapter in chapters:
        contHref = chapter.select("a")[0].get("href")
        # Chapter title
        contTitle = chapter.select("a")[0].get_text()
        content = getContents(contHref)
        print("Chapter: %s" % contTitle)
        sql1 = "INSERT INTO `novel_chapter`(novel_id,chapter_id,chapter_name) VALUES(%d,%d,'%s')" % (id, i, contTitle)
        sql2 = "INSERT INTO `novel_chapter_info`(chapter_id,chapter_name,chapter_content,novel_id) VALUES(%d,'%s','%s',%d)" % (i, contTitle, content, id)
        # Increment after both INSERTs so the two tables agree on chapter_id
        i = i + 1
        insertNovelInfo(sql1, db)
        insertNovelInfo(sql2, db)
    print("Book %s finished" % id)
    db.commit()
    db.close()
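Each chapter above costs two separate INSERT round trips, so a long book means thousands of statements per connection. pymysql's executemany can send one batch per book instead; a sketch under the same novel_chapter schema (insertChaptersBatch and the rows list are my names, not the original code's):

# Sketch only: batch-insert chapter rows collected during the loop.
def insertChaptersBatch(db, rows):
    # rows is a list of (novel_id, chapter_id, chapter_name) tuples
    sql = "INSERT INTO `novel_chapter`(novel_id,chapter_id,chapter_name) VALUES (%s,%s,%s)"
    with db.cursor() as cursor:
        cursor.executemany(sql, rows)
    db.commit()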
# Fetch the text of a single chapter page
def getContents(h):
    res = requests.get(h, timeout=20)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    tx = soup.select(".mainContenr")
    if len(tx) > 0:
        # Drop the site's inline script markers that wrap the chapter text
        content = tx[0].get_text().replace('style5();', '').replace('style6();', '')
    else:
        # No content node found; store the URL so the chapter can be retried
        content = h
    return content


# Execute one SQL statement, rolling back on failure
def insertNovelInfo(sql, db):
    cursor = db.cursor()
    try:
        cursor.execute(sql)
    except Exception:
        db.rollback()
        print("MySQL error:", sql)


# getIntroduce('http://www.quanshuwang.com/book_135083.html')


# Crawl the books at offsets [count, num) of novel_info
def init(count, num):
    while count < num:
        query = ("select a.novel_href,a.novel_id from novel_info a "
                 "inner join (select novel_id from novel_info GROUP BY novel_id limit %d,1) b "
                 "on a.novel_id=b.novel_id" % count)
        # Open a database connection
        db = pymysql.connect(host="localhost", user="root", password="123456", database="wx_app")
        db.ping(True)
        # Get a cursor and run the query
        cursor = db.cursor()
        results = ()
        try:
            cursor.execute(query)
            # Fetch all matching rows
            results = cursor.fetchall()
        except Exception:
            print("Error: unable to fetch data")
        finally:
            # Close the database connection
            db.close()
        for row in results:
            getIntroduce(row[0], row[1])
            print(row[0], row[1])
        count = count + 1


try:
    threads = []
    # Spawn 100 threads; thread i crawls the single book at offset i
    for i in range(0, 100):
        t = threading.Thread(target=init, args=(i, i + 1))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    print("end")
except Exception:
    print("Error: unable to start threads")
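The driver block above hand-rolls 100 threads, one book offset each. The standard library's concurrent.futures gives the same fan-out with a bounded pool and visible errors; a sketch reusing the init(count, num) function defined above (the pool size of 20 is a guess to tune against the site's tolerance):

from concurrent.futures import ThreadPoolExecutor, as_completed

# Sketch only: a bounded pool instead of 100 raw threads.
with ThreadPoolExecutor(max_workers=20) as pool:
    futures = [pool.submit(init, i, i + 1) for i in range(0, 100)]
    for f in as_completed(futures):
        exc = f.exception()  # a failure in a bare thread would vanish silently
        if exc is not None:
            print("worker failed:", exc)
print("end")

Capping the worker count also throttles how hard the crawler hits the site, which matters more than raw speed at a 150,000-book scale.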