37. Scrapy: handling pagination and scraping material price data from the Hangzhou construction cost website

1. Target URL: http://183.129.219.195:8081/bs/hzzjb/web/list

2. Pagination on this site is fairly simple: to fetch the next page, simulate the POST request and send form data containing the key parameters (mainly pageNumber).
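For example, a request for page 2 could be built like the sketch below. The mtype and pageNumber fields are taken from the full spider further down; whether the server also requires the empty _query.* filter fields that the full spider sends has not been verified here.

import scrapy

def page_request(page):
    # POST the same list URL with a different pageNumber to fetch another page of the table
    return scrapy.FormRequest(
        url='http://183.129.219.195:8081/bs/hzzjb/web/list',
        formdata={'mtype': '2', 'pageNumber': str(page)},
        method='POST',
        dont_filter=True,
    )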

The way the page tags are extracted below is not ideal (it was written earlier): instead of slicing a flat list of <td> cells, it would be cleaner to match the whole table with XPath first, take each row as a parent selector, and then match the child cell tags in a second step (see the sketch after this paragraph).
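A minimal sketch of that row-based approach, assuming the same table1 markup as the spider below and that each cell holds plain text (this is a rewrite suggestion, not the code actually used in this post):

def parse_rows(response):
    # one selector per table row, then a second match on that row's <td> children
    for row in response.xpath("//table[@class='table1']//tr"):
        cells = row.xpath("./td/text()").extract()
        if len(cells) >= 8:
            yield {
                'district': cells[0],                      # 地区
                'category': cells[1],                      # 类别
                'material_name': cells[2],                 # 材料名称
                'version': cells[3],                       # 规格及型号
                'unit': cells[4],                          # 单位
                'tax_information_price': cells[5],         # 含税信息价
                'except_tax_information_price': cells[6],  # 除税信息价
                'y_m': cells[7],                           # 年/月
            }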

3. The project code used for the crawl is as follows:
# hzzjb.py
# -*- coding: utf-8 -*-
import scrapy
import json
import re
from hzzjb_web.items import HzzjbWebItem


class HzzjbSpider(scrapy.Spider):
    name = 'hzzjb'
    # domain only; the original value '183.129.219.195:8081/bs' would never match the request host
    allowed_domains = ['183.129.219.195']
    start_urls = ['http://183.129.219.195:8081/bs/hzzjb/web/list']
    custom_settings = {
        "DOWNLOAD_DELAY": 0.2,
        "ITEM_PIPELINES": {
            'hzzjb_web.pipelines.MysqlPipeline': 320,
        },
        "DOWNLOADER_MIDDLEWARES": {
            'hzzjb_web.middlewares.HzzjbWebDownloaderMiddleware': 500
        },
    }

    def parse(self, response):
        rows = []
        try:
            # grab every cell of the price table; each page holds 20 rows of 9 cells,
            # so the flat cell list is sliced into rows of 9
            tag_list = response.xpath("//table[@class='table1']//tr/td").extract()
            rows = [tag_list[i * 9:(i + 1) * 9] for i in range(20)]
        except Exception:
            print('———— unexpected page encoding / markup ————')

        for index, tag in enumerate(rows):
            item = HzzjbWebItem()
            try:
                # 地区 / district
                item['district'] = tag[0].replace('<td>', '').replace('</td>', '')
                # 类别 / category
                item['category'] = tag[1].replace('<td>', '').replace('</td>', '')
                # 材料名称 / material name
                item['material_name'] = tag[2].replace('<td>', '').replace('</td>', '')
                # 规格及型号 / specification and model
                item['version'] = tag[3].replace('<td>', '').replace('</td>', '')
                # 单位 / unit
                item['unit'] = tag[4].replace('<td>', '').replace('</td>', '')
                # 含税信息价 / tax-inclusive information price
                item['tax_information_price'] = tag[5].replace('<td>', '').replace('</td>', '')
                # 除税信息价 / tax-exclusive information price
                item['except_tax_information_price'] = tag[6].replace('<td>', '').replace('</td>', '')
                # 年/月 / year-month
                item['y_m'] = tag[7].replace('<td>', '').replace('</td>', '')
            except Exception:
                pass
            yield item

        # schedule the POST requests for pages 2..5031 only once, from the initial GET response;
        # otherwise every paginated response would re-schedule the whole range again
        if response.request.method == 'GET':
            for i in range(2, 5032):
                data = {
                    'mtype': '2',
                    '_query.nfStart': '',
                    '_query.yfStart': '',
                    '_query.nfEnd': '',
                    '_query.yfEnd': '',
                    '_query.dqstr': '',
                    '_query.dq': '',
                    '_query.lbtype': '',
                    '_query.clmc': '',
                    '_query.ggjxh': '',
                    'pageNumber': '{}'.format(i),
                    'pageSize': '',
                    'orderColunm': '',
                    'orderMode': '',
                }
                yield scrapy.FormRequest(
                    url='http://183.129.219.195:8081/bs/hzzjb/web/list',
                    callback=self.parse,
                    formdata=data,
                    method="POST",
                    dont_filter=True,
                )
# items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class HzzjbWebItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    district = scrapy.Field()
    category = scrapy.Field()
    material_name = scrapy.Field()
    version = scrapy.Field()
    unit = scrapy.Field()
    tax_information_price = scrapy.Field()
    except_tax_information_price = scrapy.Field()
    y_m = scrapy.Field()
# pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class HzzjbWebPipeline(object):
    def process_item(self, item, spider):
        return item


# save scraped data to MySQL
class MysqlPipeline(object):
    def open_spider(self, spider):
        # read the MYSQL_* values from the running crawler's settings;
        # the original `from scrapy.conf import settings` import is deprecated/removed in newer Scrapy
        settings = spider.settings
        self.host = settings.get('MYSQL_HOST')
        self.port = settings.get('MYSQL_PORT')
        self.user = settings.get('MYSQL_USER')
        self.password = settings.get('MYSQL_PASSWORD')
        self.db = settings.get('MYSQL_DB')
        self.table = settings.get('TABLE')
        self.client = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                      port=self.port, db=self.db, charset='utf8')

    def process_item(self, item, spider):
        item_dict = dict(item)
        cursor = self.client.cursor()
        values = ','.join(['%s'] * len(item_dict))
        keys = ','.join(item_dict.keys())
        sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=self.table, keys=keys, values=values)
        try:
            # first argument is the SQL statement, second is the tuple of values
            if cursor.execute(sql, tuple(item_dict.values())):
                print('Row inserted successfully!')
                self.client.commit()
        except Exception as e:
            print(e)
            self.client.rollback()
        return item

    def close_spider(self, spider):
        self.client.close()
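The pipeline above assumes the web_hzzjb table already exists in the web_datas database. One possible way to create it with pymysql is sketched below; the column names match the item fields, but the column types are an assumption, since the original post does not give a schema.

import pymysql

def create_table():
    # hypothetical schema: every column stored as text, because the spider keeps the raw cell strings
    conn = pymysql.connect(host='172.16.0.55', port=3306, user='root',
                           password='concom603', db='web_datas', charset='utf8')
    sql = """
        CREATE TABLE IF NOT EXISTS web_hzzjb (
            id INT AUTO_INCREMENT PRIMARY KEY,
            district VARCHAR(64),
            category VARCHAR(64),
            material_name VARCHAR(255),
            version VARCHAR(255),
            unit VARCHAR(32),
            tax_information_price VARCHAR(64),
            except_tax_information_price VARCHAR(64),
            y_m VARCHAR(32)
        ) DEFAULT CHARSET=utf8
    """
    with conn.cursor() as cursor:
        cursor.execute(sql)
    conn.commit()
    conn.close()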
# settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for hzzjb_web project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'hzzjb_web'

SPIDER_MODULES = ['hzzjb_web.spiders']
NEWSPIDER_MODULE = 'hzzjb_web.spiders'

# MySQL connection settings
MYSQL_HOST = "172.16.0.55"
MYSQL_PORT = 3306
MYSQL_USER = "root"
MYSQL_PASSWORD = "concom603"
MYSQL_DB = 'web_datas'
TABLE = "web_hzzjb"# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'hzzjb_web (+http://www.yourdomain.com)'# Obey robots.txt rules
ROBOTSTXT_OBEY = False# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'hzzjb_web.middlewares.HzzjbWebSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'hzzjb_web.middlewares.HzzjbWebDownloaderMiddleware': 500,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'hzzjb_web.pipelines.HzzjbWebPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class HzzjbWebSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class HzzjbWebDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
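With the files above in place, the crawl is started from the project root with scrapy crawl hzzjb. Alternatively, a small launcher script can do the same thing; the import path below assumes the spider file sits at hzzjb_web/spiders/hzzjb.py, matching the file label at the top of the spider listing.

# run.py - optional launcher, equivalent to running: scrapy crawl hzzjb
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from hzzjb_web.spiders.hzzjb import HzzjbSpider

if __name__ == '__main__':
    # load settings.py from the project and start the spider
    process = CrawlerProcess(get_project_settings())
    process.crawl(HzzjbSpider)
    process.start()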

 

Posted on 2018-10-19 09:43 by 五杀摇滚小拉夫. Reposted from: https://www.cnblogs.com/lvjing/p/9814690.html

