Python + Selenium: 爬取某社交门户网站新闻栏目焦点

news/2024/5/21 7:10:57/文章来源:https://blog.csdn.net/qq_21264377/article/details/108197113

以前尝试过直接获取HTML文本内容来分析，但是毫无结果。彼时深切体会到，要想测试爬取“普通”反爬虫策略级别以上的网站内容，必须从浏览器内核级别入手。因为只有彻底模拟浏览器的行为，才不会被反爬虫机制100%识别为爬虫而遭到限制或拒绝服务。“爬虫”与“反爬虫”是一对相互对立的矛盾，也是互相促进的因素。

话说回来，该网站是国内某知名社交网站发家的，在中国互联网历史也算是较早的。每年毕业季，都会有大量毕业生“挤破头”想要通过层层面试进入该网站企业。这样的历史背景应该有相当的技术沉淀。适合练习尝试。那接下来，就开始咯。

前期准备工作依然是，调查分析该社交网站的“反爬虫”策略。

浏览目标：

浏览目标

这个页面是延时加载的。而且，往下滚动页面至底部时，会在底部追加加载新的新闻消息内容。
查看HTML源码：

目标HTML源码

HTML源码只有短短80多行，没有包含任何新闻消息相关内容。由此证实，目标网页是通过JavaScript延时异步加载的AJAX模式。这对爬虫来说是“不友好”的，据此将该目标评为“普通”或“严格”级别反爬虫策略。

现在，解析目标的HTML体系结构。利用浏览器的“检查元素”功能，解读目标HTML。（这里使用的是Chrome，其它如Firefox等selenium支持的浏览器均可。）以前Firefox上著名的firebug插件是非常有用的工具，现在该插件似乎已停止更新，主流浏览器都已拥有类似的内置功能。

目标体系结构2-1

从上图可以看出，所要提取的焦点内容在一个list列表 – ul标签，class名为：“list”。

（图：li元素的HTML结构）

而ul的li的内容如上图所示，标题包含在一个a标签里。a标签里还包含了一些HTML注释（绿色字体），需要特别处理一下：可以利用正则表达式匹配出来，再逐个去除：

# Matches an HTML comment that contains no angle brackets of its own.
comment_pattern = r'<!--[^<>]*-->'
comments = re.findall(comment_pattern, target_html)
for comment in comments:
    # Remove the comment by replacing it with an empty string.
    target_html = target_html.replace(comment, '')

按照流程来看，就是：
等待目标主体加载 -> 获取目标主体节点 -> 解析目标列表 -> 遍历列表解析目标详细 -> 生成目标实体集合
设计代码：

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Get news list of target

@author: WowlNAN
@github: https://github.com/WowlNAN
@blog: https://blog.csdn.net/qq_21264377
"""
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as ECS
from htmldom import *
import re
import sys
from schedule import *


class Solution:
    """Collect headline titles and links from a JavaScript-rendered news page.

    A Chrome WebDriver loads the page, the ``ul.list`` element is awaited,
    and every headline anchor found in it is stored in ``self.caches`` as a
    ``{title: link}`` mapping.
    """

    # Compiled once; the patterns are the same ones applied to every anchor.
    _COMMENT_RE = re.compile(r'<!--[^<>]*-->')
    _TITLE_RE = re.compile(r'>([^<>]+)</a>', re.I | re.S | re.M)
    _LINK_RE = re.compile(r'href="([^<>"]+)"', re.I | re.S | re.M)

    def __init__(self):
        """Start a Chrome WebDriver session and initialise empty state."""
        self.url = ''
        # Headless mode is left disabled. To enable it, build an Options
        # object and hand it to the driver:
        #   self.options = Options()
        #   self.options.add_argument("--headless")
        #   self.options.add_argument("--disable-gpu")
        #   self.driver = webdriver.Chrome(options=self.options)
        self.options = None
        self.driver = webdriver.Chrome()
        self.caches = {}
        self.path = None

    def __del__(self):
        """Release the browser when the instance is garbage-collected.

        This was originally named ``__delete__``, which is the descriptor
        hook and is never invoked on object destruction.
        """
        self.url = None
        self.caches = None
        self.path = ''
        # getattr: __init__ may have failed before ``driver`` was assigned.
        driver = getattr(self, 'driver', None)
        if driver:
            # quit() ends the whole session; close() would only close the
            # current window.
            driver.quit()
        self.driver = None
        self.options = None

    def reset(self):
        """Clear the per-page state while keeping the browser session."""
        self.url = ''
        self.caches = {}
        self.path = None

    def dictcache(self, title: str, url: str):
        """Remember *url* under *title* unless a non-empty link is stored."""
        if self.caches.get(title, '') == '':
            self.caches[title] = url

    def pickcaches(self, htmls):
        """Extract (title, link) pairs from anchor fragments into the cache.

        :param htmls: iterable of HTML strings, one per headline anchor.
        """
        for html in htmls:
            # The anchors carry HTML comments that would otherwise split
            # the title text; strip them first.
            if '<!--' in html:
                html = self._COMMENT_RE.sub('', html)
            titles = self._TITLE_RE.findall(html)
            links = self._LINK_RE.findall(html)
            if titles and links:
                self.dictcache(titles[0], links[0])

    def getcaches(self, url: str):
        """Load *url*, scrape its headline list and print a summary table.

        :param url: absolute ``http://`` or ``https://`` address.
        :return: always ``None``; results are stored in ``self.caches`` and
            printed to stdout. Returns early without printing when the URL
            is invalid, the list never appears, or nothing is extracted.
        """
        # Imported locally so this method does not rely on the file's
        # import block being extended.
        from selenium.common.exceptions import TimeoutException

        if not url:
            return None
        # Tolerate whitespace around addresses typed at a prompt.
        url = url.strip()
        if not url.startswith(('http://', 'https://')):
            return None

        if self.driver:
            self.reset()
        else:
            # No live browser: rebuild the whole instance state.
            self.__init__()

        starttime = time.time()
        self.path = url.split('/')[-1].split('.')[0]
        self.driver.get(url)
        wait = WebDriverWait(self.driver, 5)
        time.sleep(1)
        # The headlines live in a <ul class="list"> that is filled in
        # asynchronously, so wait for it instead of reading the raw source.
        locator = (By.CSS_SELECTOR, 'ul[class="list"]')
        try:
            targetelement = wait.until(ECS.presence_of_element_located(locator))
        except TimeoutException:
            # The list never showed up; treat it like any other empty result
            # instead of letting the exception escape to the caller.
            return None
        if not targetelement:
            return None
        html = targetelement.get_attribute('innerHTML')
        if not html:
            return None
        # ``match`` is expected to come from one of the star imports
        # (presumably ``htmldom``) -- TODO confirm.
        titlehtmls = match('//li//div:class=detail//h3//a', html)
        if not titlehtmls:
            return None
        self.pickcaches(titlehtmls)

        codes = {'count': 0, 'hit': 0, 'done': 0, 'failed': 0, 'lost': 0, 'time': 0}
        endtime = time.time()
        for title, link in self.caches.items():
            print(title, link)
        sys.stdout.flush()
        codes['done'] = len(self.caches)
        codes['time'] = str(int(endtime - starttime)) + 's'
        self._print_summary(codes)
        return None

    @staticmethod
    def _print_summary(codes, gridwidth=12):
        """Print *codes* as a two-row ASCII table with centred cells."""

        def cell(text):
            # Manual centring: any odd leftover space goes on the right.
            pad = gridwidth - len(text)
            return ' ' * (pad // 2) + text + ' ' * (pad - pad // 2) + '|'

        rule = '|' + ('-' * gridwidth + '|') * len(codes)
        print('\r' + rule, end='\n')
        print('|' + ''.join(cell(key) for key in codes))
        print(rule)
        print('|' + ''.join(cell(str(value)) for value in codes.values()))
        print(rule)


testdatas = []
# Interactive driver: read URLs from the prompt until an empty line.
s = Solution()
print("ENTER:[eg, http://a.com/b/c.html]")
try:
    while True:
        try:
            url = input(">>")
        except (EOFError, KeyboardInterrupt):
            # Treat Ctrl-D / Ctrl-C at the prompt like an empty line.
            break
        if not url:
            break
        s.getcaches(url)
        time.sleep(.2)
        s.reset()
finally:
    # Shut the browser down so no Chrome/chromedriver process is left behind.
    if s.driver:
        s.driver.quit()
        s.driver = None