遇到的问题:网站设置了简单的反爬虫规则——数字防爬:页面中的数字(如日薪、每周天数、实习月数)被替换成自定义字体的特殊编码字符,直接抓取得到的不是 0-9(示例字符在此处无法正常显示)。
解决方法:先找出 0-9 各自对应的编码字符,把它们作为键、真实数字作为值写入字典,再对抓取到的文本逐一替换。
代码如下:
import re
import time

import requests
import xlwt

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

# Seconds to wait for a response before giving up, so one stalled connection
# cannot hang the whole crawl.
REQUEST_TIMEOUT = 10

# Rows collected by get_infos(); each row is
# [url, salary, address, education, jobway, month, jobgood, contents].
end_list = []

# Maps each obfuscated digit glyph used by the site's custom font to the real
# digit it stands for.
# NOTE(review): the glyph keys appear to have been lost when this code was
# copied (they are all empty strings here). Fill in the actual characters
# taken from the page source; until then the table has no effect because
# empty keys are ignored by _decode_digits().
replace_dict = {"": "0", "": "1", "": "2", "": "3", "": "4", "": "5", "": "6", "": "7", "": "8", "": "9"}


def _decode_digits(text):
    """Return *text* with every glyph from ``replace_dict`` swapped for its digit."""
    for glyph, digit in replace_dict.items():
        # str.replace('', x) would insert x between every character, so an
        # empty key must never be applied.
        if glyph:
            text = text.replace(glyph, digit)
    return text


def _fetch_text(url):
    """Download *url* and return the body decoded with the detected encoding."""
    wb_data = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    wb_data.encoding = wb_data.apparent_encoding
    return wb_data.text


def get_links(url):
    """Fetch one listing page and scrape every job detail page it links to.

    Args:
        url: Address of a listing page.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched. A failing detail page is reported and skipped instead.
    """
    page = _fetch_text(url)
    links = re.findall('class="name-box clearfix".*?href="(.*?)"', page, re.S)
    for link in links:
        detail_url = 'https://www.shixiseng.com' + link
        try:
            get_infos(detail_url)
        except requests.RequestException as exc:
            # One unreachable posting should not discard the rest of the page.
            print('skip %s: %r' % (detail_url, exc))


def get_infos(url):
    """Scrape one job detail page and append its rows to ``end_list``.

    Args:
        url: Address of a job detail page.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    page = _fetch_text(url)

    salarys = re.findall('class="job_money cutom_font">(.*?)</span>', page, re.S)
    addresses = re.findall('class="job_position">(.*?)</span>', page, re.S)
    educations = re.findall('class="job_academic">(.*?)</span>', page, re.S)
    jobways = re.findall('class="job_week cutom_font">(.*?)</span>', page, re.S)
    months = re.findall('class="job_time cutom_font">(.*?)</span>', page, re.S)
    jobgoods = re.findall('class="job_good".*?>(.*?)</div>', page, re.S)

    # The description is the markup between the first two "job_til" headings.
    # A page without that section yields an empty description rather than an
    # IndexError that would abort the crawl.
    found = re.search(r'div class="job_til">([\s\S]*?)<div class="job_til">', page, re.S)
    contents = ''
    if found:
        # NOTE(review): the first and last replace look identical here; one of
        # them may originally have targeted a different whitespace character.
        contents = found.group(1).replace(' ', '').replace('\n', '').replace(' ', '')
    # Strip the remaining HTML tags.
    contents = re.sub(r'<[\s\S]*?>', "", str(contents))

    for salary, address, education, jobway, month, jobgood in zip(
            salarys, addresses, educations, jobways, months, jobgoods):
        # Only these three fields are rendered with the obfuscated digit font.
        row = [
            url,
            _decode_digits(salary),
            address,
            education,
            _decode_digits(jobway),
            _decode_digits(month),
            jobgood,
            contents,
        ]
        end_list.append(row)


def _save_rows(rows, path):
    """Write *rows* under a fixed header line into an .xls workbook at *path*."""
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('newjobmessage')
    header = ['网址', '日薪', '地址', '学历', '上班要求', '实习期', '福利', '要求']
    for col, title in enumerate(header):
        sheet.write(0, col, title)
    for row_no, row in enumerate(rows, start=1):
        for col, data in enumerate(row):
            sheet.write(row_no, col, data)
    book.save(path)


if __name__ == '__main__':
    urls = ['https://www.shixiseng.com/it/{}'.format(str(i)) for i in range(1, 10)]
    try:
        for page_no, url in enumerate(urls, start=1):
            print('正在打印第%d页' % page_no)
            try:
                get_links(url)
            except requests.RequestException as exc:
                # A listing page that cannot be fetched is skipped; the
                # remaining pages are still crawled.
                print('skip %s: %r' % (url, exc))
            # Pause between listing pages to keep the request rate low.
            time.sleep(3)
    except Exception as exc:
        # Top-level boundary: report what went wrong instead of hiding it.
        print('endprocess: %r' % (exc,))
    finally:
        # Persist whatever was collected, even after a failure or Ctrl-C.
        _save_rows(end_list, '123.xls')
效果图: