python爬虫真假网址,python爬取福利网站图片完整代码,懂得人都懂

news/2024/5/9 11:18:01/文章来源:https://blog.csdn.net/weixin_30129661/article/details/116190056

网址需要自己替换懂的人都懂512*2,主要学习简单的爬虫,别乱用,否则后果自负!

[Python] 纯文本查看复制代码

import os
import re
import threading

import bs4
import requests


class MeiNvTu:
    """Crawler that lists the threads of a forum board and downloads their images.

    ``downImg`` is meant to run in worker threads and relies on two
    module-level globals created by the script entry point: ``pool_sema``
    (a semaphore bounding the number of concurrent downloads) and ``mutex``
    (a lock serialising writes to the shared ``log.txt``).
    """

    def __init__(self):
        # The host below is a placeholder taken from the published article;
        # it has to be replaced with a real base URL before the crawler works.
        self.url_main = 'https://网址保密,不能乱发哈哈/pw/'
        self.url = f'{self.url_main}thread.php?fid='

    def getPageMax(self, typeID=14):
        """Return the page count of board ``typeID``.

        :param typeID: numeric board id appended to the listing URL.
        :return: the page count as ``int``, or ``0`` when the request or the
            parsing of the pager fails for any reason.
        """
        try:
            res = requests.get(f'{self.url}{typeID}')
            res.encoding = 'utf-8'
            soup = bs4.BeautifulSoup(res.text, 'lxml')
            pager = soup.select('#main > div > span.fl > div.pages.cc > span')
            # Capture the text between a '/' and the following 'Go' in the
            # pager markup. Presumably this is the total page count --
            # verify against the live page.
            return int(re.search(r'/(.*?)Go', str(pager)).group(1))
        except Exception:
            # Best effort: network errors, a missing pager (search() -> None)
            # and non-numeric text all collapse to "0 pages".
            return 0

    def getTitleList(self, typeID=14, page=1):
        """Fetch the thread list of one page of a board.

        :param typeID: numeric board id.
        :param page: 1-based page number within the board.
        :return: a list of ``{'href': <absolute url>, 'title': <link text>}``
            dicts for every entry whose link contains ``html_data``, or
            ``False`` when the request or the parsing fails.
        """
        try:
            res = requests.get(f'{self.url}{typeID}&page={page}')
            res.encoding = 'utf-8'
            soup = bs4.BeautifulSoup(res.text, 'lxml')
            lists = []
            for item in soup.select('tr > td > h3'):
                if 'html_data' in item.a['href']:
                    lists.append({
                        'href': self.url_main + item.a['href'],
                        'title': item.a.text,
                    })
            return lists
        except Exception:
            return False

    def downImg(self, url, path):
        """Download every image of one thread page, then record it in the log.

        Images are written as ``{path}{index}.{extension}``. On success the
        fourth backslash-separated component of ``path`` is appended to
        ``log.txt`` in the directory formed by the first three components, so
        a later run can skip this thread.

        The global ``pool_sema`` is released exactly once before returning,
        whether the download succeeded or failed; the caller is expected to
        have acquired it.

        :param url: absolute URL of the thread page.
        :param path: filename prefix (directory plus trailing separator) for
            the downloaded images, using backslash separators.
        :return: ``None``.
        """
        try:
            try:
                # The page fetch sits inside the guarded region so that a
                # failed request is reported like any other download failure
                # instead of killing the thread with the slot still held.
                res = requests.get(url)
                res.encoding = 'utf-8'
                soup = bs4.BeautifulSoup(res.text, 'lxml')
                imgs = soup.select('#read_tpc > img')
                for i, item in enumerate(imgs):
                    # The full-size image URL is embedded in the tag's
                    # onclick handler: window.open('<url>');
                    imgUrl = re.search(
                        r"window.open\('(.*?)'\);", str(item['onclick'])
                    ).group(1)
                    imgData = requests.get(imgUrl).content
                    typ = imgUrl.split('.')[-1]
                    with open(f'{path}{i}.{typ}', 'wb') as f:
                        f.write(imgData)
            except Exception:
                print('\033[31m[下载失败!网络异常] ' + path)
                return

            # Record the finished thread so that the next run can skip it.
            parts = path.split('\\')
            textpath = '\\'.join(parts[:3]) + '\\'
            with mutex:
                try:
                    with open(textpath + 'log.txt', 'a') as f:
                        f.write(parts[3] + '\n')
                except (OSError, IndexError, ValueError):
                    # Logging is best effort: a failed log write only means
                    # this thread may be downloaded again on a later run.
                    pass

            print('\033[31m[完成下载] ' + path)
        finally:
            # Give the slot back to the download pool on every exit path.
            pool_sema.release()

    def get_typeTitle(self, id):
        """Return the display title of a board.

        :param id: numeric board id.
        :return: the title string, or ``None`` for an unknown id.
        """
        titles = {
            14: '唯美写真',
            15: '网友马赛克',
            16: '露出马赛克',
            49: '街拍马赛克',
            21: '丝袜美腿',
            114: '欧美马赛克',
        }
        return titles.get(id)

    def downloadthe(self, title, path):
        """Tell whether a thread has already been downloaded.

        :param title: thread title to look for.
        :param path: directory prefix (with trailing separator) that holds
            ``log.txt``.
        :return: ``True`` when ``title`` occurs in ``log.txt``; ``False`` when
            it does not or when the log cannot be read.
        """
        try:
            with open(path + 'log.txt', 'r') as f:
                return title in f.read()
        except (OSError, ValueError, TypeError):
            return False

    def get_Page_History(self, path):
        """Read the page number at which the previous run stopped.

        :param path: directory prefix (with trailing separator) that holds
            ``pagelog.ini``.
        :return: the stored page number, or ``0`` when the file is missing,
            unreadable or does not contain an integer.
        """
        try:
            with open(path + 'pagelog.ini', 'r') as f:
                return int(f.read())
        except (OSError, ValueError):
            return 0


if __name__ == '__main__':
    # Cap the number of concurrent download threads.
    pool_sema = threading.BoundedSemaphore(70)
    # Lock protecting the shared log file.
    mutex = threading.Lock()
    # Crawler instance.
    mnt = MeiNvTu()
    # Board id to crawl.
    typeID = 21
    # Total number of pages in that board.
    page_max = mnt.getPageMax(typeID)
    if page_max == 0:
        print('\033[31m网络错误!,总页数为0')
    else:
        path_main = f"D:\\爬取的网站图片\\{mnt.get_typeTitle(typeID)}\\"
        os.makedirs(path_main, mode=0o777, exist_ok=True)
        # Page reached by the previous run (0 when there is no history).
        page_History = mnt.get_Page_History(path_main)
        for i in range(page_max):
            # Skip page numbers that were already downloaded earlier.
            # NOTE(review): the published copy of this script is cut off at
            # this point, right after the fragment "if i+1". The remainder of
            # the loop body is not available here and has NOT been
            # reconstructed; restore it from the original article.
            raise NotImplementedError(
                "script truncated in source after 'if i+1'; loop body missing"
            )

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：http://www.luyixian.cn/news_show_766688.aspx

如若内容造成侵权/违法违规/事实不符，请联系dt猫网进行投诉反馈email:809451989@qq.com，一经查实，立即删除！