最近学了一下爬虫，就写了段代码来试一下成果如何。（目的是爬取某动漫网站上的一部动漫《DARLING in the FRANXX》。）
使用的 Python 版本是 3.7。
import os
import re

import requests
from selenium import webdriver

# Browser-like User-Agent sent with the index-page request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}

INDEX_URL = "http://www.imomoe.in/view/7288.html"
SITE_ROOT = "http://imomoe.in"
OUTPUT_DIR = "DarlingInTheFranxx"
EPISODE_COUNT = 24
REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging forever

# Dots before the extensions are escaped so they only match a literal ".".
PLAYER_PATH_RE = re.compile(r"/player/7288-0-.*?\.html")
VIDEO_URL_RE = re.compile(r"https://.*?\.mp4")


def fetch_player_paths():
    """Return the site-relative player-page paths found on the series index page.

    Raises:
        requests.HTTPError: if the index page answers with an error status.
    """
    # The User-Agent must go in ``headers=``; passing it as ``params=`` would
    # append it to the query string instead of sending it as an HTTP header.
    response = requests.get(INDEX_URL, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return PLAYER_PATH_RE.findall(response.text)


def find_video_url(player_url):
    """Return the first ``https://...mp4`` URL inside the player iframe, or None.

    The iframe named "play2" is read through a real Chrome session because no
    simpler way to obtain the iframe's source was found.
    """
    browser = webdriver.Chrome()
    try:
        browser.get(player_url)
        browser.switch_to.frame("play2")
        frame_source = browser.page_source
    finally:
        # Always close the browser, even when loading or frame switching fails.
        browser.quit()
    match = VIDEO_URL_RE.search(frame_source)
    return match.group(0) if match else None


def download(video_url, target_path):
    """Stream ``video_url`` to ``target_path``.

    The data is written to ``target_path + ".part"`` first and renamed only
    after the download has finished, so an interrupted run never leaves a
    truncated file that a later run would mistake for a finished episode.

    Raises:
        requests.HTTPError: if the video server answers with an error status.
    """
    part_path = target_path + ".part"
    with requests.get(video_url, stream=True, timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()
        with open(part_path, 'wb') as out_file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    out_file.write(chunk)
    os.replace(part_path, target_path)


def main():
    """Download every episode that is not already present in OUTPUT_DIR."""
    player_paths = fetch_player_paths()
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Slice instead of indexing 0..23 so a page with fewer links cannot raise
    # IndexError; at most EPISODE_COUNT episodes are processed.
    for episode, player_path in enumerate(player_paths[:EPISODE_COUNT], start=1):
        target_path = os.path.join(OUTPUT_DIR, "DITF" + str(episode) + ".mp4")
        if os.path.exists(target_path):
            print("第" + str(episode) + '集已经存在')
            continue

        video_url = find_video_url(SITE_ROOT + player_path)
        if video_url is None:
            # No .mp4 link in the iframe: skip this episode instead of crashing.
            print("第" + str(episode) + "集未找到视频地址,已跳过")
            continue

        print("正在下载第" + str(episode) + "集")
        download(video_url, target_path)
        print("已下载第" + str(episode) + "集")


if __name__ == "__main__":
    main()
运行测试结果如下: