要求:爬取https://ssr1.scrape.center/ 网站中所有电影标题、类型、地区、电影时长、上映日期、评分、简介;
分析:网站由电影列表页和详情页两种页面组成,我们所需要的内容在详情页面里面都可以找到。
列表页面共10页,根据分析可得出,列表页面地址:https://ssr1.scrape.center/page/{页码};
详情页面就利用正则表达式把一个个内容解析出来即可。
直接上代码了:
# coding:utf-8
"""Scrape movie data (title, genres, area, runtime, release date, score,
synopsis, poster) from https://ssr1.scrape.center."""
import json
import multiprocessing
import re
import time
from os import makedirs
from os.path import exists

import requests

BASE_URL = 'https://ssr1.scrape.center'
RESULTS_DIR = 'movies'
# Create the output directory once at import time; exist_ok avoids the
# check-then-create race of `exists(...) or makedirs(...)`.
makedirs(RESULTS_DIR, exist_ok=True)
# The listing is paginated over exactly 10 pages on this demo site.
TOTAL_PAGE = 10
def grap_html(page):page_url = f'{BASE_URL}/page/{page}'res = requests.get(page_url)return res.text
# Parse a listing page: pull the detail-page URL paths out with a regex.
def parse_index(html):
    """Return the list of detail-page paths (e.g. '/detail/1') found in *html*.

    re.findall already returns [] on no match, so no fallback is needed.
    """
    return re.findall('.*?el-card.*?href="(.*?)"', html, re.S)
# Fetch one movie detail page.
def grap_detail(path):
    """Download the detail page at *path* (e.g. '/detail/1') and return its HTML.

    The response is decoded as UTF-8 explicitly, since the site serves
    Chinese text.
    """
    detail_url = f'{BASE_URL}{path}'
    res = requests.get(detail_url)
    res.encoding = 'utf-8'
    return res.text
# Parse a movie detail page with regexes; return the fields as a dict.
def parse_detail(html):
    """Extract title, genres, area, runtime, release date, synopsis, score
    and poster URL from a detail page.

    Every field degrades to None (instead of raising) when its pattern does
    not match, so a slightly different page layout yields partial data
    rather than a crash.

    :param html: detail-page HTML.
    :return: dict with keys title/kind/area/usetime/onlinetime/desc/score/logo.
    """
    title_match = re.search('<h2.*?>(.*?)</h2>', html)
    title = title_match.group(1) if title_match else None

    # Genres live as <span> items inside the <div ... categories"> block.
    kind = None
    categories = re.search('<div.*?categories">(.*?)</div>', html, re.S)
    if categories:
        spans = re.findall('<span>(.*?)</span>', categories.group(1), re.S)
        kind = spans if spans else None

    # The <div> right after the categories block holds "area / runtime"
    # as three spans: area, separator, runtime.
    area = usetime = None
    area_html = re.search(
        '<div.*?categories">.*?</div>.*?<div.*?>(.*?)</div>', html, re.S)
    if area_html:
        area_match = re.search(
            '<span.*?>(.*?)</span>.*?<span.*?>.*?</span>.*?<span.*?>(.*?)</span>',
            area_html.group(1), re.S)
        if area_match:
            area = area_match.group(1)
            usetime = area_match.group(2)

    # Release date: first ISO-looking date anywhere on the page.
    # (Raw string avoids the invalid-escape warning of '\d'; the local is
    # named onlinetime so it no longer shadows the `time` module.)
    date_match = re.search(r'\d{4}-\d{2}-\d{2}', html)
    onlinetime = date_match.group() if date_match else None

    remark = re.search('<div.*?drama.*?<p.*?>(.*?)</p>', html, re.S)
    desc = remark.group(1).strip() if remark else None

    score_match = re.search('<p.*?score.*?>(.*?)</p>', html, re.S)
    score = score_match.group(1).strip() if score_match else None

    # Poster URL: strip the '@<size>' suffix the site appends to the src.
    logo_match = re.search('<div.*detail.*?<img.*?src="(.*?)@.*?".*?>', html, re.S)
    logo = logo_match.group(1) if logo_match else None

    # Plain literal (not the builtin name `dict`) for the result.
    return {
        'title': title,
        'kind': kind,
        'area': area,
        'usetime': usetime,
        'onlinetime': onlinetime,
        'desc': desc,
        'score': score,
        'logo': logo,
    }
def down_movie_poster(title, url):path = f'{RESULTS_DIR}/{title}.jpg'res = requests.get(url)with open(path, "wb") as f:f.write(res.content)
# Save the movie metadata as JSON (named after the title) and its poster.
def savefile(movie):
    """Persist *movie* (a dict from parse_detail) under RESULTS_DIR as
    pretty-printed UTF-8 JSON, then download its poster image.

    :param movie: dict with at least 'title' and 'logo' keys.
    """
    name = movie.get('title')
    data_path = f'{RESULTS_DIR}/{name}.json'
    # Context manager guarantees the file handle is closed (the original
    # passed a bare open(...) into json.dump and leaked it).
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(movie, f, ensure_ascii=False, indent=2)
    down_movie_poster(name, movie.get('logo'))
def grapone(page):print("开始下载第%d页" % page)start = time.perf_counter()s = time.time()# 下载列表页面,并解析出电影详情url地址。detailurls = parse_index(grap_html(page))for result in detailurls:# 抓取详情页面detail_html = grap_detail(result)# 解析详情页面,返回电影dictmovie = parse_detail(detail_html)savefile(movie)end = time.perf_counter()e = time.time()print("第%d页完成下载,CPU用时:%d,耗时:%d" % (page, end - start, e - s))# 单一线程抓取网站内电影数据
def grapall():# 单线程模式start = time.perf_counter()s = time.time()for i in range(1, TOTAL_PAGE + 1):grapone(i)end = time.perf_counter()e = time.time()print("单线程CPU共用时:%d,耗时%d" % (end - start, e - s))# 多线程抓取电影数据
def multigrap():# 多线程模式start = time.perf_counter()s = time.time()pool = multiprocessing.Pool()pages = range(1, TOTAL_PAGE + 1)pool.map(grapone, pages)pool.close()pool.join()end = time.perf_counter()e = time.time()print("多线程CPU共用时:%d,耗时%d" % (end - start, e - s))if __name__ == '__main__':multigrap()# grapall()
最终效果:
其他不再赘述,直接看代码即可;正则表达式部分写得比较粗糙,还有优化空间。
本文参考文献:https://cuiqingcai.com/202224.html ,想学习爬虫的可以移步。