为什么80%的码农都做不了架构师?>>>
写了一个抓取 http://www.youwu.cc/index.html 整站图片的爬虫,使用 Redis 做去重和任务队列,这样可以避免递归。我不能保证你看到本文时它还能使用——人家网站也会做反爬虫。代码如下,非常直白。
# -*- coding:utf-8 -*-
"""
python collect_images.py http://www.youwu.cc/index.html
"""
import os
import sys
import json
import urllib2
import redis
import urlparse
import requests
import traceback
from copy import deepcopy
from lxml import etree

# Default HTTP headers sent with every request: a desktop Firefox
# User-Agent so the site serves its normal browser-facing pages.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
}
class Queue:
    """FIFO task queue backed by a Redis list."""

    def __init__(self):
        self.list_name = 'task'
        self._redis_conn = redis.Redis()

    def put(self, task):
        # Push on the left; get() pops from the right, so order is FIFO.
        self._redis_conn.lpush(self.list_name, task)

    def get(self):
        # Blocking pop. Returns None after 60 idle seconds, which the
        # consumer loop treats as "the crawl is finished".
        return self._redis_conn.brpop(self.list_name, timeout=60)


class DedupMap:
    """Visited-URL membership test backed by a Redis set."""

    def __init__(self):
        self.set_name = 'visited'
        self._redis_conn = redis.Redis()

    def first_visit(self, element):
        # SADD returns 1 the first time an element is added, 0 afterwards,
        # so a truthy result means "never seen before".
        return self._redis_conn.sadd(self.set_name, element)

    def retry(self, element):
        # Forget an element so it may be visited again.
        self._redis_conn.srem(self.set_name, element)


TASK_QUEUE = Queue()  # page-crawl task scheduler (Redis list)
DEDUP_MAP = DedupMap() # 网页去重, redis集合
FILTER = {'href': lambda x: True,'src': lambda x: True,
}def dn_of_url(url):return urlparse.urlparse(url).netlocdef ensure_dir(path):if not os.path.exists(path):os.mkdir(path)def full_path(href, refer):parse = urlparse.urlparse(refer)if href.startswith('http://') or href.startswith('https://'):rtv = hrefelif href.startswith('/'):rtv = '%s://%s%s' % (parse.scheme, parse.netloc, href)elif href.startswith('#'):query = '?' + parse.query if parse.query else ''rtv = '%s://%s%s%s%s' % (parse.scheme, parse.netloc, parse.path, query, href)elif href.startswith('?'):rtv = '%s://%s%s%s' % (parse.scheme, parse.netloc, parse.path, href)elif href.startswith('javascript'):rtv = referelse:rtv = '%s://%s%s' % (parse.scheme, parse.netloc, os.path.join(os.path.dirname(parse.path), href))return rtvdef extract_src_list(text):if not text:return []tree = etree.HTML(text)return tree.xpath('//img/@src')def extract_href_list(text):if not text:return []tree = etree.HTML(text)return tree.xpath('//a/@href')def get_html_content(url, headers):response = requests.get(url, headers=headers)if response.status_code == 200:return response.textdef get_image_content(url, headers):request = urllib2.Request(url=url, headers=headers)socket = urllib2.urlopen(request)if socket.code == 200:return socket.read()def get_next_urls(url, html):"""html是HTTP请求url获得的内容"""href_list = [full_path(href, url) for href in extract_href_list(html)]src_list = [full_path(src, url) for src in extract_src_list(html)]return href_list, src_listdef download_img(url, headers):path = os.path.join(os.path.dirname(__file__), os.path.basename(os.path.dirname(url)))ensure_dir(path)file_name = os.path.join(path, os.path.basename(url))if os.path.exists(file_name):return Falsecontent = get_image_content(url, headers)if content:with open(file_name, 'wb') as fp:fp.write(content)return Truedef deep_crawl(url, headers):print 'GET HTML:', urlhtml = get_html_content(url, headers)href_list, src_list = get_next_urls(url, html)for src in src_list:try:if FILTER['src'](src):succeed = download_img(src, 
headers)if succeed:print 'OK down: ', srcexcept BaseException as e:print 'ERROR down: ', srcraise eheaders = deepcopy(HEADERS)headers['Referer'] = urlfor href in href_list:if FILTER['href'](href) and DEDUP_MAP.first_visit(href):TASK_QUEUE.put(json.dumps({'url': href, 'headers': headers}))def main(index_url):FILTER['href'] = lambda x: dn_of_url(index_url) in xFILTER['src'] = lambda x: dn_of_url(index_url) in xheaders = deepcopy(HEADERS)headers['Referer'] = index_urlTASK_QUEUE.put(json.dumps({'url': index_url, 'headers': headers}))while True:task = TASK_QUEUE.get()if not task:breakelse:task = json.loads(task[1])try:deep_crawl(task['url'], task['headers'])except BaseException as e: # 失败的时候把它重新放回去print 'PUT BACK:', taskTASK_QUEUE.put(json.dumps(task))print traceback.format_exc()raise eif __name__ == '__main__':if len(sys.argv) < 2:print sys.argv[0], 'url'else:main(sys.argv[1])