多线程导出excel_seo必备网站分析工具，关键词百度搜索结果查询导出源码

news/2024/5/21 1:16:07/文章来源:https://blog.csdn.net/weixin_39929646/article/details/111233145

seo必备网站分析工具，关键词百度搜索结果查询导出源码

本文给出两个简单版本的百度搜索结果采集脚本：只需输入关键词和搜索页数，即可抓取搜索结果、获取竞争对手的网站，便于进一步分析和研究。希望这两个版本能起到参考和帮助作用！

版本一

特点

从 cookie.txt 读取 cookies，每次请求随机选取一个访问网页
导出结果排除了百度自家产品
excel导出数据
简单多线程案例可参考

#百度搜索结果抓取
#author/微信：huguo00289
# -*- coding: utf-8 -*-

import requests,time,random
from fake_useragent import UserAgent
from lxml import etree
import threading
import xlsxwriter



class Baidu_search():
    """Collect Baidu search results for a keyword and export them to Excel.

    Every result is stored in ``self.search_datas`` as a ``(title, url)``
    tuple, where ``url`` is the target of Baidu's redirect link (or the
    Baidu link itself when the redirect cannot be resolved).
    """

    # Timeout in seconds applied to every HTTP request made by this class.
    REQUEST_TIMEOUT = 8

    # XPath of the title links of the search results.
    # NOTE(review): the attribute predicates of the original expression were
    # lost ('//div[@]/h3[@]/a' is not valid XPath).  This replacement selects
    # divs whose class list contains the exact token "result", which leaves
    # out "result-op" blocks -- presumably Baidu's own products; confirm
    # against the live page markup.
    RESULT_LINK_XPATH = (
        '//div[contains(concat(" ", normalize-space(@class), " "), " result ")]'
        '/h3/a'
    )

    def __init__(self):
        self.url="https://www.baidu.com/s?wd="
        self.ua=UserAgent()
        self.search_datas=[]

    def get_cookies(self):
        """Return one cookie string picked at random from ``cookie.txt``.

        The file is expected to hold one cookie per line; blank lines are
        ignored so that a trailing newline never yields an empty cookie.

        Raises:
            IndexError: if the file contains no non-blank line.
        """
        with open("cookie.txt", "r", encoding="utf-8") as f:
            cookies = [line.strip() for line in f if line.strip()]
        if not cookies:
            raise IndexError("cookie.txt contains no cookies")
        return random.choice(cookies)

    def build_search_url(self, keyword, page_index):
        """Return the search URL for ``keyword`` on the zero-based page.

        The keyword is percent-encoded so that characters such as ``&``,
        ``#`` or spaces cannot break the query string.  Baidu pages hold ten
        results each, hence the ``pn`` offset of ``page_index * 10``.
        """
        from urllib.parse import quote

        return f"{self.url}{quote(keyword, safe='')}&ie=UTF-8&pn={page_index * 10}"

    def get_search_objects(self,search_url):
        """Fetch one result page and append its results to ``search_datas``.

        Args:
            search_url: full URL of a Baidu result page.
        """
        headers={
            "User-Agent":self.ua.random,
            'Cookie':self.get_cookies(),
        }
        response = requests.get(search_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        html = response.content.decode("utf-8")
        time.sleep(2)  # pause between page requests
        tree = etree.HTML(html)
        if tree is None:
            # Nothing parseable came back; there are no results to collect.
            return
        for anchor in tree.xpath(self.RESULT_LINK_XPATH):
            # Read the href from the anchor itself so that a title can never
            # be paired with the link of a different result.
            href = anchor.get("href")
            if not href:
                continue
            title = ''.join(anchor.xpath('.//text()'))
            data = title, self.get_website_url(href)
            self.search_datas.append(data)
            print(data)

    def get_website_url(self,baidu_url):
        """Resolve a Baidu redirect link to the real site URL.

        A HEAD request is sent without following redirects and the
        ``Location`` header is returned.  When the request fails or the
        response carries no ``Location`` header, ``baidu_url`` is returned
        unchanged so one bad link does not abort the whole page.
        """
        try:
            r = requests.head(baidu_url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            return baidu_url
        return r.headers.get('Location', baidu_url)

    @staticmethod
    def _safe_sheet_name(name):
        """Return ``name`` reduced to a valid Excel worksheet name.

        Excel forbids ``[ ] : * ? / \\`` in sheet names, a leading or trailing
        apostrophe, and names longer than 31 characters.  ``None`` is
        returned for an empty result so that xlsxwriter falls back to its
        default sheet name.
        """
        cleaned = name.translate(str.maketrans({ch: "_" for ch in "[]:*?/\\"}))
        cleaned = cleaned.strip("'")[:31].rstrip("'")
        return cleaned or None

    def write_to_xlsx(self, file_name):
        """Write the collected results to ``<file_name>_<date> .xlsx``.

        Args:
            file_name: prefix of the output file and name of the worksheet
                (sanitised for the worksheet, see ``_safe_sheet_name``).
        """
        # The output file name (including the space after the date) is kept
        # exactly as before so existing consumers still find the file.
        path = f'{file_name}_{time.strftime("%Y-%m-%d ", time.localtime())}.xlsx'
        # The context manager closes (and thereby saves) the workbook even
        # if writing a row fails.
        with xlsxwriter.Workbook(path) as workbook:
            worksheet = workbook.add_worksheet(self._safe_sheet_name(file_name))
            worksheet.write_row('A1', ['标题', '网址'])  # header row: title, URL
            for index, data in enumerate(self.search_datas):
                # Row 1 is the header, so data starts at row 2.
                worksheet.write_row(f'A{index + 2}', data)

        print("搜索结果数据插入excel表格成功！")

    def main(self,keyword,num):
        """Query ``num`` result pages one after another, then export to Excel."""
        for i in range(0, num):
            print(f'正在查询第{i+1}页百度搜索结果数据..')
            self.get_search_objects(self.build_search_url(keyword, i))

        self.write_to_xlsx(keyword)

    def Thread_main(self,keyword,num):
        """Query ``num`` result pages with one thread per page.

        Results are appended by the worker threads as they finish, so their
        order in ``search_datas`` need not follow the page order.  Unlike
        ``main`` this method only prints the results and does not write the
        Excel file.
        """
        threadings=[]
        for i in range(0, num):
            print(f'正在查询第{i+1}页百度搜索结果数据..')
            t=threading.Thread(target=self.get_search_objects,args=(self.build_search_url(keyword, i),))
            threadings.append(t)
            t.start()

        for x in threadings:
            x.join()

        print("多线程查询百度搜索结果完成")

        print(self.search_datas)


if __name__ == "__main__":
    # Demo run: fetch the first ten result pages for one keyword
    # sequentially and export them to Excel.
    keyword = "工业设计"
    num = 10
    spider = Baidu_search()
    spider.main(keyword, num)
    # Threaded alternative (prints the results, writes no Excel file):
    # spider.Thread_main(keyword, num)

版本二

特点

cookies 固定写在请求头中，不随请求变化
数据几乎全部导出，排名也已经写入

#关键词百度搜索结果查询
#20191121 by 微信：huguo00289
# -*- coding: UTF-8 -*-

import requests,time
import urllib.parse
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import json


def ua():
    """Return a random User-Agent string produced by ``fake_useragent``."""
    return UserAgent().random

# Cookie header value sent with every request.  The original script used this
# name without ever defining it, which raised NameError on import.  Paste the
# Cookie string of a Baidu browser session here; when it is left empty no
# Cookie header is sent at all.
Cookie = ''

# Request headers shared by all page fetches in this script.
headers={
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    # 'br' is deliberately not advertised: requests can only decode brotli
    # when an optional brotli package is installed, otherwise the response
    # body comes back as undecoded bytes.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie':Cookie ,
    'Host': 'www.baidu.com',
    'Referer': 'https://www.baidu.com/?tn=48021271_6_hao_pg',
    'Upgrade-Insecure-Requests': '1',
    # Evaluated once at import time: the same random User-Agent is reused for
    # every request of this run.
    'User-Agent':ua()
}
if not Cookie:
    # Do not send an empty Cookie header.
    headers.pop('Cookie')


# Resolve a Baidu redirect link to the real target URL
def get_trueurl(url):
    """Return the ``Location`` target of a Baidu redirect link.

    A HEAD request is sent without following redirects.  When the request
    fails or the response has no ``Location`` header, ``url`` is returned
    unchanged.

    Args:
        url: the redirect link taken from a search result.
    """
    try:
        # A timeout keeps one unresponsive link from hanging the whole run.
        r = requests.head(url, timeout=10)
    except requests.RequestException:
        # Only request failures are treated as "unresolvable"; KeyboardInterrupt
        # and programming errors are no longer swallowed.
        return url
    return r.headers.get('Location', url)

# Fetch a page and return its HTML
def get_response(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    The request uses the module-level ``headers`` and a 10 second timeout.
    The HTTP status code is printed, and the function pauses for two seconds
    before returning.  To go through a proxy, pass a ``proxies`` mapping to
    ``requests.get``.
    """
    page = requests.get(url, headers=headers, timeout=10)
    print(f'状态码：{page.status_code}')
    time.sleep(2)
    # Force UTF-8 rather than relying on the encoding guessed by requests.
    page.encoding = 'utf-8'
    return page.text

# Query the search results for a keyword
def get_bdpm(keyword,num):
    """Print rank, title and real URL of the results for ``keyword``.

    Args:
        keyword: search term; it is percent-encoded before being placed in
            the URL.
        num: number of result pages to query (int or numeric string).

    Raises:
        ValueError: if ``num`` cannot be converted to an int.
    """
    # Percent-encode the keyword so '&', '#', '+' or spaces cannot break the
    # query string.
    wd = urllib.parse.quote(keyword, safe='')
    for i in range(0,int(num)):
        print(f'正在查询{i + 1}页搜索结果...')
        ym=i * 10  # Baidu pages hold ten results each
        url=f"https://www.baidu.com/s?wd={wd}&ie=UTF-8&pn={ym}"
        soup=BeautifulSoup(get_response(url),'lxml')
        container=soup.find('div',id="content_left")
        if container is None:
            # No result container on the page (e.g. a verification page was
            # served instead); report it and move on to the next page.
            print('未找到搜索结果区域，可能触发了百度验证，已跳过该页')
        else:
            for div in container.find_all('div'):
                # Look at the div's own class list: matching on the
                # serialised HTML would also hit wrapper divs that merely
                # contain a result.
                classes=div.get('class') or []
                if isinstance(classes,str):
                    classes=classes.split()
                if not classes or not classes[0].startswith('result'):
                    continue
                link=div.find('a')
                if link is None or not link.get('href'):
                    # A result block without a usable link has nothing to report.
                    continue
                pm=div.get('id','')  # the div id carries the rank; empty if absent
                title=link.get_text().strip()
                zsurl=get_trueurl(link['href'])
                print(pm,title,zsurl)
        time.sleep(5)







if __name__ == '__main__':
    # Interactive loop: keep asking for a keyword and a page count until the
    # process is interrupted.
    while True:
        keyword =input('请输入要查询的关键词：')
        num = input('请输入要查询的页码数：')
        try:
            get_bdpm(keyword,num)
        except Exception as e:
            # Top-level boundary: any failure of a single query (for example
            # a non-numeric page count or a network error) is reported and
            # the prompt is shown again instead of ending the program.
            print(e)
            print("查询结果失败！")