目录结构
文件内容
安装依赖
cnpm install cheerio --save
service/spider.js
'use strict';

const { Service } = require('egg');

/**
 * Service that fetches remote pages on behalf of other parts of the app.
 */
class SpiderService extends Service {
  /**
   * Request the given URL through the context's `curl` helper.
   *
   * @param {string} url - Address to request.
   * @returns {Promise<*>} Whatever `this.ctx.curl(url)` resolves to, unmodified.
   */
  async requestUrl(url) {
    return this.ctx.curl(url);
  }
}

module.exports = SpiderService;
schedule/watchdomain.js
var cheerio = require('cheerio')
module.exports = (app) => {return {schedule:{interval:'10s',type:'all'},async task(ctx){var url = 'https://news.baidu.com/';var result = await ctx.service.spider.requestUrl(url);//buff数据转为utf8var htmlData = result.data.toString();// 乱码转为utf8const $ = cheerio.load(htmlData,{decodeEntities:false})// 拿到网站标题var title = $('title').html();if(title != '百度新闻——全球最大的中文新闻平台'){console.log("网站被修改了")}else{console.log("正常")}//根据class拿到数据$('.hotnews a').each(function(){console.log($(this).html())})}}
}
网站内容
使用 cheerio 爬虫拿到数据并解析