最近为了提升技术,自己写了一个简单的电影链接网站,主要分为三部分:
第一个:网站
点击打开链接
第二个:后台管理
点击打开链接
账号:ag 密码:test@123
第三个:抓取服务
本文重点介绍抓取服务。目前只抓取了两个电影网站的部分信息(仅供技术学习和开发使用)。
现在直接上代码:
Program
using Autofac;
using Autofac.Builder;
using OA.Common.DtoModel;
using Ohye.Film.Application;
using Ohye.Film.Domain;
using Ohye.Film.Infrastructure;
using Ohye.Film.Infrastructure.EFRepositories;
using Ohye.Film.Infrastructure.EFRepositories.UnitOfWork;
using Ohye.Film.Service.Spider;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Reflection;
using System.Text;
using System.Threading.Tasks;

namespace Ohye.Film.Service
{
    /// <summary>
    /// Console host of the crawl service. It wires up the IoC registrations, starts one
    /// crawl immediately, and then polls the clock every ten minutes: once a day during
    /// hour 4 it starts the crawl again, and once a day during hour 6 it requests the
    /// site's home page.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Single client reused for every home-page request; creating a new
        /// <c>HttpClient</c> per request and never disposing it leaks sockets.
        /// </summary>
        private static readonly System.Net.Http.HttpClient IndexHttpClient = new System.Net.Http.HttpClient();

        /// <summary>
        /// Entry point. Never returns; the process is expected to be stopped externally.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Init();
            SpireFilms();

            // Dates ("yyyyMMdd") on which each daily job has already run.
            var spiredFilmDate = new HashSet<string>();
            var createIndexDate = new HashSet<string>();

            while (true)
            {
                // Read the clock once so the date and the hour always belong to the same instant.
                var now = DateTime.Now;
                var date = now.ToString("yyyyMMdd");
                var hour = now.Hour;

                if (hour == 4 && !spiredFilmDate.Contains(date))
                {
                    SpireFilms();
                    spiredFilmDate.Add(date);
                }

                if (hour == 6 && !createIndexDate.Contains(date))
                {
                    // Only mark the day as done when the request really succeeded, so a
                    // failed attempt is retried on the next poll within the same hour.
                    if (TryCreateIndex())
                    {
                        createIndexDate.Add(date);
                    }
                }

                System.Threading.Thread.Sleep(TimeSpan.FromMinutes(10));
            }
        }

        /// <summary>
        /// Requests the site's home page and reports the outcome on the console.
        /// </summary>
        /// <returns><c>true</c> when the server answered with a success status code.</returns>
        private static bool TryCreateIndex()
        {
            try
            {
                // Blocking is acceptable here: this is the console main loop and the
                // result is needed before the day can be marked as done.
                using (var response = IndexHttpClient.GetAsync("http://film.ohyewang.com/").GetAwaiter().GetResult())
                {
                    response.EnsureSuccessStatusCode();
                }

                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine($"生成首页成功:{DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss")}");
                Console.ResetColor();
                return true;
            }
            catch (Exception ex) when (ex is System.Net.Http.HttpRequestException || ex is TaskCanceledException)
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine($"生成首页失败:{ex.Message}");
                Console.ResetColor();
                return false;
            }
        }

        /// <summary>
        /// Starts one spider per configured list page. Each entry is
        /// (list page URL, release year, country label); the MGTV entries pass 0 as year.
        /// </summary>
        private static void SpireFilms()
        {
            var pageList = new List<Tuple<string, int, string>>
            {
                new Tuple<string, int, string>("http://list.iqiyi.com/www/1/2-----------2017--11-1-1-iqiyi--.html", 2017, "美国"),
                new Tuple<string, int, string>("http://list.iqiyi.com/www/1/2-----------2016--11-1-1-iqiyi--.html", 2016, "美国"),
                new Tuple<string, int, string>("http://list.iqiyi.com/www/1/2-----------2015--11-1-1-iqiyi--.html", 2015, "美国"),
                new Tuple<string, int, string>("http://list.iqiyi.com/www/1/1-----------2017--11-1-1-iqiyi--.html", 2017, "华语"),
                new Tuple<string, int, string>("http://list.iqiyi.com/www/1/1-----------2016--11-1-1-iqiyi--.html", 2016, "华语"),
                new Tuple<string, int, string>("http://list.iqiyi.com/www/1/1-----------2015--11-1-1-iqiyi--.html", 2015, "华语"),
            };
            foreach (var page in pageList)
            {
                ISplider iqiyiSpider = new AIQIYI { Url = page.Item1, Year = page.Item2, Country = page.Item3 };
                iqiyiSpider.SpliderResult();
            }

            var pageListMGTV = new List<Tuple<string, int, string>>
            {
                new Tuple<string, int, string>("https://list.mgtv.com/3/a4-537193-------2835073-2-1--a1-.html?channelId=3", 0, "美国"),
                new Tuple<string, int, string>("https://list.mgtv.com/3/a4-49-------2835073-2-1--a1-.html?channelId=3", 0, "华语"),
            };
            foreach (var page in pageListMGTV)
            {
                ISplider mgtvSpider = new mgtv { Url = page.Item1, Year = page.Item2, Country = page.Item3 };
                mgtvSpider.SpliderResult();
            }

            // The qq spider is intentionally not started; its list page was
            // "http://film.qq.com/film_all_list/allfilm.html?type=movie&sort=5" (year 0, "美国").
        }

        /// <summary>
        /// Registers the AutoMapper mappings and the IoC resolve rules.
        /// </summary>
        static void Init()
        {
            AutoMapperConfig.RegisterMappings();
            var builder = IocCenter.ContainerBuilder;
            SetupResolveRules(builder);
        }

        /// <summary>
        /// Adds the registrations this service needs to the given Autofac builder.
        /// </summary>
        /// <param name="builder">Builder to register into.</param>
        static void SetupResolveRules(ContainerBuilder builder)
        {
            var application = Assembly.Load("Ohye.Film.Application");

            builder.Register<OAUser>(c => CreateOAUser()).AsSelf();
            builder.RegisterType<EntityManager>().AsSelf().SingleInstance();

            // Every type in the application assembly whose name ends with "Service".
            builder.RegisterAssemblyTypes(application)
                .Where(t => t.Name.EndsWith("Service"))
                .AsSelf()
                .InstancePerDependency();

            builder.RegisterType<OhyeFilmDbContext>().InstancePerLifetimeScope();
            builder.RegisterType<UnitOfWork>().As<IUnitOfWork>().InstancePerDependency();
            builder.RegisterGeneric(typeof(Repository<>)).As(typeof(IRepository<>)).InstancePerDependency();
        }

        /// <summary>
        /// Builds the fixed user identity registered for <see cref="OAUser"/>.
        /// </summary>
        static OAUser CreateOAUser()
        {
            return new OAUser
            {
                EmplID = "",
                EmplName = "系统管理员",
                DeptID = "",
                DeptName = "总部",
            };
        }
    }
}
Config
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Ohye.Film.Service
{
    /// <summary>
    /// Directory settings used by the crawl service.
    /// </summary>
    public class Config
    {
        /// <summary>
        /// Value of the <c>DataDir</c> entry in the application's appSettings.
        /// </summary>
        public static string DataDir => System.Configuration.ConfigurationManager.AppSettings["DataDir"];

        /// <summary>
        /// <see cref="DataDir"/> with "FilmTemp" appended directly (no separator is
        /// inserted). The directory is created when it does not exist yet.
        /// </summary>
        public static string TempDir
        {
            get
            {
                string tempPath = string.Concat(DataDir, "FilmTemp");
                if (System.IO.Directory.Exists(tempPath))
                {
                    return tempPath;
                }

                System.IO.Directory.CreateDirectory(tempPath);
                return tempPath;
            }
        }
    }
}
SpireClient
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Net;
using System.Net.Http;
using HtmlAgilityPack;

namespace Ohye.Film.Service.Spider
{
    /// <summary>
    /// Small HTTP/HTML helper used by the spiders: downloads pages and evaluates
    /// XPath expressions against HTML fragments.
    /// </summary>
    public class SpireClient
    {
        /// <summary>
        /// One client for the whole process. A spider creates a <see cref="SpireClient"/>
        /// per list page, so a per-instance <c>HttpClient</c> would exhaust sockets.
        /// </summary>
        private static readonly HttpClient SharedHttpClient = new HttpClient();

        /// <summary>
        /// URLs already requested through <see cref="SpireUrl"/> on this instance.
        /// The previous static list was re-created by every constructor call, so it
        /// never worked as a process-wide list and was mutated from several threads
        /// without synchronisation; keeping it per instance makes that behaviour explicit.
        /// </summary>
        private readonly HashSet<string> _spiredUrls = new HashSet<string>();

        /// <summary>
        /// Raised after <see cref="SpireUrl"/> has downloaded a page. The sender is this
        /// client and the argument is the response body.
        /// </summary>
        public event EventHandler<string> Complete;

        public SpireClient()
        {
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and returns the response body as a string.
        /// </summary>
        /// <param name="url">Absolute URL to download.</param>
        /// <returns>The response body.</returns>
        public async Task<string> GetHtml(string url)
        {
            // Callers block on this task, so never resume on a captured context.
            return await SharedHttpClient.GetStringAsync(url).ConfigureAwait(false);
        }

        /// <summary>
        /// Starts downloading <paramref name="url"/> in the background and raises
        /// <see cref="Complete"/> with the body when done. Returns immediately.
        /// A URL already requested on this instance is ignored.
        /// </summary>
        /// <param name="url">Absolute URL to download.</param>
        public void SpireUrl(string url)
        {
            if (!_spiredUrls.Add(url))
            {
                return;
            }

            // Deliberately not awaited: all failures are handled inside the task.
            var ignored = FetchAndNotifyAsync(url);
        }

        /// <summary>
        /// Downloads the page, raises <see cref="Complete"/>, and logs any failure
        /// instead of leaving it as an unobserved task exception.
        /// </summary>
        private async Task FetchAndNotifyAsync(string url)
        {
            try
            {
                string html;
                using (HttpResponseMessage response = await SharedHttpClient.GetAsync(url).ConfigureAwait(false))
                {
                    // The status code is not checked on purpose: as before, the body is
                    // handed to the subscribers whatever the server answered.
                    html = await response.Content.ReadAsStringAsync().ConfigureAwait(false);
                }

                OnGetResult(this, html);
            }
            catch (Exception ex)
            {
                // Top-level boundary of a fire-and-forget operation: report and stop.
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine($"抓取失败:{url} {ex.Message}");
                Console.ForegroundColor = ConsoleColor.Gray;
            }
        }

        private void OnGetResult(object sender, string e)
        {
            Complete?.Invoke(sender, e);
        }

        /// <summary>
        /// Parses <paramref name="content"/> as HTML and returns the nodes matching an
        /// XPath expression.
        /// </summary>
        /// <param name="content">HTML text to parse.</param>
        /// <param name="regex">XPath expression (despite the name, not a regular expression).</param>
        /// <returns>The matching nodes; an empty list when nothing matches.</returns>
        public List<HtmlNode> SelectNodes(string content, string regex)
        {
            HtmlDocument htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(content);

            var htmlNodes = htmlDoc.DocumentNode.SelectNodes(regex);
            if (htmlNodes == null)
            {
                return new List<HtmlNode>();
            }

            return htmlNodes.ToList();
        }
    }
}
HttpImage
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Net.Http;
using System.IO;
using OA.Infrastructure;

namespace Ohye.Film.Service.Spider
{
    /// <summary>
    /// Downloads images over HTTP and stores them through <c>MongoContext.Mongo.SaveFile</c>.
    /// </summary>
    public class HttpImage
    {
        /// <summary>
        /// One client for the whole process; a new <c>HttpClient</c> per download leaks sockets.
        /// </summary>
        private static readonly HttpClient SharedHttpClient = new HttpClient();

        /// <summary>
        /// Downloads the image at <paramref name="url"/>, writes it to a temporary file
        /// under <c>Config.TempDir</c>, passes that file to <c>MongoContext.Mongo.SaveFile</c>
        /// and deletes the temporary file again.
        /// </summary>
        /// <param name="url">Absolute image URL.</param>
        /// <returns>The identifier returned by <c>SaveFile</c>.</returns>
        /// <exception cref="ArgumentException">The URL has no file name after its last '/'.</exception>
        public string GetImg(string url)
        {
            // The method is synchronous by contract, so the download has to be waited for.
            byte[][] images = DownloadPicAsync(new string[] { url }).GetAwaiter().GetResult();

            string imageName = GetImageName(url);
            string filePath = Path.Combine(Config.TempDir, imageName);
            try
            {
                // WriteAllBytes truncates an existing file; FileMode.OpenOrCreate left the
                // tail of a longer, older file with the same name in place.
                File.WriteAllBytes(filePath, images[0]);
                Console.WriteLine("成功下载图片:" + imageName);

                return MongoContext.Mongo.SaveFile(filePath);
            }
            finally
            {
                // Remove the temporary file even when saving failed.
                File.Delete(filePath);
            }
        }

        /// <summary>
        /// Downloads several images concurrently.
        /// </summary>
        /// <param name="urls">Image URLs to download.</param>
        /// <returns>The downloaded bytes, in the same order as <paramref name="urls"/>.</returns>
        public async Task<byte[][]> DownloadPicAsync(IEnumerable<string> urls)
        {
            Task<byte[]>[] downloadTask = urls.Select(r => SharedHttpClient.GetByteArrayAsync(r)).ToArray();
            byte[][] data = await Task.WhenAll(downloadTask).ConfigureAwait(false);
            return data;
        }

        /// <summary>
        /// Returns the part of the URL after its last '/', without query string or fragment.
        /// </summary>
        private static string GetImageName(string url)
        {
            string name = url.Substring(url.LastIndexOf('/') + 1);

            // '?' is not a valid file-name character on Windows, so drop "?x=y" / "#frag".
            int end = name.IndexOfAny(new[] { '?', '#' });
            if (end >= 0)
            {
                name = name.Substring(0, end);
            }

            if (name.Length == 0)
            {
                throw new ArgumentException("The image URL does not contain a file name: " + url, nameof(url));
            }

            return name;
        }
    }
}
AIQIYI
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Ohye.Film.DTO.Film;
using Ohye.Film.Application.Film;
using Autofac;
using Ohye.Film.Infrastructure.Enums;
using Ohye.Film.Infrastructure;

namespace Ohye.Film.Service.Spider
{
    /// <summary>
    /// Spider for iqiyi list pages. It downloads the list page at <see cref="Url"/>,
    /// stores every film found on it through <c>ProductService</c>, and then continues
    /// with the next page of the pager.
    /// </summary>
    public class AIQIYI : ISplider
    {
        public AIQIYI()
        {
        }

        /// <summary>List page to crawl; replaced by the next page's URL while paging.</summary>
        public string Url { get; set; }

        /// <summary>Year stored on every film found by this spider.</summary>
        public int Year { get; set; }

        /// <summary>Country label stored on every film found by this spider.</summary>
        public string Country { get; set; }

        /// <summary>
        /// Starts downloading <see cref="Url"/>; the page is processed when the download completes.
        /// </summary>
        public void SpliderResult()
        {
            SpireClient spireClient = new SpireClient();
            spireClient.Complete += SpireClient_Complete;
            spireClient.SpireUrl(Url);
        }

        /// <summary>
        /// Handles a downloaded list page: processes each film entry, then follows the pager.
        /// </summary>
        private void SpireClient_Complete(object sender, string html)
        {
            SpireClient client = (SpireClient)sender;

            var productNodes = client.SelectNodes(html, "//ul[contains(@class,'site-piclist')]/li");
            foreach (var p in productNodes)
            {
                string productName = null;
                try
                {
                    // Parsing happens inside the try block so that one entry with unexpected
                    // markup is reported and skipped instead of aborting the whole page
                    // (including the pagination below).
                    var linkPic = client.SelectNodes(p.InnerHtml, "//div[@class='site-piclist_pic']/a").SingleOrDefault();
                    bool canRead = !client.SelectNodes(linkPic.InnerHtml, "//p[@class='viedo_lt ']").Any();
                    productName = linkPic.Attributes.Where(x => x.Name == "title").SingleOrDefault().Value.Trim();
                    var detailURL = linkPic.Attributes.Where(x => x.Name == "href").SingleOrDefault().Value.Trim();
                    var detailHtml = client.GetHtml(detailURL).Result;
                    var introduction = client.SelectNodes(detailHtml, "//span[@id='data-videoInfoDes']").SingleOrDefault()?.InnerText.Trim();
                    var imgUrl = client.SelectNodes(linkPic.InnerHtml, "//img").SingleOrDefault().Attributes.Where(x => x.Name == "src").SingleOrDefault().Value.Trim();
                    var duration = client.SelectNodes(linkPic.InnerHtml, "//span[@class='icon-vInfo']").SingleOrDefault().InnerText.Trim();
                    var linkInfo = client.SelectNodes(p.InnerHtml, "//div[@class='site-piclist_info']").SingleOrDefault();
                    var score = client.SelectNodes(linkInfo.InnerHtml, "//span[@class='score']").SingleOrDefault().InnerText.Trim();
                    var authors = client.SelectNodes(linkInfo.InnerHtml, "//div[@class='role_info']/em/a").SelectMany(x => x.Attributes).Where(x => x.Name == "title").Select(x => x.Value).ToList();

                    var name = productName; // stable copy for the callback below
                    HttpImage httpImage = new HttpImage();
                    IocCenter.Resolve<ProductService>(_productService =>
                    {
                        if (!_productService.CheckExisted(name))
                        {
                            // NOTE(review): a value that TimeSpan.TryParse rejects silently becomes 00:00:00.
                            TimeSpan dur;
                            TimeSpan.TryParse(duration, out dur);

                            FM_ProductDTO product = new FM_ProductDTO
                            {
                                ID = Guid.NewGuid(),
                                Name = name,
                                CategoryID = Guid.Parse("d012fcc6-b25a-447c-b079-95cc293a3f92"),
                                Year = Year,
                                Score = ParseScore(score),
                                Duration = dur,
                                CanRead = canRead,
                                ImageID = null,
                                IsDeleted = false,
                                Country = Country,
                                Content = new FM_ContentDTO
                                {
                                    ID = Guid.NewGuid(),
                                    Introduction = introduction,
                                    ReadCount = 0,
                                    DownLoadCount = 0
                                },
                                LinkList = !canRead ? new List<FM_LinkDTO>() : CreatePlayLinks(detailURL),
                                AuthorList = authors.Select(x => new FM_AuthorDTO
                                {
                                    ID = Guid.NewGuid(),
                                    AuhorType = AuhorType.Main,
                                    Name = x
                                }).ToList()
                            };
                            product.ImageID = httpImage.GetImg(imgUrl);
                            _productService.Add(product);

                            Console.ForegroundColor = ConsoleColor.DarkGreen;
                            Console.WriteLine(name);
                            Console.ForegroundColor = ConsoleColor.Gray;
                        }
                        else if (canRead)
                        {
                            var productInfo = _productService.CheckCanRead(name);
                            if (!productInfo.Item1)
                            {
                                Console.ForegroundColor = ConsoleColor.Green;
                                Console.WriteLine($"发现新可播放电影:{name}");
                                Console.ForegroundColor = ConsoleColor.Gray;

                                // The stored film was not playable yet: replace its links.
                                _productService.UpdateLink(productInfo.Item2, CreatePlayLinks(detailURL));
                            }
                        }
                        else
                        {
                            Console.WriteLine($"已存在:{name}");
                        }
                    });
                }
                catch (Exception ex)
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine(productName + ex.Message + ex.InnerException);
                    Console.WriteLine("failed");
                    // Do not leave the console red for unrelated output.
                    Console.ForegroundColor = ConsoleColor.Gray;
                }
            }

            FollowNextPage(client, html);
        }

        /// <summary>
        /// Looks up the pager link whose number is the current page + 1 and crawls it.
        /// </summary>
        private void FollowNextPage(SpireClient client, string html)
        {
            var currentPage = client.SelectNodes(html, "//div[@class='mod-page']/span[@class='curPage']").SingleOrDefault();
            if (currentPage == null)
            {
                return;
            }

            int pageIndex;
            if (!Int32.TryParse(currentPage.InnerText, out pageIndex))
            {
                return;
            }

            var nextPageIndex = pageIndex + 1;
            foreach (var link in client.SelectNodes(html, "//div[@class='mod-page']/a[@data-search-page='item']"))
            {
                var key = link.Attributes["data-key"]?.Value;
                var href = link.Attributes["href"]?.Value;

                // Non-numeric keys ("up" / "down") and links without href are not page links.
                int page;
                if (href == null || !Int32.TryParse(key, out page) || page != nextPageIndex)
                {
                    continue;
                }

                Url = $"http://list.iqiyi.com/{href}";
                SpliderResult();
                break; // one request per next page, even if the pager markup is repeated
            }
        }

        /// <summary>
        /// Builds the single approved play link pointing at the film's detail page.
        /// </summary>
        private static List<FM_LinkDTO> CreatePlayLinks(string detailUrl)
        {
            return new List<FM_LinkDTO>
            {
                new FM_LinkDTO
                {
                    ID = Guid.NewGuid(),
                    Address = detailUrl,
                    AuditStatus = AuditStatus.AuditPass,
                    AuditTime = DateTime.Now,
                    LinkType = LinkType.PlayUrl
                }
            };
        }

        /// <summary>
        /// Parses a score such as "8.5" independent of the machine's culture;
        /// an empty score counts as 0.
        /// </summary>
        private static decimal ParseScore(string text)
        {
            if (string.IsNullOrWhiteSpace(text))
            {
                return 0m;
            }

            return decimal.Parse(text, System.Globalization.NumberStyles.Number, System.Globalization.CultureInfo.InvariantCulture);
        }
    }
}
mgtv
using Ohye.Film.Application.Film;
using Ohye.Film.DTO.Film;
using Ohye.Film.Infrastructure;
using Ohye.Film.Infrastructure.Enums;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Ohye.Film.Service.Spider
{
    /// <summary>
    /// Spider for mgtv list pages. It downloads the list page at <see cref="Url"/>,
    /// stores every film found on it through <c>ProductService</c>, and then continues
    /// with the next page of the pager.
    /// </summary>
    public class mgtv : ISplider
    {
        public mgtv()
        {
        }

        /// <summary>List page to crawl; replaced by the next page's URL while paging.</summary>
        public string Url { get; set; }

        /// <summary>Year stored on every film found by this spider.</summary>
        public int Year { get; set; }

        /// <summary>Country label stored on every film found by this spider.</summary>
        public string Country { get; set; }

        /// <summary>
        /// Starts downloading <see cref="Url"/>; the page is processed when the download completes.
        /// </summary>
        public void SpliderResult()
        {
            SpireClient spireClient = new SpireClient();
            spireClient.Complete += SpireClient_Complete;
            spireClient.SpireUrl(Url);
        }

        /// <summary>
        /// Handles a downloaded list page: processes each film entry, then follows the pager.
        /// </summary>
        private void SpireClient_Complete(object sender, string html)
        {
            SpireClient client = (SpireClient)sender;

            var productNodes = client.SelectNodes(html, "//ul/li[contains(@class,'m-result-list-item')]");
            foreach (var p in productNodes)
            {
                string productName = null;
                try
                {
                    // Parsing happens inside the try block so that one entry with unexpected
                    // markup is reported and skipped instead of aborting the whole page
                    // (including the pagination below).
                    var linkPic = client.SelectNodes(p.InnerHtml, "//a[contains(@class,'u-video u-video-y')]").SingleOrDefault();
                    bool canRead = !client.SelectNodes(linkPic.InnerHtml, "//i[@class='mark-v']").Any();
                    productName = client.SelectNodes(p.InnerHtml, "//a[@class='u-title']").FirstOrDefault().InnerHtml.Trim();
                    var detailURL = ToHttpsUrl(linkPic.Attributes.Where(x => x.Name == "href").SingleOrDefault().Value.Trim());
                    var detailHtml = client.GetHtml(detailURL).Result;
                    var introduction = client.SelectNodes(detailHtml, "//p[@class='u-meta-intro']/span[@class='details']").FirstOrDefault()?.InnerText.Trim();
                    var imgUrl = ToHttpsUrl(client.SelectNodes(linkPic.InnerHtml, "//img[@class='u-pic']").SingleOrDefault().Attributes.Where(x => x.Name == "src").SingleOrDefault().Value.Trim());
                    var duration = ""; // the list page provides no duration
                    var score = client.SelectNodes(linkPic.InnerHtml, "//em[@class='u-meta']").SingleOrDefault().InnerText.Trim();
                    var authors = client.SelectNodes(p.InnerHtml, "//span[@class='u-desc']/a").SelectMany(x => x.Attributes).Where(x => x.Name == "title").Select(x => x.Value).ToList();

                    var name = productName; // stable copy for the callback below
                    HttpImage httpImage = new HttpImage();
                    IocCenter.Resolve<ProductService>(_productService =>
                    {
                        if (!_productService.CheckExisted(name))
                        {
                            // An empty duration fails to parse, leaving dur at 00:00:00.
                            TimeSpan dur;
                            TimeSpan.TryParse(duration, out dur);

                            FM_ProductDTO product = new FM_ProductDTO
                            {
                                ID = Guid.NewGuid(),
                                Name = name,
                                CategoryID = Guid.Parse("d012fcc6-b25a-447c-b079-95cc293a3f92"),
                                Year = Year,
                                Score = ParseScore(score),
                                Duration = dur,
                                CanRead = canRead,
                                ImageID = null,
                                IsDeleted = false,
                                Country = Country,
                                Content = new FM_ContentDTO
                                {
                                    ID = Guid.NewGuid(),
                                    Introduction = introduction,
                                    ReadCount = 0,
                                    DownLoadCount = 0
                                },
                                LinkList = !canRead ? new List<FM_LinkDTO>() : CreatePlayLinks(detailURL),
                                AuthorList = authors.Select(x => new FM_AuthorDTO
                                {
                                    ID = Guid.NewGuid(),
                                    AuhorType = AuhorType.Main,
                                    Name = x
                                }).ToList()
                            };
                            product.ImageID = httpImage.GetImg(imgUrl);
                            _productService.Add(product);

                            Console.ForegroundColor = ConsoleColor.DarkGreen;
                            Console.WriteLine(name);
                            Console.ForegroundColor = ConsoleColor.Gray;
                        }
                        else if (canRead)
                        {
                            var productInfo = _productService.CheckCanRead(name);
                            if (!productInfo.Item1)
                            {
                                Console.ForegroundColor = ConsoleColor.Green;
                                Console.WriteLine($"发现新可播放电影:{name}");
                                Console.ForegroundColor = ConsoleColor.Gray;

                                // The stored film was not playable yet: replace its links.
                                _productService.UpdateLink(productInfo.Item2, CreatePlayLinks(detailURL));
                            }
                        }
                        else
                        {
                            Console.WriteLine($"已存在:{name}");
                        }
                    });
                }
                catch (Exception ex)
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine(productName + ex.Message + ex.InnerException);
                    Console.WriteLine("failed");
                    // Do not leave the console red for unrelated output.
                    Console.ForegroundColor = ConsoleColor.Gray;
                }
            }

            FollowNextPage(client, html);
        }

        /// <summary>
        /// Looks up the pager link whose number is the current page + 1 and crawls it.
        /// </summary>
        private void FollowNextPage(SpireClient client, string html)
        {
            var pages = client.SelectNodes(html, "//div[contains(@class,'w-pages w-pages-default')]/ul/li/a");

            // FirstOrDefault: a repeated pager must not make the lookup throw.
            var currentPage = pages.FirstOrDefault(p => p.Attributes["class"] != null && p.Attributes["class"].Value == "current");
            if (currentPage == null)
            {
                return;
            }

            int pageIndex;
            if (!Int32.TryParse(currentPage.InnerText, out pageIndex))
            {
                return;
            }

            var nextPageIndex = pageIndex + 1;
            foreach (var link in pages)
            {
                var href = link.Attributes["href"]?.Value;

                // TryParse skips every link whose text is not a page number
                // ("...", blanks, previous/next arrows) instead of throwing on it.
                int page;
                if (href == null || !Int32.TryParse(link.InnerText, out page) || page != nextPageIndex)
                {
                    continue;
                }

                Url = $"https://list.mgtv.com/{href}";
                SpliderResult();
                break; // one request per next page, even if the pager markup is repeated
            }
        }

        /// <summary>
        /// Turns a protocol-relative URL ("//host/path") into an https URL; any other
        /// URL is returned unchanged instead of losing its first two characters.
        /// </summary>
        private static string ToHttpsUrl(string url)
        {
            return url.StartsWith("//", StringComparison.Ordinal) ? "https:" + url : url;
        }

        /// <summary>
        /// Builds the single approved play link pointing at the film's detail page.
        /// </summary>
        private static List<FM_LinkDTO> CreatePlayLinks(string detailUrl)
        {
            return new List<FM_LinkDTO>
            {
                new FM_LinkDTO
                {
                    ID = Guid.NewGuid(),
                    Address = detailUrl,
                    AuditStatus = AuditStatus.AuditPass,
                    AuditTime = DateTime.Now,
                    LinkType = LinkType.PlayUrl
                }
            };
        }

        /// <summary>
        /// Parses a score such as "8.5" independent of the machine's culture;
        /// an empty score counts as 0.
        /// </summary>
        private static decimal ParseScore(string text)
        {
            if (string.IsNullOrWhiteSpace(text))
            {
                return 0m;
            }

            return decimal.Parse(text, System.Globalization.NumberStyles.Number, System.Globalization.CultureInfo.InvariantCulture);
        }
    }
}
using Ohye.Film.Application.Film;
using Ohye.Film.DTO.Film;
using Ohye.Film.Infrastructure;
using Ohye.Film.Infrastructure.Enums;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Ohye.Film.Service.Spider
{
    /// <summary>
    /// Spider for the qq film list page. It downloads the list page at <see cref="Url"/>
    /// and stores every film found on it through <c>ProductService</c>.
    /// NOTE(review): the paging code still uses the iqiyi pager XPath and the iqiyi host,
    /// so it looks like an unfinished copy of the iqiyi spider — confirm before enabling.
    /// </summary>
    public class qq : ISplider
    {
        public qq()
        {
        }

        /// <summary>List page to crawl; replaced by the next page's URL while paging.</summary>
        public string Url { get; set; }

        /// <summary>Year stored on every film found by this spider.</summary>
        public int Year { get; set; }

        /// <summary>Country label stored on every film found by this spider.</summary>
        public string Country { get; set; }

        /// <summary>
        /// Starts downloading <see cref="Url"/>; the page is processed when the download completes.
        /// </summary>
        public void SpliderResult()
        {
            SpireClient spireClient = new SpireClient();
            spireClient.Complete += SpireClient_Complete;
            spireClient.SpireUrl(Url);
        }

        /// <summary>
        /// Handles a downloaded list page: processes each film entry, then follows the pager.
        /// </summary>
        private void SpireClient_Complete(object sender, string html)
        {
            SpireClient client = (SpireClient)sender;

            var productNodes = client.SelectNodes(html, "//ul[@class='figures_list']/li");
            foreach (var p in productNodes)
            {
                string productName = null;
                try
                {
                    // Parsing happens inside the try block so that one entry with unexpected
                    // markup is reported and skipped instead of aborting the whole page
                    // (including the pagination below).
                    var linkPic = client.SelectNodes(p.InnerHtml, "//a").SingleOrDefault();
                    bool canRead = !client.SelectNodes(linkPic.InnerHtml, "//i[@class='mark_v ']").Any();
                    productName = linkPic.Attributes.Where(x => x.Name == "title").SingleOrDefault().Value.Trim();
                    var detailURL = linkPic.Attributes.Where(x => x.Name == "href").SingleOrDefault().Value.Trim();
                    var detailHtml = client.GetHtml(detailURL).Result;
                    // NOTE(review): same element id as in the iqiyi spider — verify it exists on qq detail pages.
                    var introduction = client.SelectNodes(detailHtml, "//span[@id='data-videoInfoDes']").SingleOrDefault()?.InnerText.Trim();
                    var imgUrl = client.SelectNodes(linkPic.InnerHtml, "//img").SingleOrDefault().Attributes.Where(x => x.Name == "src").SingleOrDefault().Value.Trim();
                    var duration = ""; // no duration is read for qq
                    var linkInfo = client.SelectNodes(p.InnerHtml, "//div[@class='figure_title_score']").SingleOrDefault();
                    // The score is assembled from two elements: integer part + "." + fraction part.
                    // NOTE(review): the class names differ ('score_l' with a letter l vs 'score_2'
                    // with a digit) and the two lookups start from different nodes — confirm.
                    var score = client.SelectNodes(linkInfo.InnerHtml, "//div[@class='figure_score']/em[@class='score_l']").SingleOrDefault().InnerText.Trim()
                        + "."
                        + client.SelectNodes(p.InnerHtml, "//div[@class='figure_score']/em[@class='score_2']").SingleOrDefault().InnerText.Trim();
                    var authors = new List<string>(); // no authors are read for qq

                    var name = productName; // stable copy for the callback below
                    HttpImage httpImage = new HttpImage();
                    IocCenter.Resolve<ProductService>(_productService =>
                    {
                        if (!_productService.CheckExisted(name))
                        {
                            // An empty duration fails to parse, leaving dur at 00:00:00.
                            TimeSpan dur;
                            TimeSpan.TryParse(duration, out dur);

                            FM_ProductDTO product = new FM_ProductDTO
                            {
                                ID = Guid.NewGuid(),
                                Name = name,
                                CategoryID = Guid.Parse("d012fcc6-b25a-447c-b079-95cc293a3f92"),
                                Year = Year,
                                Score = ParseScore(score),
                                Duration = dur,
                                CanRead = canRead,
                                ImageID = null,
                                IsDeleted = false,
                                Country = Country,
                                Content = new FM_ContentDTO
                                {
                                    ID = Guid.NewGuid(),
                                    Introduction = introduction,
                                    ReadCount = 0,
                                    DownLoadCount = 0
                                },
                                LinkList = !canRead ? new List<FM_LinkDTO>() : CreatePlayLinks(detailURL),
                                AuthorList = authors.Select(x => new FM_AuthorDTO
                                {
                                    ID = Guid.NewGuid(),
                                    AuhorType = AuhorType.Main,
                                    Name = x
                                }).ToList()
                            };
                            product.ImageID = httpImage.GetImg(imgUrl);
                            _productService.Add(product);

                            Console.ForegroundColor = ConsoleColor.DarkGreen;
                            Console.WriteLine(name);
                            Console.ForegroundColor = ConsoleColor.Gray;
                        }
                        else if (canRead)
                        {
                            var productInfo = _productService.CheckCanRead(name);
                            if (!productInfo.Item1)
                            {
                                Console.ForegroundColor = ConsoleColor.Green;
                                Console.WriteLine($"发现新可播放电影:{name}");
                                Console.ForegroundColor = ConsoleColor.Gray;

                                // The stored film was not playable yet: replace its links.
                                _productService.UpdateLink(productInfo.Item2, CreatePlayLinks(detailURL));
                            }
                        }
                        else
                        {
                            Console.WriteLine($"已存在:{name}");
                        }
                    });
                }
                catch (Exception ex)
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine(productName + ex.Message + ex.InnerException);
                    Console.WriteLine("failed");
                    // Do not leave the console red for unrelated output.
                    Console.ForegroundColor = ConsoleColor.Gray;
                }
            }

            FollowNextPage(client, html);
        }

        /// <summary>
        /// Looks up the pager link whose number is the current page + 1 and crawls it.
        /// NOTE(review): XPath expressions and host below are the iqiyi ones — confirm
        /// the correct values for qq before relying on paging.
        /// </summary>
        private void FollowNextPage(SpireClient client, string html)
        {
            var currentPage = client.SelectNodes(html, "//div[@class='mod-page']/span[@class='curPage']").SingleOrDefault();
            if (currentPage == null)
            {
                return;
            }

            int pageIndex;
            if (!Int32.TryParse(currentPage.InnerText, out pageIndex))
            {
                return;
            }

            var nextPageIndex = pageIndex + 1;
            foreach (var link in client.SelectNodes(html, "//div[@class='mod-page']/a[@data-search-page='item']"))
            {
                var key = link.Attributes["data-key"]?.Value;
                var href = link.Attributes["href"]?.Value;

                // Non-numeric keys ("up" / "down") and links without href are not page links.
                int page;
                if (href == null || !Int32.TryParse(key, out page) || page != nextPageIndex)
                {
                    continue;
                }

                Url = $"http://list.iqiyi.com/{href}";
                SpliderResult();
                break; // one request per next page, even if the pager markup is repeated
            }
        }

        /// <summary>
        /// Builds the single approved play link pointing at the film's detail page.
        /// </summary>
        private static List<FM_LinkDTO> CreatePlayLinks(string detailUrl)
        {
            return new List<FM_LinkDTO>
            {
                new FM_LinkDTO
                {
                    ID = Guid.NewGuid(),
                    Address = detailUrl,
                    AuditStatus = AuditStatus.AuditPass,
                    AuditTime = DateTime.Now,
                    LinkType = LinkType.PlayUrl
                }
            };
        }

        /// <summary>
        /// Parses a score such as "8.5" independent of the machine's culture;
        /// an empty score counts as 0.
        /// </summary>
        private static decimal ParseScore(string text)
        {
            if (string.IsNullOrWhiteSpace(text))
            {
                return 0m;
            }

            return decimal.Parse(text, System.Globalization.NumberStyles.Number, System.Globalization.CultureInfo.InvariantCulture);
        }
    }
}
ISplider
using Ohye.Film.DTO.Film;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Ohye.Film.Service.Spider
{
    /// <summary>
    /// Contract shared by the site spiders: a page address plus the operation that crawls it.
    /// </summary>
    public interface ISplider
    {
        /// <summary>
        /// Address of the page the spider works on.
        /// </summary>
        string Url { get; set; }

        /// <summary>
        /// Runs the spider against <see cref="Url"/>.
        /// </summary>
        void SpliderResult();
    }
}
后续....
感兴趣的可以加入下面群