实体店选址大数据分析-保定地区-POI数据-powermap-爬虫获取真实数据
讲一下项目的思路
预期通过获取POI数据进行分析,为实体店选址决策提供依据
数据源
58同城店铺转让数据
安居客小区详细信息数据
安居客小区平均房价数据
上爬虫代码
package wubaSpider;import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import wubaSpider.pojo.AnjukePojo;
import Util.AnjukeDataUtil;
import Util.AnjukeDataUtil2;
import bossSpider.Blibli_Selemium;public class Anjuke_spider3 implements PageProcessor{List<String> list=new ArrayList<String>();List<String> list2=new ArrayList<String>();int z=0;private Site site = Site.me().setRetryTimes(1).setSleepTime(20000);public Site getSite() {return site;}public void process(Page page) {page.putField("author", page.getUrl().toString());if (page.getResultItems().get("name") == null) {page.setSkip(true);}z++;String a=page.getHtml().css("h1").toString();String a0=page.getHtml().getDocument().getElementsByClass("basic-infos-box").toString();if (a!=null||a0!=null) {String str=a+a0+"/001";list2.add(str);System.out.println(str);}else {list2.add(list.get(z-1)+"/001");}IOwriteData("G:五八爬虫数据", "list10Demo223333.txt", "utf-8", removeDuplicate(list2));System.out.println("已经进行第"+z+"次爬取");HashSet<String>set=new HashSet<String>();String path="G:\\五八爬虫数据\\房地产\\demo";try {AnjukeDataUtil.getCollectionsData(path, set);list.addAll(set);} catch (Exception e) {e.printStackTrace();}page.addTargetRequests(removeDuplicate(list));} public static void main(String[] args) throws IOException {//创建爬虫实体类Anjuke_spider3 selemium=new Anjuke_spider3();//设置selemium浏览器配置驱动SeleniumDownloader seleniumDownloader=new SeleniumDownloader("G:\\爬虫\\drive\\chromedriver.exe");//配置当前浏览器配置System.setProperty("selenuim_config", "G:\\workspace\\Git\\webmagic\\config.ini"); //request类型配置,使用responsebody配置请求头,调用method方法选择post/get请求Request request=new Request();//设置被爬取页面String st="https://baoding.anjuke.com/community/view/1019536";//爬虫入口Spider.create(selemium).addUrl(st).setDownloader(seleniumDownloader).thread(1)//启动n个线程(此语句表示启动3个线程).run();//启动爬虫,会阻塞当前线程执行(及n个线程不是同时执行的)
// seleniumDownloader.close();
}private static void IOwriteData(String dataPath,String fileName,String charset,List<String> list){String str;FileOutputStream fos = null;OutputStreamWriter writer = null;try {fos=new FileOutputStream(dataPath+fileName, true);writer = new OutputStreamWriter(fos,charset);for (String string : list) {str = string;writer.append(str);}} catch (Exception e) {e.printStackTrace();} finally {try {writer.close();fos.close();} catch (IOException e) {e.printStackTrace();}}
}
public static List removeDuplicate(List list) { HashSet h = new HashSet(list); list.clear(); list.addAll(h); return list;
}
}
因为都差不多,所以不一个一个放了
值得说的是,58的反爬虫跟安居客比起来还差点意思。
爬虫使用selenium+webmagic写的,数据量不大图省事
不过即使图省事,安居客还是在封我ip,大家可以多试几次找到不被封的接口就搞定啦
然后是数据处理
这里用dom4j方便一点,我自己试着用正则和split写了下麻烦的要死
package Util;import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;import wubaSpider.pojo.AnjukePojo;
import wubaSpider.pojo.AnjukePojo2;
import wubaSpider.pojo.pojo;public class AnjukeDataUtil3 {public static void main(String[] args) throws Exception {// G://五八爬虫数据HashSet<String>set=new HashSet<String>();String path="G:\\五八爬虫数据\\安居客爬虫数据\\详细数据";getCollectionsData(path, set);List<AnjukePojo2> list =getData(set);for (AnjukePojo2 anjukePojo2 : list) {System.out.println(anjukePojo2);}}public static List<AnjukePojo2> getData(HashSet<String>set) throws Exception{List<AnjukePojo2> list=new ArrayList<AnjukePojo2>();for (String string : set) {AnjukePojo2 obj=new AnjukePojo2();final String startStr = "<?xml version=\"1.0\" encoding=\"utf-8\" ?> <root>";final String endStr = "</root>";String str=string.replace(" ", " ");Document doc =DocumentHelper.parseText(startStr+str+endStr);Element rootElt = doc.getRootElement(); // 获取根节点Element name = (Element) rootElt.selectSingleNode("h1");Element addr = (Element) rootElt.selectSingleNode("//span[@class='sub-hd']");Element price = (Element) rootElt.selectSingleNode("//span[@class='average']");try {if (name.getText().isEmpty()||name.getText()==null) {System.out.println(string);}else {obj.setName(name.getText().trim());}obj.setAddr(addr.getText().trim());obj.setPrice(Integer.parseInt(price.getText().trim()));} catch (Exception e) {e.printStackTrace();}List<Element> listMessage = rootElt.selectNodes("//dd");for (int j = 0; j < listMessage.size(); j++) {obj.setRopertyType(listMessage.get(0).getText().trim());obj.setRopertyCost(listMessage.get(1).getText().trim());obj.setArea(listMessage.get(2).getText().trim());if (listMessage.get(3).getText().trim().equalsIgnoreCase("暂无数据")!=true) {int a=Integer.parseInt(listMessage.get(3).getText().trim().replace("户", ""));obj.setUv(a);}if (listMessage.get(4).getText().trim().equalsIgnoreCase("暂无数据")!=true) {int year=Integer.parseInt(listMessage.get(4).getText().trim().replace("年", ""));obj.setYear(year);}if (listMessage.get(5).getText().trim().equalsIgnoreCase("暂无数据")!=true) {int 
park=Integer.parseInt(listMessage.get(5).getText().trim());obj.setParkings(park);}obj.setPotRatio(listMessage.get(6).getText().trim());obj.setGreeningRate(listMessage.get(7).getText().trim());obj.setDeveloper(listMessage.get(8).getText().trim());obj.setPropertyAgent(listMessage.get(9).getText().trim());obj.setBusinessDistrict(listMessage.get(10).getText().trim());}System.out.println(obj+"------");list.add(obj);}return list;}public static HashSet<String> getCollectionsData(String path,HashSet<String> set) throws Exception{List<String> list =new ArrayList<String>();List<String> list2 =new ArrayList<String>();getFiles(path, list);list2=readFileFromFilePath(list);int i=0;for (String string : list2) {String[] arr=string.split("/001");for (String string2 : arr) {if (string2.trim().isEmpty()!=true) {i++;set.add(string2);}}}System.out.println(set.size());//去重后577***去重前7298return set;}/*** 边缘校验*/public static void checkFilePath(String path) {if (path.trim().length() == 0 || path == null) {throw new IllegalArgumentException("路径不合法");}}public static void getFiles(String path, List<String> list) {checkFilePath(path);File file = new File(path);if (file.isDirectory()) {File[] files = file.listFiles();for (int i = 0; i < files.length; i++) {if (files[i].isDirectory()) {getFiles(files[i].getPath(), list);} else {list.add(files[i].getPath());}}} else {list.add(file.getPath());}}public static List<String> readFileFromFilePath(List<String> list)throws IOException {FileInputStream fil;BufferedReader bfr = null;StringBuffer sbf = new StringBuffer();List<String> stringData = new ArrayList<String>();for (String string : list) {checkFilePath(string);File file = new File(string);try {bfr = new BufferedReader(new java.io.FileReader(file));String tempStr;while ((tempStr = bfr.readLine()) != null) {sbf.append(tempStr + "\n");}bfr.close();stringData.add(sbf.toString());} catch (FileNotFoundException e) {e.printStackTrace();}}return stringData;}
}
这个看看
package Util;import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import wubaSpider.pojo.pojo;public class wubaDataDemo {public static void main(String[] args) throws Exception {// G://五八爬虫数据HashSet<String>set=new HashSet<String>();set=getCollectionsData("G://五八爬虫数据");System.out.println(set.size());HashSet<pojo>set2=getTheData(set);for (pojo pojo : set2) {System.out.println(pojo);}System.out.println(set2.size());}public static HashSet<pojo> getTheData(HashSet<String> set){HashSet<pojo>set2=new HashSet<pojo>();for (String string : set) {pojo obj=new pojo();//名称String str=string.trim().split("</span> </h2>")[0];String[] arr=str.trim().split("<h2 class=\"title\"> <span class=\"title_des\">");for (int i = 0; i < arr.length; i++) {String st=arr[i];if (i>0) {if (st.length()<20) {obj.setName(st);
// System.out.println(st+"----------------");}else {String str2=string.trim().split("</span> <i class=\"icon icon-j-x\" title=")[0];String[] arr2=str.trim().split("<h2 class=\"title\"> <span class=\"title_des\">");obj.setName(arr2[i]);}}}//地区解析 String baseinfo=string.trim().split("</span> <i class=\"split\"></i> <span class=\"withI\">")[0];
// System.out.println(baseinfo+"------------");String[] baseinfo2=baseinfo.trim().split("<p class=\"baseinfo\"> <span class=\"withI\">");for (int i = 0; i < baseinfo2.length; i++) {if (i>0) {if (baseinfo2[i].trim().length()>10) {obj.setRegion(baseinfo2[i].split("</span> <i class=\"split\"></i>")[0]);}else {obj.setRegion(baseinfo2[i]);}}}//position位置String position=string.trim().split("</span> <i class=\"split\"></i> <span>")[0];String[] pos2=position.trim().split("</span> <i class=\"split\"></i> <span class=\"withI\">");for (int i = 0; i < pos2.length; i++) {if (i>0) {obj.setPosition(pos2[i]);}}//stateString[] state=string.trim().split("</span> <i class=\"split\"></i> <span>");for (int i = 0; i < state.length; i++) {if (i>0) {obj.setState(state[i].split("</span> </p> ")[0]);}}//agentString agent=string.trim().split("</span><span class=\"managerCompany\">")[0];String[] agent2=agent.trim().split("<i class=\"managerIcon\"></i><span class=\"manager\">");for (int i = 0; i < agent2.length; i++) {if (i>0) {obj.setAgent(agent2[i]);}else {obj.setAgent("暂无");}}//managerCompanyString[] managerCompany=string.trim().split("<span class=\"managerCompany\">");for (int i = 0; i < managerCompany.length; i++) {if (i>0) {String manager2=managerCompany[i].trim().split("</span> </p>")[0];if (manager2.trim().equals("")||manager2.trim().isEmpty()) {obj.setManagerCompany("暂无");}else {obj.setManagerCompany(manager2);}}else {obj.setManagerCompany("暂无");}}//priceString[] price=string.trim().split("<span class=\"num\">");double pr;String numebr = "0";for (int i = 0; i < price.length; i++) {if (i>0) {numebr=price[i].split("</span> <span class")[0].trim();String[] price2=price[i].split("<span class=\"unit\">");if (numebr.equalsIgnoreCase("面议")) {numebr="0";}if (numebr.indexOf(".")==-1 && numebr.length()!=1) {
// System.out.println(numebr);pr=Double.parseDouble(numebr.trim());obj.setPrice(pr);}else {if (numebr.split("\\.")[0].length()<=2) {pr=Double.parseDouble(numebr.trim())*10000;obj.setPrice(pr);}}}}//areString are =string.trim().split("</span> <span class=\"unit\">㎡")[0];String[] are2=are.trim().split(" <p class=\"num\"> <span>");for (int i = 0; i < are2.length; i++) {if (i>0) {obj.setArea(Double.parseDouble(are2[i]));}}//informationString information=string.trim().split("<div class=\"pic\">")[0];String[] src=information.trim().split("a href=");for (int i = 0; i < src.length; i++) {if (i>0) {obj.setInformation(src[i].replace("\"", "").replace(">", ""));}}//tag_wrapString tag=string.trim().split("<!-- 标签处理end -->")[0];String[] tag2=tag.trim().split("<!-- 标签处理start -->");String tagStr = null;for (int i = 0; i < tag2.length; i++) {if (i>0) {String[] s=tag2[1].trim().split("<span class=\"tag-item\" style=\"background:");for (int j = 0; j < s.length; j++) {if (j>0) {String ss=s[j].split("</span>")[0];String sss=ss.split(">")[1];tagStr=sss;}}}}obj.setTag_wrap(tagStr);set2.add(obj);}return set2; }public static HashSet<String> getCollectionsData(String path){List<String>list=new ArrayList<String>();List<String>list2=new ArrayList<String>();HashSet<String> set=new HashSet<String>();HashSet<String> set2=new HashSet<String>();getFiles(path, list);try {list2=readFileFromFilePath(list);} catch (IOException e) {e.printStackTrace();}for (String string : list2) {String[] arr=string.split("<ul id=\"house-list-wrap\" class=\"list-main-style\"> ");for (String string2 : arr) {String str=string2.replace("</ul>","");set.add(str);}}System.out.println("set总数"+set.size());int number=0;//店铺出租总数for (String string : set) {String[] arr=string.split("<li");number=arr.length+number;for (String string2 : arr) {String str=string2.replace("logr=", "<li logr=");set2.add(str);}}
// System.out.println(number);
// System.out.println(set2.size());/*** set总数118* 去重前6311* 去重后3894店铺总数*/return set2;}/*** 边缘校验*/public static void checkFilePath(String path) {if (path.trim().length() == 0 || path == null) {throw new IllegalArgumentException("路径不合法");}}public static void getFiles(String path, List<String> list) {checkFilePath(path);File file = new File(path);if (file.isDirectory()) {File[] files = file.listFiles();for (int i = 0; i < files.length; i++) {if (files[i].isDirectory()) {getFiles(files[i].getPath(), list);} else {list.add(files[i].getPath());}}} else {list.add(file.getPath());}}public static List<String> readFileFromFilePath(List<String> list)throws IOException {FileInputStream fil;BufferedReader bfr = null;StringBuffer sbf = new StringBuffer();List<String> stringData = new ArrayList<String>();for (String string : list) {checkFilePath(string);File file = new File(string);try {bfr = new BufferedReader(new java.io.FileReader(file));String tempStr;while ((tempStr = bfr.readLine()) != null) {sbf.append(tempStr + "\n");}bfr.close();stringData.add(sbf.toString());} catch (FileNotFoundException e) {e.printStackTrace();}}return stringData;}
}
自己解析不仅不优雅还高耦合…恶心
做出来的数据用springboot直接导入mysql了,没用@test 这个junit直接写入,成品舒服
总之免不了的麻烦
数据库如图
其实只需要保定的数据,顺带把北京、石家庄的数据也一起爬了
一共6个表吧
用sql清洗下没洗干净的数据就行了
这里没有用百度或者高德开放的API或者前端框架,直接用Excel专业版的Power Map做的可视化
确实好使,比敲代码来说好多了
用脑子想想ajax响应返回七八个图层的json数据还是麻烦hhhhh溜了