星座屋(http://www.xzw.com/fortune/)运势界面:
最终爬取数据结果展示在APP上的效果:
下面就是使用正则实现的代码，是我一年多前花了半天时间写的。现在想来，如果使用 Scrapy 或者 phpspider，只用几行代码就能搞定，不用这么费力气了~
<?php
/**
 * Constellation (zodiac) fortune endpoint.
 * author: pengfei
 *
 * Source pages, by category number:
 *   http://www.xzw.com/fortune/aries/        0  today
 *   http://www.xzw.com/fortune/aries/1.html  1  tomorrow
 *   http://www.xzw.com/fortune/aries/2.html  2  this week
 *   http://www.xzw.com/fortune/aries/3.html  3  this month
 *   http://www.xzw.com/fortune/aries/4.html  4  this year
 *   http://www.xzw.com/fortune/aries/5.html  5  love
 *
 * Request parameters:
 *   xingzuo   sign slug, must be one of $all_xingzuo
 *   category  integer 0..5, treated as 0 (today) when missing or empty
 */
define('IN_FTE', true);
require(dirname(__FILE__) . '/includes/init.php');
date_default_timezone_set('Asia/Shanghai');
$json = new JSON();

/*
 * Reference table kept from the original source (not used at runtime):
 * sign => slug, birthday range (MM/DD-MM/DD)
 *   Aries       => aries,       03/21-04/19
 *   Taurus      => taurus,      04/20-05/20
 *   Gemini      => gemini,      05/21-06/21
 *   Cancer      => cancer,      06/22-07/22
 *   Leo         => leo,         07/23-08/22
 *   Virgo       => virgo,       08/23-09/22
 *   Libra       => libra,       09/23-10/23
 *   Scorpio     => scorpio,     10/24-11/22
 *   Sagittarius => sagittarius, 11/23-12/21
 *   Capricorn   => capricorn,   12/22-01/19
 *   Aquarius    => aquarius,    01/20-02/18
 *   Pisces      => pisces,      02/19-03/20
 */

// Request values may be arrays (e.g. "xingzuo[]=x"); only strings are
// acceptable here, otherwise trim() would raise a TypeError on PHP 8.
$constellation = null;
if (isset($_REQUEST['xingzuo']) && is_string($_REQUEST['xingzuo'])) {
    $constellation = trim($_REQUEST['xingzuo']);
}

// A missing or empty category means "today" (0). Non-scalar input is mapped
// to -1 so that it fails the whitelist check below.
$rawCategory = isset($_REQUEST['category']) ? $_REQUEST['category'] : 0;
$category = is_scalar($rawCategory) ? intval($rawCategory) : -1;

$all_xingzuo = array(
    'aries', 'taurus', 'gemini', 'cancer', 'leo', 'virgo',
    'libra', 'scorpio', 'sagittarius', 'capricorn', 'aquarius', 'pisces'
);
$all_category = array(0, 1, 2, 3, 4, 5);

// Strict whitelisting: both values are later used to build a remote URL and
// a local cache file name, so nothing outside these lists may pass.
if (!in_array($constellation, $all_xingzuo, true) || !in_array($category, $all_category, true)) {
    exit('Params error');
}

// Category 0 lives at the directory URL; 1..5 live at "<n>.html".
$domain = 'http://www.xzw.com/fortune/';
$apiUrl = $domain.$constellation.'/';
if ($category) {
    $apiUrl .= $category.'.html';
}

// NOTE(review): the script answers with JSON; "application/json" would be the
// accurate type, but the header is left unchanged to avoid affecting clients.
header("Content-type: text/html; charset=utf-8");
/**
 * Download a fortune page and extract its rating list and text sections.
 *
 * The page is converted from GBK to UTF-8 before parsing (the code assumes
 * the remote page is GBK-encoded).
 *
 * @param string $url Page URL (or any path readable by file_get_contents).
 * @return array Empty array when the page cannot be fetched or nothing can be
 *               parsed; otherwise array('ul' => rows, 'cont' => rows) where
 *               every row is array('label' => string, 'value' => string).
 */
function getFortuneData($url){
    $html = file_get_contents($url);
    if ($html === false || $html === '') {
        // Let callers take their "fetch failed" branch via empty($data).
        return array();
    }
    $html = mb_convert_encoding($html, 'utf-8', 'gbk');

    $ratings = parseFortuneRatings($html);
    $sections = parseFortuneSections($html);
    if (empty($ratings) && empty($sections)) {
        return array();
    }

    return array('ul' => $ratings, 'cont' => $sections);
}

/**
 * Extract the "label: value" rows found in div.c_main > dl > dd > ul.
 *
 * @param string $html UTF-8 page markup.
 * @return array List of array('label' => ..., 'value' => ...); empty when the
 *               expected markup is not present.
 */
function parseFortuneRatings($html){
    // Drill down one container at a time; give up as soon as a level is missing.
    $patterns = array(
        '/<div class="c_main">(.*)<\/div>/ism',
        '/<dl>(.*?)<\/dl>/ism',
        '/<dd>(.*?)<\/dd>/ism',
        '/<ul>(.*?)<\/ul>/ism'
    );
    $fragment = $html;
    foreach ($patterns as $pattern) {
        if (!preg_match($pattern, $fragment, $match)) {
            return array();
        }
        $fragment = $match[1];
    }

    // Turn each <label> into a split marker and drop the wrapper tags.
    $fragment = str_replace('<label>', '{label}', $fragment);
    $fragment = preg_replace('/<span[^>]*?>/ism', '', $fragment);
    $fragment = preg_replace('/<li[^>]*>/ism', '', $fragment);
    $fragment = preg_replace('/<\/label>/ism', '', $fragment);
    $fragment = preg_replace('/<\/li>/ism', '', $fragment);

    $chunks = explode('{label}', $fragment);
    array_shift($chunks); // text before the first label is not a row

    $rows = array();
    foreach ($chunks as $chunk) {
        // Split on the FIRST colon only, so the colon inside style="width:.."
        // (or inside a value) no longer produces extra pieces. The full-width
        // colon (U+FF1A, bytes EF BC 9A) is accepted as well, since the live
        // page presumably uses it - confirm against the site.
        $parts = preg_split("/:|\xEF\xBC\x9A/", $chunk, 2);
        $label = $parts[0];
        $value = isset($parts[1]) ? $parts[1] : '';

        if (preg_match('/<em style="width:(\d{1,}).*">/ism', $chunk, $width)) {
            // Bar-style rating: width / 80, formatted with two decimals.
            // (80 is presumably the full bar width in pixels - TODO confirm.)
            $rows[] = array(
                'label' => preg_replace('/<em[^>]*?>/ism', '', $label),
                'value' => sprintf('%0.2f', floatval(intval($width[1]) / 80))
            );
        } else {
            $rows[] = array('label' => $label, 'value' => $value);
        }
    }

    return $rows;
}

/**
 * Extract the text paragraphs of div.c_cont as label/value rows.
 *
 * Each <p> becomes one row: the text before its <span> is the label, the
 * text inside the <span> is the value. All whitespace is removed.
 *
 * @param string $html UTF-8 page markup.
 * @return array List of array('label' => ..., 'value' => ...); empty when the
 *               container is not present.
 */
function parseFortuneSections($html){
    if (!preg_match('/<div class="c_cont">(.*?)<\/div>/ism', $html, $cont)) {
        return array();
    }

    $text = preg_replace('/<strong[^>]*?>/ism', '', $cont[1]);
    $text = str_replace('<span>', '{span}', $text);
    $text = str_replace('<p>', '{p}', $text);
    $text = preg_replace("'<[/!]*?[^<>]*?>'si", '', $text); // strip every remaining tag
    $text = preg_replace('/\s+/', '', $text);

    $paragraphs = explode('{p}', $text);
    array_shift($paragraphs); // text before the first <p> is not a row

    $rows = array();
    foreach ($paragraphs as $paragraph) {
        $parts = explode('{span}', $paragraph);
        $rows[] = array(
            'label' => $parts[0],
            'value' => isset($parts[1]) ? $parts[1] : ''
        );
    }

    return $rows;
}

$write_result = ''; // cache write status; empty by default, meaning nothing was written
// ---------------------------------------------------------------------------
// Serve from the per-day file cache when possible, otherwise scrape the page,
// refresh the cache and answer with JSON. Every path ends in exit().
// ---------------------------------------------------------------------------
$local_data = '';
$result = array();
$fileName = !empty($category) ? $constellation.'-'.$category.'.php' : $constellation.'.php';
$fortune_data_path = 'fortune_data/'.$fileName;

// A cache file is only valid if it was written today (after local midnight).
$todayStart = mktime(0, 0, 0, date("m"), date("d"), date("Y"));

$data = null;
if (file_exists($fortune_data_path) && filemtime($fortune_data_path) >= $todayStart) {
    $local_data = @file_get_contents($fortune_data_path);
    if (!empty($local_data)) {
        // "@": a truncated or corrupt file must not print a notice into the
        // JSON response; it is simply treated as a cache miss below.
        $cached = @unserialize($local_data);
        if (is_array($cached) && !empty($cached)) {
            $data = $cached;
        }
    }
}

if ($data === null) {
    // No usable cache: scrape the remote page.
    $data = getFortuneData($apiUrl);

    if (empty($data)) {
        // Report the failure WITHOUT touching the cache file, so a failed
        // scrape is not stored and served as "success" for the rest of the day.
        $result['result'] = -1;
        $result['msg'] = '数据抓取失败!';
        $result['write_result'] = $write_result;
        $result['data'] = array();
        exit($json->encode($result));
    }

    $write_result = write_fortune_cache($data, $fileName);
}

$result['result'] = 0;
$result['msg'] = 'success';
$result['write_result'] = $write_result;
$result['data'] = $data;
exit($json->encode($result));

/**
 * Store scraped fortune data in the file cache.
 *
 * The data is serialized and written to ./fortune_data/<fileName>. A failure
 * to open or write the file is reported through the return value instead of
 * aborting the request, so the caller can still deliver the scraped data.
 *
 * @param mixed  $data     Data to cache (stored with serialize()).
 * @param string $fileName Cache file name inside ./fortune_data/.
 * @return string 'success' when at least one byte was written, else 'fail'.
 */
function write_fortune_cache($data, $fileName){
    // LOCK_EX keeps concurrent requests from interleaving their writes.
    $bytes = file_put_contents('./fortune_data/'.$fileName, serialize($data), LOCK_EX);

    return ($bytes !== false && $bytes > 0) ? 'success' : 'fail';
}
?>
End