创新性
选取题材,政府网站访问数据。想通过使用人工智能的方法,根据已有数据建立模型,对新的访问数据进行鉴别,区分网站访问是否为正常访问,从而进行一些业务处理。更进一步,预测哪些是危险访问(网络攻击),从而能及时做出一些策略应对,保障政府网站的安全,保障正常网民的访问。
实验完整性
数据描述
相关约定
- 此处正常访问,定义为网民的正常访问,其余皆为垃圾访问
- 垃圾访问可能来自网站扫描(死链检测等)、网页抓取、恶意攻击等,都不算是正常的访问数据
- 以一个月某ip访问是否超过3000次(30天,每天100次)为临界值,判定是否为正常有效的访问,高于3000,认为是垃圾访问。
数据及来源
- 选取的是某市政府网站访问数据。数据库总数据量6000多万,其中2019年3月份数据320万左右。
- 现在只取3月1日-10日的数据中,session id尾号为1的,4万2000多条
数据处理
- 把访问记录的一些维度,通过sql脚本数据处理,转为数字,方便构造测试样本
- 原始数据:导出的mysql脚本 huainan_guest_statistic190310.sql
- 超过3000的ip:ip3000.xls
- 处理过程:数据处理1.sql 数据处理2.sql
-- 根据统计出的一月访问量超过3000的认为是垃圾数据,对数据加标签
UPDATE cms_site_chart_main_copy1 set IS_GOOD = 1;
UPDATE cms_site_chart_main_copy1 set IS_GOOD = 0 where ip in ('45.194.128.134','220.178.96.24','121.41.29.248','183.166.59.35','120.78.131.13','59.111.104.146','183.166.61.6','183.166.61.1','203.207.121.14','60.171.44.29','58.242.245.178','59.111.104.158','183.166.61.13','203.207.121.3','203.207.121.13','218.22.156.219','183.166.61.2','203.207.121.8','59.111.104.147','112.26.205.196','43.249.175.170','114.255.59.101','218.22.163.50','183.166.61.7','203.207.121.9','203.207.121.1','119.23.27.3','59.111.104.49','210.45.123.30','183.166.61.10','203.207.121.5','203.207.121.2','59.111.104.115','59.111.104.137','183.166.59.149','60.166.58.227','59.111.104.134','183.166.61.3','60.21.221.203','119.23.40.231','59.111.104.138','59.111.104.75','203.207.121.11','183.166.61.14','59.111.104.135','59.111.104.44','183.166.61.5','220.178.199.10','59.111.104.132','120.77.240.164','59.111.104.14','59.111.104.148','221.231.138.216','183.166.79.204','203.207.121.7','58.242.245.179','183.166.61.8','58.242.245.180','220.178.96.26','183.166.24.222','220.178.96.25','218.22.161.170','111.39.184.24','59.111.104.127');-- 处理is_pc字段,转为0,1
UPDATE cms_site_chart_main_copy1 set IS_PC = 1 where IS_PC = 'true';
UPDATE cms_site_chart_main_copy1 set IS_PC = 0 where IS_PC = 'false' or IS_PC is null or IS_PC = '';-- 处理国家字段,转为数字
SELECT COUNTRY,count(1) FROM cms_site_chart_main_copy1 GROUP BY COUNTRY;
UPDATE cms_site_chart_main_copy1 set COUNTRY = '-1' where COUNTRY is null or COUNTRY = '';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '0' where COUNTRY = '未分配或者内网IP';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '1' where COUNTRY = '中国';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '2' where COUNTRY = '俄罗斯';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '3' where COUNTRY = '加拿大';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '4' where COUNTRY = '印度';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '5' where COUNTRY = '巴拿马';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '6' where COUNTRY = '新加坡';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '7' where COUNTRY = '日本';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '8' where COUNTRY = '柬埔寨';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '9' where COUNTRY = '比利时';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '10' where COUNTRY = '沙特阿拉伯';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '11' where COUNTRY = '泰国';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '12' where COUNTRY = '爱尔兰';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '13' where COUNTRY = '美国';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '14' where COUNTRY = '英国';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '15' where COUNTRY = '荷兰';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '16' where COUNTRY = '菲律宾';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '17' where COUNTRY = '越南';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '18' where COUNTRY = '韩国';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '19' where COUNTRY = '香港';
UPDATE cms_site_chart_main_copy1 set COUNTRY = '20' where COUNTRY = '马来西亚'; -- 处理语言
SELECT `LANGUAGE`,count(1) FROM cms_site_chart_main_copy1 GROUP BY `LANGUAGE` HAVING count(1) > 10;
UPDATE cms_site_chart_main_copy1 set `LANGUAGE` = '-1' where `LANGUAGE` is null or `LANGUAGE` = '' or `LANGUAGE` not in ('en-US','zh-CN','zh-Hans-CN');
UPDATE cms_site_chart_main_copy1 set `LANGUAGE` = '0' where `LANGUAGE` = 'en-US';
UPDATE cms_site_chart_main_copy1 set `LANGUAGE` = '1' where `LANGUAGE` = 'zh-CN';
UPDATE cms_site_chart_main_copy1 set `LANGUAGE` = '2' where `LANGUAGE` = 'zh-Hans-CN';-- 处理省份
SELECT PROVINCE,count(1) FROM cms_site_chart_main_copy1 GROUP BY `PROVINCE` HAVING count(1) > 20;
UPDATE cms_site_chart_main_copy1 set PROVINCE = '-1' where PROVINCE is null or PROVINCE = '';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '0' where PROVINCE = '上海市';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '2' where PROVINCE = '内蒙古自治区';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '2' where PROVINCE = '北京市';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '3' where PROVINCE = '四川省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '4' where PROVINCE = '天津市';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '5' where PROVINCE = '安徽省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '6' where PROVINCE = '山东省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '7' where PROVINCE = '山西省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '8' where PROVINCE = '广东省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '9' where PROVINCE = '江苏省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '10' where PROVINCE = '江西省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '11' where PROVINCE = '河北省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '12' where PROVINCE = '河南省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '13' where PROVINCE = '浙江省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '14' where PROVINCE = '湖北省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '15' where PROVINCE = '湖南省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '16' where PROVINCE = '福建省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '17' where PROVINCE = '贵州省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '18' where PROVINCE = '辽宁省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '19' where PROVINCE = '重庆市';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '20' where PROVINCE = '陕西省';
UPDATE cms_site_chart_main_copy1 set PROVINCE = '21' where PROVINCE = '黑龙江省';-- 处理信息来源
SELECT SOURCE_TYPE,count(1) FROM cms_site_chart_main_copy1 GROUP BY `SOURCE_TYPE` HAVING count(1) > 20;
UPDATE cms_site_chart_main_copy1 set SOURCE_TYPE = '0' where SOURCE_TYPE = '其他外部链接';
UPDATE cms_site_chart_main_copy1 set SOURCE_TYPE = '1' where SOURCE_TYPE = '标签或浏览器输入地址';
UPDATE cms_site_chart_main_copy1 set SOURCE_TYPE = '2' where SOURCE_TYPE = '站内链接';-- 处理操作系统
SELECT OS,count(1) FROM cms_site_chart_main_copy1 GROUP BY `OS` HAVING count(1) > 20;
UPDATE cms_site_chart_main_copy1 set OS = '0' where OS = 'Linux';
UPDATE cms_site_chart_main_copy1 set OS = '1' where OS = 'Mac';
UPDATE cms_site_chart_main_copy1 set OS = '2' where OS = 'Windows';
UPDATE cms_site_chart_main_copy1 set OS = '3' where OS = 'Windows 10';
UPDATE cms_site_chart_main_copy1 set OS = '4' where OS = 'Windows 2003';
UPDATE cms_site_chart_main_copy1 set OS = '5' where OS = 'Windows 7';
UPDATE cms_site_chart_main_copy1 set OS = '6' where OS = 'Windows 8';
UPDATE cms_site_chart_main_copy1 set OS = '7' where OS = 'Windows XP';
UPDATE cms_site_chart_main_copy1 set OS = '8' where OS = 'Windows+7';
UPDATE cms_site_chart_main_copy1 set OS = '9' where OS = 'Windows7';
UPDATE cms_site_chart_main_copy1 set OS = '10' where OS = 'X11';-- 导出结果和需要的维度
SELECT IS_GOOD,COUNTRY,PROVINCE,SOURCE_TYPE,OS,COLOR_DEPTH,`LANGUAGE`,IS_PC FROM cms_site_chart_main_copy1
UPDATE cms_site_chart_main_copy1 set IS_GOOD = -1 where IS_GOOD = 0;
UPDATE cms_site_chart_main_copy1 set IS_GOOD = '+1' where IS_GOOD = 1;-- 查出少于20的,舍弃
SELECT PROVINCE,count(1) FROM cms_site_chart_main_copy1 GROUP BY `PROVINCE` HAVING count(1) <= 20;
DELETE FROM cms_site_chart_main_copy1 WHERE PROVINCE in ('云南省','吉林省','宁夏回族自治区','广西壮族自治区','新疆维吾尔自治区','海南省','甘肃省','西藏自治区','青海省','香港特别行政区')SELECT CONCAT(IS_GOOD,' ', '1:',COUNTRY,' 2:',PROVINCE,' 3:',SOURCE_TYPE,' 4:',OS,' 5:',COLOR_DEPTH,' 6:',`LANGUAGE`,' 7:',IS_PC) FROM cms_site_chart_main_copy1 LIMIT 40000;SELECT CONCAT(IS_GOOD,' ', '1:',COUNTRY,' 2:',PROVINCE,' 3:',SOURCE_TYPE,' 4:',OS,' 5:',COLOR_DEPTH,' 6:',`LANGUAGE`,' 7:',IS_PC) FROM cms_site_chart_main_copy1 ORDER BY ID desc LIMIT 1000;
- 处理之后的数据:mysql脚本文件 cms_site_chart_main_copy1.sql 和excel文件的处理好的数据.xls
- 处理之后的数据文件:
训练数据:trainfile
测试数据:testfile
实验条件
- Centos7
- Windows7
- Mysql
- libsvm
- Python
- gnuplot
实验过程
支持向量机 SVM(多组实验)
使用libsvm库(中国 台湾 林智仁教授封装的SVM库)
支持向量机(support vector machine,SVM)是目前最好的有监督学习算法之一。
实验结果
cd /home/yyl/libsvm-3.24
./svm-train trainfile
./svm-predict -q testfile trainfile.model outputs1
实验分析
PPT展示
下载地址:https://download.csdn.net/download/u010882234/12122680
参考
- https://www.cnblogs.com/mfmdaoyou/p/7258707.html
- https://www.csie.ntu.edu.tw/~cjlin/
- http://blog.sciencenet.cn/blog-713101-797917.html
- http://www.360doc.com/content/18/0516/15/54605916_754421513.shtml
- https://www.cnblogs.com/leezx/p/5668702.html