网站行为日志信息统计分析
- 一、开发环境
- 二、项目思路
- 三、系统实现
- (一)、原始数据上传hdfs
- (二)、数据清洗(第一遍)
- (三)、数据清洗(第二遍)
- (四)、通过hive对数据分析
- 四、总结
- 五、完整代码:
- (一)、pom.xml文件
- (二)、初始数据清洗:截取需要字段
- (三)、数据清洗:清理网络连接中的资源文件和清洗不完整数据
一、开发环境
(一)、开发环境:
Windows + JDK1.8 + Hadoop-2.9.2 + Eclipse + Linux
(二)、需要的知识:
hdfs、mapreduce、hive、简单正则表达式、用户画像等等
(三)、开发时间:2019年1月
二、项目思路
(一)、将已采集的信息先上传到hdfs上
(二)、通过打标签,对网站进行用指标画像,提取出最能描述网站指标的字段,对网站性能负载进行综合调整、评估、优化!
(三)、根据对网站指标画提取出的特征字段,对数据进行清洗
(四)、分析网站的访问量,跳出率,网络连接状态,单个ip流量的总和等 ,对网站进行研究和分析
三、系统实现
(一)、原始数据上传hdfs
1、原始数据格式
2、上传到hdfs上:
# "hadoop dfs" is deprecated since Hadoop 2.x; "hdfs dfs" is the current CLI
hdfs dfs -put ./access_2015_03_30.log /
(二)、数据清洗(第一遍)
1、利用正则表达式对原始数据处理,提取出想要的字段:
我对网址指标画像的字段是:
ip,time,timeArea,request,url,state,dataSize
/**
 * Extracts the 7 profiled fields from one combined-log-format line, e.g.
 *   27.19.74.143 - - [30/Mar/2015:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
 * Fields returned (in order): ip, time, timeArea, request, url, state, dataSize.
 */
public class LongParser {

    // Capture groups: 1=ip 2=timestamp 3=timezone 4=method 5=url 6=status 7=bytes.
    // Compiled once: Pattern is immutable/thread-safe and compilation is expensive.
    // Fixed: the original class "[-|+]" also matched a literal '|' by accident.
    private static final Pattern LOG_PATTERN = Pattern.compile(
            "([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}) - - "
            + "\\[(.*) ([+-][0-9]{1,4})\\] \"([A-Z]+) (.*) HTTP/1.1\" "
            + "([0-9]*) ([0-9]*)");

    // java.time formatters are thread-safe, unlike SimpleDateFormat.
    private static final DateTimeFormatter IN_FORMAT =
            DateTimeFormatter.ofPattern("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
    private static final DateTimeFormatter OUT_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /**
     * Parses one access-log line.
     *
     * @param line raw log line
     * @return {ip, time, timeArea, request, url, state, dataSize},
     *         with time rewritten as "yyyy-MM-dd HH:mm:ss";
     *         an empty array when the line does not match the log format
     */
    public String[] parser(String line) {
        Matcher matcher = LOG_PATTERN.matcher(line);
        if (!matcher.find()) {
            // Incomplete/garbled line: signal "no fields" to the caller.
            return new String[]{};
        }
        String ip = matcher.group(1);
        String time = matcher.group(2);
        String timeArea = matcher.group(3);
        String request = matcher.group(4);
        String url = matcher.group(5);
        String state = matcher.group(6);
        String dataSize = matcher.group(7);
        try {
            time = LocalDateTime.parse(time, IN_FORMAT).format(OUT_FORMAT);
        } catch (DateTimeParseException ignored) {
            // Keep the raw timestamp if it has an unexpected shape,
            // matching the original's "log and continue" behavior.
        }
        return new String[]{ip, time, timeArea, request, url, state, dataSize};
    }
}
(三)、数据清洗(第二遍)
1、通过mapreduce对已经提取出的字段再次清洗:清理网络连接中的资源文件和清理不完整数据
public class clean {static final String INPUT_PATH = "hdfs://192.168.56.30:9000/access_2015_03_30.log";static final String OUT_PATH = "hdfs://192.168.56.30:9000/user/hive/warehouse/t1";public static void main(String[] args) throws Exception {String str="27.19.74.143 - - [30/Mar/2015:17:38:20 +0800] \"GET /static/image/common/faq.gif HTTP/1.1\" 200 1127";System.out.println("==========================数据清洗============================");String[] parser = new LongParser().parser(str);for (int i = 0; i < parser.length; i++) {System.out.println("字段:"+(i+1)+" : "+parser[i]);}System.out.println("==========================数据清洗============================");Configuration conf = new Configuration();Job job =Job.getInstance(conf,clean.class.getSimpleName());job.setJarByClass(clean.class);//打jar包必须在这一行//文件的输入格式FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));job.setInputFormatClass(TextInputFormat.class);//map序列化job.setMapperClass(MyMapper.class);job.setMapOutputKeyClass(LongWritable.class);job.setMapOutputValueClass(Text.class);//reduce序列化job.setReducerClass(MyReducer.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(LongWritable.class);//文件的输出格式String OUT_DIR =OUT_PATH;FileOutputFormat.setOutputPath(job, new Path(OUT_DIR));job.setOutputFormatClass(TextOutputFormat.class);//判断输出文件是否存在,若存在,则删除deleteOutDir(conf, OUT_DIR);job.waitForCompletion(true);}private static void deleteOutDir(Configuration conf, String OUT_DIR) throws IOException, URISyntaxException {FileSystem fileSystem = FileSystem.get(new URI(OUT_DIR), conf);if(fileSystem.exists(new Path(OUT_DIR))){fileSystem.delete(new Path(OUT_DIR), true);}}public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text>{@Overrideprotected void map(LongWritable key, Text value,org.apache.hadoop.mapreduce.Mapper<LongWritable,Text,LongWritable,Text>.Context context)throws IOException ,InterruptedException {String line = value.toString();String[] parser = new 
LongParser().parser(line);//清理网络连接中的资源文件if (line.contains(".gif")||line.contains("jpg")||line.contains("png")||line.contains(".css")||line.contains(".js")) {return;}//清理不完整数据if(parser.length != 7){return;}Text text = new Text();text.set(parser[0]+"\t"+parser[1]+"\t"+parser[2]+"\t"+parser[3]+"\t"+parser[4]+"\t"+parser[5]+"\t"+parser[6]+"\t");context.write(key, text);}}public static class MyReducer extends Reducer<LongWritable,Text, Text, NullWritable>{@Overrideprotected void reduce(LongWritable arg0, Iterable<Text>text,Reducer<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {// TODO Auto-generated method stubfor (Text value : text) {context.write(value, NullWritable.get());}}}}
(四)、通过hive对数据分析
1、创建表
-- Table over the MapReduce output: one row per cleaned request.
-- Columns mirror LongParser's 7 fields; rows are tab-separated,
-- matching the "\t" delimiter the mapper used when writing.
create table t1(ip String,
time String,
timeArea String,
request String,
url String,
State String,
dataSize int
)row format delimited fields terminated by "\t";
2、pageview:用户的总访问量
-- PV (page views): total number of logged requests.
select count(1) as PV from t1;
3、uv:独立用户(去重)
-- UV (unique visitors): distinct client IPs.
select count(distinct ip) as UV from t1;
4、只浏览了一次就离开的用户
-- One row per IP that made exactly one request (bounce candidates).
select count(1) from t1 group by ip having count(1)=1;
5、只浏览了一次就离开用户的总数
-- Total number of single-request (bounced) IPs.
select count(1) from (select count(1) from t1 group by ip having count(1)=1) nums;
6、所有浏览的总数
-- Request count per IP.
select ip,count(1) as nums from t1 group by ip;
7、跳出率
-- Bounce rate: single-request IPs divided by all IPs.
select sum(case when a.nums=1 then 1 else 0 end)/sum(1)
from(select count(1) as nums from t1 group by ip) a;
结果:7348/21645=0.33947793947793947
跳出率(取精度):round()
-- Bounce rate as a percentage, rounded to 2 decimal places.
select round(sum(case when a.nums=1 then 1 else 0 end)/sum(1)*100,2)
from(select count(1) as nums from t1 group by ip) a;
结果:33.95
跳出率(字符转换):concat()
-- Bounce rate formatted as a percentage string, e.g. "33.95%".
select concat(round(sum(case when a.nums=1 then 1 else 0 end)/sum(1)*100,2),"%")
from(select count(1) as nums from t1 group by ip) a;
结果:33.95%
8、ip浏览量的top100
-- Top 100 IPs by request count.
-- Fixed: "sort by" only orders rows within each reducer, so with multiple
-- reducers "limit 100" would not be a global top-100; "order by" sorts globally.
select ip,count(1) as nums from t1 group by ip order by nums desc limit 100;
9、统计时区
-- Request distribution by client timezone offset.
select timeArea,count(1) from t1 group by timeArea;
10、统计页面热点
-- Top 100 most-requested URLs (page hotspots).
-- Fixed: "sort by" is per-reducer only; "order by" gives a true global top-100.
select url,count(1) as nums from t1 group by url order by nums desc limit 100;
11、网站用户连接状态
-- Distribution of HTTP response status codes.
select state,count(1) as nums from t1 group by state;
12、单个ip流量的总和
-- Top 100 IPs by total bytes transferred.
-- Fixed: "sort by" is per-reducer only; "order by" gives a true global top-100.
select ip,sum(dataSize) as totalSize from t1 group by ip order by totalSize desc limit 100;
四、总结
通过完成此次项目,学到了很多东西!我觉得最难的地方是正则表达式,因为以前用正则表达式较少,所以就没有学习正则表达式,真到用的时候不会,很着急!就只好现学了,通过本次项目,对正则表达式有了新的认识和理解。通过这次项目对用户画像,如何给某一事物打标签有了深刻的了解,同时也对mapreduce这一知识进行了复习掌握,最重要的是对hive的掌握也有了一定程度上的提升!真的是受益颇多!
五、完整代码:
(一)、pom.xml文件
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>Clean</groupId>
  <artifactId>clean</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <dependencies>
    <!-- NOTE(review): the article lists Hadoop 2.9.2 as the dev environment,
         but these dependencies pin 2.2.0 — confirm which version is intended. -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.2.0</version>
    </dependency>
    <!-- Eclipse workaround: tools.jar pulled from a local JDK install. -->
    <dependency>
      <groupId>jdk.tools</groupId>
      <artifactId>jdk.tools</artifactId>
      <version>1.8</version>
      <scope>system</scope>
      <systemPath>D:/java/jdk1.8/lib/tools.jar</systemPath>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.2.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.2.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>1.2.7</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-common -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-common</artifactId>
      <version>2.2.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/log4j/log4j -->
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/junit/junit -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>
初始数据清洗:截取需要字段
(二)、初始数据清洗:截取需要字段
package data;

import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts the 7 profiled fields from one combined-log-format line, e.g.
 *   27.19.74.143 - - [30/Mar/2015:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
 * Fields returned (in order): ip, time, timeArea, request, url, state, dataSize.
 */
public class LongParser {

    // Capture groups: 1=ip 2=timestamp 3=timezone 4=method 5=url 6=status 7=bytes.
    // Compiled once: Pattern is immutable/thread-safe and compilation is expensive.
    // Fixed: the original class "[-|+]" also matched a literal '|' by accident.
    private static final Pattern LOG_PATTERN = Pattern.compile(
            "([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}) - - "
            + "\\[(.*) ([+-][0-9]{1,4})\\] \"([A-Z]+) (.*) HTTP/1.1\" "
            + "([0-9]*) ([0-9]*)");

    // java.time formatters are thread-safe, unlike SimpleDateFormat.
    private static final DateTimeFormatter IN_FORMAT =
            DateTimeFormatter.ofPattern("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
    private static final DateTimeFormatter OUT_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /**
     * Parses one access-log line.
     *
     * @param line raw log line
     * @return {ip, time, timeArea, request, url, state, dataSize},
     *         with time rewritten as "yyyy-MM-dd HH:mm:ss";
     *         an empty array when the line does not match the log format
     */
    public String[] parser(String line) {
        Matcher matcher = LOG_PATTERN.matcher(line);
        if (!matcher.find()) {
            // Incomplete/garbled line: signal "no fields" to the caller.
            return new String[]{};
        }
        String ip = matcher.group(1);
        String time = matcher.group(2);
        String timeArea = matcher.group(3);
        String request = matcher.group(4);
        String url = matcher.group(5);
        String state = matcher.group(6);
        String dataSize = matcher.group(7);
        try {
            time = LocalDateTime.parse(time, IN_FORMAT).format(OUT_FORMAT);
        } catch (DateTimeParseException ignored) {
            // Keep the raw timestamp if it has an unexpected shape,
            // matching the original's "log and continue" behavior.
        }
        return new String[]{ip, time, timeArea, request, url, state, dataSize};
    }
}
(三)、数据清洗:清理网络连接中的资源文件和清洗不完整数据
package data;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * MapReduce job that cleans the raw access log: drops static-resource requests
 * and malformed lines, then writes the 7 parsed fields tab-separated into the
 * Hive warehouse directory for table t1.
 */
public class clean {
    /** Raw log file on HDFS (job input). */
    static final String INPUT_PATH = "hdfs://192.168.56.30:9000/access_2015_03_30.log";
    /** Hive warehouse directory backing table t1 (job output). */
    static final String OUT_PATH = "hdfs://192.168.56.30:9000/user/hive/warehouse/t1";

    public static void main(String[] args) throws Exception {
        // Smoke-test the parser on one known line before launching the job.
        String str = "27.19.74.143 - - [30/Mar/2015:17:38:20 +0800] \"GET /static/image/common/faq.gif HTTP/1.1\" 200 1127";
        System.out.println("==========================数据清洗============================");
        String[] parser = new LongParser().parser(str);
        for (int i = 0; i < parser.length; i++) {
            System.out.println("字段:" + (i + 1) + " : " + parser[i]);
        }
        System.out.println("==========================数据清洗============================");

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, clean.class.getSimpleName());
        job.setJarByClass(clean.class); // required when the job runs from a jar

        // Input format
        FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
        job.setInputFormatClass(TextInputFormat.class);

        // Map output types
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);

        // Job output types — must match MyReducer<.., Text, NullWritable>.
        // Fixed: the original declared LongWritable here, contradicting the reducer.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Output format
        FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
        job.setOutputFormatClass(TextOutputFormat.class);

        // Delete a stale output directory so the job can be rerun.
        deleteOutDir(conf, OUT_PATH);

        // Propagate success/failure to the shell.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /** Removes the output directory if a previous run left it behind. */
    private static void deleteOutDir(Configuration conf, String outDir)
            throws IOException, URISyntaxException {
        FileSystem fileSystem = FileSystem.get(new URI(outDir), conf);
        if (fileSystem.exists(new Path(outDir))) {
            fileSystem.delete(new Path(outDir), true);
        }
    }

    /** Filters resource requests / malformed lines and emits the parsed fields. */
    public static class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        private final Text outValue = new Text(); // reused across calls to avoid churn

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // Drop static-resource requests (images, stylesheets, scripts).
            // Fixed: ".jpg"/".png" now include the dot, like the other extensions.
            if (line.contains(".gif") || line.contains(".jpg") || line.contains(".png")
                    || line.contains(".css") || line.contains(".js")) {
                return;
            }
            String[] fields = new LongParser().parser(line);
            // Drop lines that did not yield all 7 fields.
            if (fields.length != 7) {
                return;
            }
            // Trailing tab kept so the on-disk format matches the original output.
            outValue.set(String.join("\t", fields) + "\t");
            context.write(key, outValue);
        }
    }

    /** Writes each cleaned line as the key with no value. */
    public static class MyReducer extends Reducer<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void reduce(LongWritable offset, Iterable<Text> lines, Context context)
                throws IOException, InterruptedException {
            for (Text line : lines) {
                context.write(line, NullWritable.get());
            }
        }
    }
}