1.日志部分内容:
66.249.79.35 - - [14/Jun/2018:06:45:24 +0000] "GET /img/20180504/702434-20180302101540805-554506523.jpg HTTP/1.1" 200 10013 "-" "Googlebot-Image/1.0"
66.249.79.35 - - [14/Jun/2018:06:45:25 +0000] "GET /img/20180504/702434-20180302161346635-1714710787.jpg HTTP/1.1" 200 45157 "-" "Googlebot-Image/1.0"
66.249.79.35 - - [14/Jun/2018:06:45:32 +0000] "GET /img/2018/05/21/89993124.jpg HTTP/1.1" 200 42160 "-" "Googlebot-Image/1.0"
66.249.79.35 - - [14/Jun/2018:06:45:32 +0000] "GET /archives/148618 HTTP/1.1" 200 8932 "-" "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
54.36.148.126 - - [14/Jun/2018:06:45:33 +0000] "GET /archives/91429 HTTP/1.1" 200 8223 "-" "Mozilla/5.0 (compatible; AhrefsBot/5.2; +http://ahrefs.com/robot/)"
54.36.149.31 - - [14/Jun/2018:06:45:34 +0000] "GET /?s=Community HTTP/1.1" 200 6741 "-" "Mozilla/5.0 (compatible; AhrefsBot/5.2; +http://ahrefs.com/robot/)"
66.249.79.35 - - [14/Jun/2018:06:45:40 +0000] "GET /img/20180505/1018770-20180131142516171-907427428.jpg HTTP/1.1" 200 8652 "-" "Googlebot-Image/1.0"
5.255.250.200 - - [14/Jun/2018:06:45:46 +0000] "GET /robots.txt HTTP/1.1" 200 445 "-" "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
66.249.79.35 - - [14/Jun/2018:06:45:46 +0000] "GET /archives/148211 HTTP/1.1" 200 8514 "-" "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
66.249.79.35 - - [14/Jun/2018:06:45:48 +0000] "GET /img/2018/05/20/1339446-20180517152850212-272519877.jpg HTTP/1.1" 200 124550 "-" "Googlebot-Image/1.0"
220.181.108.147 - - [14/Jun/2018:06:45:52 +0000] "GET /img/20180407/592104-20180302134147548-901544498.jpg HTTP/1.1" 404 22994 "-" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36"
66.249.79.35 - - [14/Jun/2018:06:45:56 +0000] "GET /img/2018/05/21/60662344.jpg HTTP/1.1" 200 14133 "-" "Googlebot-Image/1.0"
66.249.79.35 - - [14/Jun/2018:06:46:00 +0000] "GET /archives/119633 HTTP/1.1" 200 9306 "-" "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
54.36.148.129 - - [14/Jun/2018:06:46:01 +0000] "GET /archives/91007 HTTP/1.1" 200 8332 "-" "Mozilla/5.0 (compatible; AhrefsBot/5.2; +http://ahrefs.com/robot/)"
54.36.148.201 - - [14/Jun/2018:06:46:03 +0000] "GET /archives/88741/feed HTTP/1.1" 200 983 "-" "Mozilla/5.0 (compatible; AhrefsBot/5.2; +http://ahrefs.com/robot/)"
5.255.250.200 - - [14/Jun/2018:06:46:03 +0000] "GET /archives/87084 HTTP/1.1" 200 9951 "-" "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
2. 使用 Spark 对日志进行清洗和统计,并把结果插入 MySQL
package com.codeblogbt

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

import scala.collection.mutable.ListBuffer
import scala.util.Try

/**
 * Spark job that cleans an Apache access log, counts visits to `/archives/<id>` pages
 * per crawler, and hands the per-robot totals to `MysqlAction.insertTopRobot`.
 *
 * `VisitInfo`, `RobotVisitInfo`, `DateUtils` and `MysqlAction` are declared elsewhere in
 * the project and are used here exactly as the original job used them.
 */
object SparkStatFormatJob {

  /** A line must have at least this many space-separated fields (indexes 0 to 9 are read). */
  private val MinFieldCount = 10

  /**
   * Entry point: reads the log, builds the per-robot visit counts and writes them out.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SparkStatFormatJob")
      .master("local[2]").getOrCreate()

    // Stop the session even when a stage fails, so the local Spark context is not leaked.
    try {
      // val access = spark.sparkContext.textFile("hdfs://localhost:9000/user/walle/access.log.3")
      val access = spark.sparkContext.textFile("file:///Users/walle/Documents/D2/log/apache2/access.log.3")

      // Cheap textual pre-filter; the URL itself is validated again while parsing.
      val archiveLines = access.filter(line => line.contains("archives"))

      import spark.implicits._

      // Malformed lines and lines without a usable archive id yield None and are dropped.
      val visitDF = archiveLines.flatMap(line => parseVisit(line)).toDF()

      val resultDF = visitDF.groupBy("robotId").agg(count("archivesId").as("id_count"))

      // Lookup table mapping the numeric robot id to a display name (0 = no known crawler).
      val robotDF = Seq((0, "people"), (1, "Yandex"), (2, "Ahrefs"), (3, "Google"), (4, "Baidu"), (5, "ia_archiver"))
        .toDF("id", "robot_name")

      val joinDF = resultDF.join(robotDF, resultDF.col("robotId") === robotDF.col("id"))

      joinDF.foreachPartition(partitionOfRecords => {
        val list = new ListBuffer[RobotVisitInfo]
        partitionOfRecords.foreach(info => {
          val id = info.getAs[Int]("id")
          val robotName = info.getAs[String]("robot_name")
          val visitCount = info.getAs[Long]("id_count")
          list.append(RobotVisitInfo(id, robotName, visitCount))
        })
        MysqlAction.insertTopRobot(list)
      })
    } finally {
      spark.stop()
    }
  }

  /**
   * Parses one access-log line into a `VisitInfo`.
   *
   * @param line raw log line, fields separated by single spaces
   * @return `None` when the line has fewer than [[MinFieldCount]] fields, when the request
   *         URL does not end in a non-zero integer archive id, or when the status code or
   *         byte count is not numeric; otherwise the parsed record
   */
  private def parseVisit(line: String): Option[VisitInfo] = {
    val splits = line.split(" ")
    if (splits.length < MinFieldCount) {
      None
    } else {
      for {
        archivesId <- parseArchivesId(splits(6)) if archivesId != 0
        statusCode <- Try(splits(8).toInt).toOption
        traffic <- parseTraffic(splits(9))
      } yield {
        val ip = splits(0)
        // Fields 3 and 4 together form the bracketed timestamp, e.g. "[14/Jun/2018:06:45:24 +0000]".
        val time = splits(3) + " " + splits(4)
        VisitInfo(ip, DateUtils.parse(time), archivesId, statusCode, traffic, detectRobotId(line))
      }
    }
  }

  /**
   * Extracts the archive id from a request URL.
   *
   * @param url request path, e.g. `/archives/148618`
   * @return the integer after the last `/` when the URL contains "archives" and that
   *         segment is a valid `Int`; `None` otherwise (e.g. `/archives/88741/feed`)
   */
  private def parseArchivesId(url: String): Option[Int] =
    if (url.contains("archives")) Try(url.substring(url.lastIndexOf('/') + 1).toInt).toOption
    else None

  /**
   * Parses the response-size field.
   *
   * @param field raw byte-count field
   * @return `Some(0)` for "-" (Apache's `%b` writes "-" when no bytes were sent), the parsed
   *         value for a numeric field, `None` for anything else
   */
  private def parseTraffic(field: String): Option[Long] =
    if (field == "-") Some(0L) else Try(field.toLong).toOption

  /**
   * Maps a log line to a robot id by substring match, checked in this order:
   * Googlebot = 3, Baiduspider = 4, Yandex = 1, ahrefs = 2, ia_archiver = 5, otherwise 0.
   *
   * @param line raw log line (the whole line is searched, not only the user-agent field)
   * @return the robot id, matching the ids of the lookup table built in `main`
   */
  private def detectRobotId(line: String): Int =
    if (line.contains("Googlebot")) 3
    else if (line.contains("Baiduspider")) 4
    else if (line.contains("Yandex")) 1
    else if (line.contains("ahrefs")) 2
    else if (line.contains("ia_archiver")) 5
    else 0
}
3. 使用 ECharts 对 MySQL 中的结果进行可视化展示
点击查看更多内容
为 TA 点赞
评论
共同学习,写下你的评论
评论加载中...
作者其他优质文章
正在加载中
感谢您的支持,我会继续努力的~
扫码打赏,你说多少就多少
赞赏金额会直接到老师账户
支付方式
打开微信扫一扫,即可进行扫码打赏哦