基于Spark的网站日志分析

本文只展示核心代码，完整代码见文末链接。

Web Log Analysis

(1)按日期和内容（video）的ID进行分组，并根据访问次数进行倒序排序。

(2)按日期，内容（video）的ID和省份进行分组，并根据访问次数排名取前3。

最后将（1）和（2）数据写入MySQL。

注意：（1）写入数据库时分partition写入，而非逐条写入。

（2）先filter出公用的df并进行cache

（3）下面代码应该能进一步优化，例如将videoAccessTopNStat的try/catch中生成partition list和StatDAO.inserDayVideoAccessTopN(list)中生成batch应该可以合并，避免两次遍历。

设计和编写思路：

1.设计输入参数args（如inputPath和outputPath）

2.设计转换的工具类，包括StructType（需要提取什么信息，分别是什么格式），parseLog（split并提取各index的信息，用try/catch包裹，设置默认输出）。其中对时间的提取可另外定义一个工具类，包括inputFormat，outputFormat，getTime和parse。而对地域的提取，可另外定义一个IpUtils，引入开源代码ipdatabase。这些工具类写完后都要在自身main方法中测试。最后生成DF。

3.filter出commonDF。

4.实现特定的数据统计

5.输出数据，如果写入MySQL，就另外创建一个StatDAO类，包括获取链接，分批写入数据和release链接。

//Step One:

/**

  * 将原始日志数据进行解析，返回信息包括visit time, url, traffic, ip

  * @param .log, example: 183.162.52.7 - - [10/Nov/2016:00:01:02 +0800]

  * "POST /api3/getadv HTTP/1.1" ...

  * @return partitioned files, example: 1970-01-01 08:00:00\t-

  * \t813\t183.162.52.7

  */

if (args.length != 2) {

  println("Usage: logCleanYarn <inputPath> <outputPath>")

  System.exit(1)

}

val Array(inputPath, outputPath) = args

val spark = SparkSession.builder().getOrCreate()

val access = spark.sparkContext.textFile(inputPath)

//access.take(10).foreach(println)

val splited = access.map(line => {

   val splits = line.split(" ")

   val ip = splits(0)

   val time = splits(3) + " " + splits(4)

   val url = splits(11).replaceAll("\"", "") //remove quotation mark

   val traffic = splits(9)

// (ip, DataUtils.parse(time), url, traffic)

   DataUtils.parse(time) + "\t" + url + "\t" + traffic + "\t" + ip

    })

splited.saveAsTextFile(outputPath)

spark.stop()

/**

  * 用于解析日志时间

  */

object DataUtils {

  //input_format: [10/Nov/2016:00:01:02 +0800]

  val YYYYMMDDHHMM_TIME_FORMAT = FastDateFormat.getInstance("dd/MMM/yyyy:HH:mm:SS Z", Locale.ENGLISH)

  //output_format: yyyy-MM-dd HH:mm:ss

  val TARGET_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")

  def getTime(time: String) = {

    try {

      YYYYMMDDHHMM_TIME_FORMAT.parse(time.substring(time.indexOf("[") + 1, time.lastIndexOf("]"))).getTime

    } catch {

      case _ => 0l

    }

  }

  /**

  * example: [10/Nov/2016:00:01:02 +0800] ==> 2016-11-10 00:01:00

  */

  def parse(time: String) = {

    TARGET_FORMAT.format(new Date(getTime(time)))

  }

//  def main(args: Array[String]): Unit = {

//    println(parse("[10/Nov/2016:00:01:02 +0800]"))

//  }

}

//Step Two:

/**

  * 将第一步解析出来的数据转化为DataFrame，并保存为一份parquet文件。

  */

if (args.length != 2) {

  println("Usage: logCleanYarn <inputPath> <outputPath>")

  System.exit(1)

}

val Array(inputPath, outputPath) = args

val spark = SparkSession.builder().getOrCreate()

val access = spark.sparkContext.textFile(inputPath)

// access.take(10).foreach(println)

val accessDF = spark.createDataFrame(access.map(line => AccessConvertUtil.parseLog(line)), AccessConvertUtil.struct)

// accessDF.printSchema()

// accessDF.show(false)

accessDF.coalesce(1).write.format("parquet").partitionBy("day")

      .save(outputPath)

spark.stop()

/**

  * 工具类，定义了schema和进一步解析log的方法

  */

object AccessConvertUtil {

  val struct = StructType(Seq(

    StructField("url", StringType),

    StructField("cmsType", StringType),

    StructField("cmsId", IntegerType),

    StructField("traffic", IntegerType),

    StructField("ip", StringType),

    StructField("city", StringType),

    StructField("time", StringType),

    StructField("day", StringType)

  ))

  /**

    * 进一步解析log，如转化数据类型，解析网址，ip映射具体省份，最后以Row输出

    */

  def parseLog(log: String) = {

    try{

      val splited = log.split("\t")

      val url = splited(1)

      val traffic = splited(2).toInt

      val ip = splited(3)

      // 网址："http://www.xxx.com/article/101"中article为网页内容，101为article的ID

      val domain = "http://www.xxx.com/"

      val cms = url.substring(url.indexOf(domain) + domain.length)

      val cmsTypeId = cms.split("/")

      var cmsType = ""

      var cmsId = 0

      if (cmsTypeId.length > 1) {

        cmsType = cmsTypeId(0)

        cmsId = cmsTypeId(1).toInt

      }

      val city = IpUtils.getCity(ip)

      val time = splited(0)

      val day = time.substring(0, 10).replaceAll("-", "")

      Row(url, cmsType, cmsId, traffic, ip, city, time, day)

    } catch {

      case _ => {

        Row(null, null, null, null, null, null, null, null)

      }

    }

  }

}

/**

  * Ip工具类，将IP映射为省份，利用开源代码ipdatabase

  * https://github.com/wzhe06/ipdatabase

  */

object IpUtils {

  def getCity(ip: String) = {

    IpHelper.findRegionByIp(ip)

  }

  def main(args: Array[String]): Unit = {

    println(getCity("58.30.15.255"))

  }

}

//Step Three:

/**

  * 在第二步的结果数据中，按日期和video的ID进行分组，并根据访问次数进行倒序排序。

  * 最后将数据写入MySQL。

  */

if (args.length != 2) {

  println("Usage: logCleanYarn <inputPath> <day>")

  System.exit(1)

}

val Array(inputPath, day) = args

val spark = SparkSession.builder()

  .config("spark.sql.sources.partitionColumnTypeInference.enabled", "false")

  .getOrCreate()

val accessDF = spark.read.format("parquet").load(inputPath)

//    accessDF.printSchema()

//    accessDF.show(false)

//预先筛选和cache后面两个函数要复用的df

import spark.implicits._

val commonDF = accessDF.filter($"day" === day && $"cmsType" === "video")

commonDF.cache()

//删除已有的内容，避免重复

StatDAO.deleteData(day)

//groupBy video

videoAccessTopNStat(spark, commonDF)

//groupBy city

cityAccessTopNStat(spark, commonDF)

commonDF.unpersist(true)

//    videoAccessTopDF.show(false)

spark.stop()

/**

  * 两个样例类，用于储存不同数据类型，应用于下面两个方法。

  */

case class DayVideoAccessStat(day: String, cmsId: Long, times: Long)

case class DayCityVideoAccessStat(day: String, cmsId: Long, city: String, times: Long, timesRank: Int)

/**

  * 按内容ID分组后排序，并把结果写到Mysql

  */

def videoAccessTopNStat(spark: SparkSession, comDF: DataFrame): Unit = {

  import spark.implicits._

  val videoAccessTopNStat = comDF

    .groupBy($"day", $"cmsId")

    .agg(count("cmsId").as("times"))

    .orderBy(desc("times"))

  try {

    videoAccessTopNStat.foreachPartition(partitionOfRecords =>{

      val list = new ListBuffer[DayVideoAccessStat]

      partitionOfRecords.foreach(info => {

        val day = info.getAs[String]("day")

        val cmsId = info.getAs[Long]("cmsId")

        val times = info.getAs[Long]("times")

        list.append(DayVideoAccessStat(day, cmsId, times))

      })

      StatDAO.inserDayVideoAccessTopN(list)

    })

  } catch {

    case e:Exception => e.printStackTrace()

  }

}

/**

  * 按内容ID和省份分组后排名，并把结果写到Mysql

  */

def cityAccessTopNStat(spark: SparkSession, comDF: DataFrame): Unit = {

  import spark.implicits._

  val videoAccessTopNStat = comDF

    .groupBy($"day", $"city", $"cmsId")

    .agg(count("cmsId").as("times"))

  val windowSpec = Window.partitionBy($"city").orderBy(desc("times"))

  val videoAccessTopNStatDF = videoAccessTopNStat.select(expr("*"), rank().over(windowSpec).as("times_rank"))

    .filter($"times_rank" <= 3)

  try {

    videoAccessTopNStatDF.foreachPartition(partitionOfRecords => {

      val list = new ListBuffer[DayCityVideoAccessStat]

      partitionOfRecords.foreach(info => {

        val day = info.getAs[String]("day")

        val cmsId = info.getAs[Long]("cmsId")

        val city = info.getAs[String]("city")

        val times = info.getAs[Long]("times")

        val timesRank = info.getAs[Int]("times_rank")

        list.append(DayCityVideoAccessStat(day, cmsId, city, times, timesRank))

      })

      StatDAO.inserDayCityVideoAccessTopN(list)

    })

  } catch {

    case e: Exception => e.printStackTrace()

  }

}

/**

  * 分组后排序方法

  */

def videoAccessSortedStat(spark: SparkSession, accessDF: DataFrame) : Unit = {

  import spark.implicits._

  val sortedStat= accessDF

    .filter($"day" === "20170511" && $"cmsType" === "video")

    .groupBy($"day", $"cmsId")

    .agg(count("cmsId").as("times"))

    .orderBy(desc("times"))

  // 分块创建存储每条信息的list，并调用函数将数据写到到MySQL

  try {

      sortedStat.foreachPartition(partitionOfRecords =>{

        val list = new ListBuffer[DayVideoAccessStat]

        partitionOfRecords.foreach(info => {

          val day = info.getAs[String]("day")

          val cmsId = info.getAs[Long]("cmsId")

          val times = info.getAs[Long]("times")

          list.append(DayVideoAccessStat(day, cmsId, times))

        })

        StatDAO.inserDayVideoAccessSortedStat(list)

      })

   } catch {

  case e:Exception => e.printStackTrace()

 }

}

//Step Three:

/**

  * 工具类，提供两类方法：

  * 1.连接数据库，将数据写入MySQL，并释放连接的方法。

  * 2.删除MySQL中已存在的（相同entry的数据）

  */

object StatDAO {

  def inserDayVideoAccessTopN(list: ListBuffer[DayVideoAccessStat]): Unit = {

    var connection: Connection = null

    var pstmt: PreparedStatement = null

    try{

      connection = MySQLUtils.getConnect()

      val sql = "insert into day_video_access_topn_stat(day, cms_id, times) values (?, ?, ?)"

      val pstmt = connection.prepareStatement(sql)

      connection.setAutoCommit(false)

      for (ele <- list) {

        pstmt.setString(1, ele.day)

        pstmt.setLong(2, ele.cmsId)

        pstmt.setLong(3, ele.times)

        pstmt.addBatch()

      }

      pstmt.executeBatch()

      connection.commit()

    } catch {

      case e:Exception => e.printStackTrace()

    } finally {

      MySQLUtils.release(connection, pstmt)

    }

  }

  def inserDayCityVideoAccessTopN(list: ListBuffer[DayCityVideoAccessStat]): Unit = {

    var connection: Connection = null

    var pstmt: PreparedStatement = null

    try{

      connection = MySQLUtils.getConnect()

      val sql = "insert into day_video_city_access_topn_stat(day, cms_id, city, times, times_rank) values (?, ?, ?, ?, ?)"

      val pstmt = connection.prepareStatement(sql)

      connection.setAutoCommit(false)

      for (ele <- list) {

        pstmt.setString(1, ele.day)

        pstmt.setLong(2, ele.cmsId)

        pstmt.setString(3, ele.city)

        pstmt.setLong(4, ele.times)

        pstmt.setInt(5, ele.timesRank)

        pstmt.addBatch()

      }

      pstmt.executeBatch()

      connection.commit()

    } catch {

      case e:Exception => e.printStackTrace()

    } finally {

      MySQLUtils.release(connection, pstmt)

    }

  }

  def deleteData(day: String): Unit = {

    val tables = Array("day_video_access_topn_stat", "day_video_city_access_topn_stat")

    var connection: Connection = null

    var pstmt: PreparedStatement = null

    try {

      connection = MySQLUtils.getConnect()

      for (table <- tables) {

        val sql = s"delete from $table where day = ?"

        val pstmt = connection.prepareStatement(sql)

        pstmt.setString(1, day)

        pstmt.executeUpdate()

      }

    } catch {

      case e: Exception => e.printStackTrace()

    } finally {

      MySQLUtils.release(connection, pstmt)

    }

  }

}

/**

  * 工具类，包含连接数据库和释放连接的方法。

  */

object MySQLUtils {

  def getConnect() = {

      DriverManager.getConnection("jdbc:mysql://localhost:3306/log_project","root", "password")

  }

  def release(connection: Connection, pstmt: PreparedStatement): Unit ={

    try{

      if (pstmt != null) {

        pstmt.close()

      }

    } catch {

      case e: Exception => e.printStackTrace()

    } finally {

      if (connection != null) {

        connection.close()

      }

    }

  }

  def main(args: Array[String]): Unit = {

    println(getConnect())

  }

}

参考：

大数据 Spark SQL慕课网日志分析

GitHub源码

基于Spark的网站日志分析

Web Log Analysis

基于Spark的网站日志分析的相关教程结束。

相关推荐

详解Redis的慢查询日志

Sourcetrail 代码分析工具的使用

python的日志模块：logging；django的日志系统；django日志输出时间修改

Django 日志输出及打印--logging

八.django日志配置

Django日志器的使用

[转]django 日志logging的配置以及处理

Django日志