Skip to content

Commit cf9ada0

Browse files
Initial Commit
1 parent fd38ddb commit cf9ada0

File tree

5 files changed

+10082
-0
lines changed

5 files changed

+10082
-0
lines changed

Diff for: 09-LogFileDemo/LogFileDemo.py

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from pyspark.sql import *
2+
from pyspark.sql.functions import regexp_extract, substring_index
3+
4+
if __name__ == "__main__":
    # Count Apache access-log requests per referring site and print the
    # result. Runs on a local Spark session with three worker threads.
    spark = SparkSession \
        .builder \
        .master("local[3]") \
        .appName("LogFileDemo") \
        .getOrCreate()

    try:
        # Read the raw log as text; the line content is addressed below
        # through the "value" column.
        file_df = spark.read.text("data/apache_logs.txt")
        file_df.printSchema()

        # One capture group per log field. Only groups 1 (ip), 4 (date),
        # 6 (request) and 10 (referrer) are used below.
        log_reg = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\S+) "(\S+)" "([^"]*)'

        # NOTE: regexp_extract yields an empty string when the pattern does
        # not match, so malformed lines end up with empty fields rather than
        # being dropped.
        logs_df = file_df.select(
            regexp_extract('value', log_reg, 1).alias('ip'),
            regexp_extract('value', log_reg, 4).alias('date'),
            regexp_extract('value', log_reg, 6).alias('request'),
            regexp_extract('value', log_reg, 10).alias('referrer'),
        )

        # Drop requests whose referrer is "-", cut each referrer down to the
        # part before its third "/" (scheme plus host for a typical URL), and
        # count the rows per resulting value.
        logs_df \
            .where("trim(referrer) != '-' ") \
            .withColumn("referrer", substring_index("referrer", "/", 3)) \
            .groupBy("referrer") \
            .count() \
            .show(100, truncate=False)
    finally:
        # Always release the Spark context, even if reading or processing
        # the log fails part-way through.
        spark.stop()

0 commit comments

Comments
 (0)