fork download
  1. import re
  2. pattern = re.compile(r"""(?x)^
  3. (?P<host>\S+) \s+ # host %h
  4. \S+ \s+ # indent %l (unused)
  5. (?P<user>\S+) \s+ # user %u
  6. \[(?P<time>.*?)\] \s+ # time %t
  7. "\S+\s+(?P<request>[^"?\s]*)[^"]*" \s+ # request "%r"
  8. (?P<status>[0-9]+) \s+ # status %>s
  9. (?P<size>\S+) (?:\s+ # size %b (careful, can be '-')
  10. "(?P<referrer>[^"?\s]*[^"]*)" \s+ # referrer "%{Referer}i"
  11. "(?P<agent>[^"]*)" (?:\s+ # user agent "%{User-agent}i"
  12. "[^"]*" )?)? # optional argument (unused)
  13. $""")
  14.  
  15. def get_structured_access_logs_list(access_logs):
  16. # Initialize required variables
  17. log_data = []
  18. # Get components from each line of the log file into a structured dict
  19. for line in access_logs:
  20. try:
  21. log_data.append(pattern.match(line).groupdict())
  22. except:
  23. pass
  24. return log_data
  25.  
  26. lines = ['83.198.250.175 - - [22/Mar/2009:07:40:06 +0100] "GET /images/ht1.gif HTTP/1.1" 200 61 "http://w...content-available-to-author-only...s.fr/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Wanadoo 6.7; Orange 8.0)" "-"',
  27. '65.33.94.190 - - [05/Apr/2003:17:26:27 -0500] "POST /samples/dem/tt.php?x=e2323 HTTP/1.0" 404 276',
  28. '151.227.152.48 - - [02/Jul/2014:14:35:55 +0100] "GET /css/main.css HTTP/1.1" 200 4658 "http://s...content-available-to-author-only...o.uk/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"',
  29. '10.143.2.119 64.103.161.112 - [06/Jan/1970:00:48:01 +0000] "GET /right_arrow.jpg HTTP/1.1" 304 0 "http://64.103.161.112/index_eth_diag.html" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36"']
  30. for res in get_structured_access_logs_list(lines):
  31. print(res)
  32.  
Success #stdin #stdout 0.02s 27728KB
stdin
Standard input is empty
stdout
{'host': '83.198.250.175', 'request': '/images/ht1.gif', 'referrer': 'http://w...content-available-to-author-only...s.fr/', 'time': '22/Mar/2009:07:40:06 +0100', 'agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Wanadoo 6.7; Orange 8.0)', 'user': '-', 'status': '200', 'size': '61'}
{'host': '65.33.94.190', 'request': '/samples/dem/tt.php', 'referrer': None, 'time': '05/Apr/2003:17:26:27 -0500', 'agent': None, 'user': '-', 'status': '404', 'size': '276'}
{'host': '151.227.152.48', 'request': '/css/main.css', 'referrer': 'http://s...content-available-to-author-only...o.uk/', 'time': '02/Jul/2014:14:35:55 +0100', 'agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36', 'user': '-', 'status': '200', 'size': '4658'}
{'host': '10.143.2.119', 'request': '/right_arrow.jpg', 'referrer': 'http://64.103.161.112/index_eth_diag.html', 'time': '06/Jan/1970:00:48:01 +0000', 'agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36', 'user': '-', 'status': '304', 'size': '0'}