Add some working metrics for log analyser project (#1983)
* adds folder for log analyser project

* adds code for input and log segregation

* Adds open handles and read pattern metrics

* Adds usage info, .txt support and minor changes

* Adds stat/list call metric

* Adds closed handles metric

* Adds structured code

* Adds output module

* minor fix

* Adds csv generation function

* Adds some structural changes

* Deletes extra_log_info.go

* Minor changes

* Changed output to gsheets

* Minor changes

* Adds write patterns, GCS read patterns, GCS call stats for top files

* Adds JSON support for GKE logs

* Minor fixes

* Adds support for files which contain logs with decreasing timestamp

* Adds some documentation

---------

Co-authored-by: Ankita Luthra <lankita@google.com>
pat-vish and ankitaluthra1 authored Jul 15, 2024
1 parent 3525748 commit 2a07dc3
Showing 14 changed files with 1,205 additions and 0 deletions.
54 changes: 54 additions & 0 deletions tools/log_analyser/README.md
@@ -0,0 +1,54 @@
This log analyzer takes a directory of log files and a few filters, and outputs the analysis to a Google Sheet.

Install Python if it is not already installed.

Create a Python virtual environment using the following steps
(replace `python3` with `python` if you don't have `python3`):

1. Install python3-venv: `sudo apt install python3-venv`

2. Run `python3 -m venv /path`, replacing `/path` with the location where you want to create the virtual environment (preferably outside the repo, to avoid creating unwanted files).
3. Activate the environment with `source venv_name/bin/activate`, replacing `venv_name` with the location where you created the environment.

(Deactivate the environment with the command `deactivate` once you finish running the code.)



Install numpy: `pip install numpy`

Install gspread: `pip install gspread`


Run the code with `python3 main.py` (or `python main.py` if `python3` is not available).

Enter the absolute path of the directory that contains the log files,
for example `/usr/local/google/home/patelvishvesh/tmp/test_dir`

Make sure the directory contains only files, not folders.

You can also place a zip file inside the directory. The zip should contain only files, not folders.


Choose whether you want a time-window filter (by entering y/n)

If yes, enter the start and end times (epoch)

Enter the type of logs (gke/gcsfuse)

If you chose gke, enter the format of the logs (CSV/JSON)

Enter your LDAP (to grant access to the created sheet)

For example: `patelvishvesh`


Enter the path to the credentials file

For example: `/usr/local/google/home/patelvishvesh/Downloads/credentials.json`

After this, a Google Sheet link will be generated.
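
For reference, the end-to-end flow mirrors `main.py` from this commit; a minimal sketch of driving the analyser from Python (the prompts above are still asked interactively):

```python
from inputreader.user_input import UserInput
from parser import log_parser
from outputgenerator import generate_gsheet

user_input_obj = UserInput()
logs = user_input_obj.get_input()                    # interactive prompts
global_data = log_parser.general_parser(logs)        # compute the metrics
generate_gsheet.main_gsheet_generator(global_data)   # create the Google Sheet
```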

193 changes: 193 additions & 0 deletions tools/log_analyser/inputreader/get_logs.py
@@ -0,0 +1,193 @@
import zipfile
import json
import csv
import datetime


class GetLogs:
def iso_to_epoch(self, timestamp_str):
"""
        converts an ISO timestamp to epoch time (seconds and nanoseconds)
        :param timestamp_str: string with an ISO-format time
        :return: dict with epoch seconds and nanos, or None if parsing fails
"""
try:
datetime_obj = datetime.datetime.fromisoformat(timestamp_str)
seconds = int(datetime_obj.timestamp())
nanos = datetime_obj.microsecond * 1000
return {"seconds": seconds, "nanos": nanos}
except ValueError as e:
print(f"Error parsing timestamp: {e}")
return None
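        # Illustrative example (values assumed, not taken from the original code):
        #   iso_to_epoch("2024-07-15T10:30:00.123456+00:00")
        #   returns {"seconds": 1721039400, "nanos": 123456000}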

def get_sorted_files(self, files, log_type, log_format):
"""
        for each file it reads the first log with a valid timestamp and then
        arranges the files in order of those timestamps
:param files: list of file names
:param log_type: gke/gcsfuse
:param log_format: json/csv
:return: ordered list of files
"""
unordered_list = []
for file in files:
if file.find(".zip") != -1:
destination_dir = file[0:file.rfind("/")+1]
zip_ref = zipfile.ZipFile(file, "r")
zip_list = zip_ref.namelist()
zip_ref.extractall(destination_dir)
# adding the extracted files to the files
for zipf in zip_list:
files.append(destination_dir+zipf)
else:
unordered_list.append(file)
# to arrange the files sequentially
file_tuple = []
pos = 0
if log_type == "gcsfuse":
for file in unordered_list:
with open(file, "r") as handle:
for line in handle:
data = line.strip()
try:
json_object = json.loads(data)
sec = json_object["timestamp"]["seconds"]
nano = json_object["timestamp"]["nanos"]
file_tuple.append([[sec, nano], pos])
break
except json.JSONDecodeError:
print(f"Error parsing line: {line}")
pos += 1
elif log_type == "gke":
for file in unordered_list:
if log_format == "JSON":
with open(file, "r") as handle:
data = json.load(handle)
for obj in data:
timestamp = self.iso_to_epoch(obj["timestamp"])
if timestamp is not None:
file_tuple.append([[timestamp["seconds"], timestamp["nanos"]], pos])
break
else:
with open(file, 'r') as csvfile:
reader = csv.reader(csvfile)
header_row = next(reader)
fields_to_extract = ["timestamp", "textPayload"]
field_indices = {field: header_row.index(field) for field in fields_to_extract if field in header_row}

for row in reader:
timestamp = self.iso_to_epoch(row[field_indices["timestamp"]])
if timestamp is not None:
file_tuple.append([[timestamp["seconds"], timestamp["nanos"]], pos])
break
pos += 1
        else:
            # use unordered_list (zip archives excluded) so that pos indexes it correctly
            for file in unordered_list:
                with open(file, "r") as handle:
                    for line in handle:
                        time_ind = line.find("time=\"")
                        # skip lines without a time field
                        if time_ind == -1:
                            continue
                        start_ind = time_ind + len("time=\"")
                        end_ind = line.find("\"", start_ind)
                        if end_ind != -1:
                            time = line[start_ind:end_ind]
                            datetime_obj = datetime.datetime.strptime(time, "%d/%m/%Y %H:%M:%S.%f")
                            epoch_time = datetime_obj.timestamp()
                            # the fractional part of the timestamp is in microseconds
                            microseconds = int(time.split('.')[-1])
                            nanoseconds = microseconds * 1000
                            file_tuple.append([[epoch_time, nanoseconds], pos])
                            break
                pos += 1
file_tuple.sort()
ordered_list = []
for file_tup in file_tuple:
ordered_list.append(unordered_list[file_tup[1]])
return ordered_list
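    # Illustrative example (hypothetical file names): if a.log's first timestamp
    # is later than b.log's, get_sorted_files(["a.log", "b.log"], "gcsfuse", "")
    # returns ["b.log", "a.log"].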

def append_logs(self, logs, temp_logs):
"""
        extracts timestamp and message (textPayload) from each log and appends it to the list of logs,
        reversing the order when the file's logs have decreasing timestamps
        :param logs: list of logs (appended to in place)
        :param temp_logs: logs of a single file, with more fields than we need
"""
first_log = self.iso_to_epoch(temp_logs[0]["timestamp"])
last_log = self.iso_to_epoch(temp_logs[len(temp_logs) - 1]["timestamp"])
first_log_time = first_log["seconds"] + 1e-9*first_log["nanos"]
last_log_time = last_log["seconds"] + 1e-9*last_log["nanos"]
if first_log_time < last_log_time:
for obj in temp_logs:
if "timestamp" in obj.keys() and "textPayload" in obj.keys():
json_log = {"timestamp": self.iso_to_epoch(obj["timestamp"]), "message": obj["textPayload"]}
logs.append(json_log)
        else:
            # timestamps in this file are decreasing, so walk it in reverse
            for obj in reversed(temp_logs):
                if "timestamp" in obj and "textPayload" in obj:
                    json_log = {"timestamp": self.iso_to_epoch(obj["timestamp"]), "message": obj["textPayload"]}
                    logs.append(json_log)
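        # Note: the reverse walk above handles files whose logs are stored with
        # decreasing timestamps, so each file's logs are appended oldest-first.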

def get_json_logs(self, files, log_type, interval, log_format):
"""
        calls get_sorted_files to sort the files and then, depending on the format,
        opens each file and calls append_logs, or directly appends the logs (for JSON gcsfuse logs)
:param files: list of file names
:param log_type: gke/gcsfuse
:param interval: time interval for which logs are wanted
:param log_format: json/csv
:return: a list of json logs with two fields message and timestamp(epoch)
"""
ordered_files = self.get_sorted_files(files, log_type, log_format)
logs = []
for file in ordered_files:
if log_type == "gcsfuse":
with open(file, "r") as handle:
for line in handle:
data = line.strip()
try:
json_object = json.loads(data)
if json_object["timestamp"]["seconds"] < interval[0]:
continue
elif json_object["timestamp"]["seconds"] > interval[1]:
break
logs.append(json_object)
except json.JSONDecodeError:
print(f"Error parsing line: {line}")

elif log_type == "gke":
if log_format == "JSON":
with open(file, "r") as handle:
data = json.load(handle)
if not isinstance(data, list):
raise ValueError("Expected a JSON list in the file")
self.append_logs(logs, data)
else:
temp_logs = []
with open(file, 'r') as csvfile:
reader = csv.reader(csvfile)
header_row = next(reader)
fields_to_extract = ["timestamp", "textPayload"]
field_indices = {field: header_row.index(field) for field in fields_to_extract if field in header_row}
for row in reader:
json_log = {"timestamp": row[field_indices["timestamp"]], "textPayload": row[field_indices["textPayload"]]}
temp_logs.append(json_log)
self.append_logs(logs, temp_logs)
            else:
                with open(file, 'r') as handle:
                    for line in handle:
                        time_ind = line.find("time=\"")
                        message_ind = line.find("message=\"")
                        # skip lines that do not contain both a time and a message field
                        if time_ind == -1 or message_ind == -1:
                            continue
                        start_ind = time_ind + len("time=\"")
                        end_ind = line.find("\"", start_ind)
                        start_ind1 = message_ind + len("message=\"")
                        end_ind1 = line.rfind("\"")
                        if end_ind != -1 and end_ind1 != -1:
                            time = line[start_ind:end_ind]
                            message = line[start_ind1:end_ind1]
                            # unescape quotes inside the message
                            message = message.replace(r"\"", "\"")
                            datetime_obj = datetime.datetime.strptime(time, "%d/%m/%Y %H:%M:%S.%f")
                            epoch_time = int(datetime_obj.timestamp())
                            # the fractional part of the timestamp is in microseconds
                            microseconds = int(time.split('.')[-1])
                            nanoseconds = microseconds * 1000
                            log_data = {"timestamp": {"seconds": epoch_time, "nanos": nanoseconds}, "message": message}
                            logs.append(log_data)

return logs
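    # Illustrative shape of the returned list (values assumed):
    #   [{"timestamp": {"seconds": 1721039400, "nanos": 123456000}, "message": "<log text>"}, ...]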
35 changes: 35 additions & 0 deletions tools/log_analyser/inputreader/user_input.py
@@ -0,0 +1,35 @@
from inputreader.get_logs import GetLogs
import os


class UserInput:
def get_input(self):
"""
takes the directory and log information from user
and calls appropriate functions to get sorted logs
:return: list of sorted logs
"""
files = []
directory_path = input("Enter the path to the directory containing log files: ")
for filename in os.listdir(directory_path):
# Construct the full path to the file
file_path = os.path.join(directory_path, filename)

        # Check if it's a regular file (not a directory)
if os.path.isfile(file_path):
files.append(file_path)

        add_time_filter = input("Do you want the time filter (y/n): ")
        if add_time_filter.lower() == "y":
start_time = int(input("start time(epoch): "))
end_time = int(input("end time(epoch): "))
else:
start_time = 0
end_time = 1e18
get_logs_obj = GetLogs()
log_type = input("Enter the type of logs (gcsfuse/gke): ")
log_format = ""
if log_type == "gke":
log_format = input("Enter the format of the GKE logs (CSV/JSON): ")
logs = get_logs_obj.get_json_logs(files, log_type, [start_time, end_time], log_format)
return logs
7 changes: 7 additions & 0 deletions tools/log_analyser/main.py
@@ -0,0 +1,7 @@
from parser import log_parser
from outputgenerator import generate_gsheet
from inputreader.user_input import UserInput
user_input_obj = UserInput()
logs = user_input_obj.get_input()
global_data = log_parser.general_parser(logs)
generate_gsheet.main_gsheet_generator(global_data)
