Add some working metrics for log analyser project (#1983)
* adds folder for log analyser project

* adds code for input and log segregation

* Adds open handles and read pattern metrics

* Adds usage info, .txt support and minor changes

* Adds stat/list call metric

* Adds closed handles metric

* Adds structured code

* Adds output module

* minor fix

* Adds csv generation function

* Adds some structural changes

* Deletes extra_log_info.go

* Minor changes

* Changed output to gsheets

* Minor changes

* Adds write patterns, GCS read patterns, GCS call stats for top files

* Adds JSON support for GKE logs

* Minor fixes

* Adds support for files which contain logs with decreasing timestamp

* Adds some documentation

---------

Co-authored-by: Ankita Luthra <lankita@google.com>
pat-vish and ankitaluthra1 authored Jul 15, 2024
1 parent 3525748 commit 2a07dc3
Showing 14 changed files with 1,205 additions and 0 deletions.
54 changes: 54 additions & 0 deletions tools/log_analyser/README.md
@@ -0,0 +1,54 @@
This log analyzer takes a directory of log files and a few filters, and outputs the analysis to a Google Sheet.

Install Python if it is not already installed.

Create a Python virtual environment using the following steps
(replace `python3` with `python` if you don't have `python3`):

1. Install python3-venv: `sudo apt install python3-venv`

2. Run `python3 -m venv /path`, replacing `/path` with the location where you want to create the virtual environment (preferably outside the repo, to avoid creating unwanted files).
3. Activate the environment with `source venv_name/bin/activate`, replacing `venv_name` with the location where you created the environment.

(Deactivate the environment with the command `deactivate` once you finish running the code.)



Install numpy: `pip install numpy`

Install gspread: `pip install gspread`


Run the code with `python3 main.py` (or `python main.py` if `python3` is not available).

Enter the absolute path of the directory that contains the log files,
for example `/usr/local/google/home/patelvishvesh/tmp/test_dir`

Make sure the directory contains only files, not folders.

You can also place a zip file inside the directory. The zip should contain only files, not folders.


Choose whether you want a time-window filter (by entering y/n)

If yes, enter the start and end times (epoch)

Enter the type of logs (gke/gcsfuse)

If you chose gke, enter the format of the logs (CSV/JSON)

Enter your LDAP (to grant access to the created sheet)

For example: `patelvishvesh`


Enter the path to the credentials file

For example: `/usr/local/google/home/patelvishvesh/Downloads/credentials.json`

After this, a Google Sheet link will be generated.
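
For reference, the end-to-end flow mirrors `main.py` from this commit; a minimal sketch of driving the analyser from Python (the prompts above are still asked interactively):

```python
from inputreader.user_input import UserInput
from parser import log_parser
from outputgenerator import generate_gsheet

user_input_obj = UserInput()
logs = user_input_obj.get_input()                    # interactive prompts
global_data = log_parser.general_parser(logs)        # compute the metrics
generate_gsheet.main_gsheet_generator(global_data)   # create the Google Sheet
```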

193 changes: 193 additions & 0 deletions tools/log_analyser/inputreader/get_logs.py
@@ -0,0 +1,193 @@
import zipfile
import json
import csv
import datetime


class GetLogs:
def iso_to_epoch(self, timestamp_str):
"""
        converts an ISO timestamp to epoch time (seconds and nanoseconds)
        :param timestamp_str: string with an ISO-format time
        :return: dict with epoch seconds and nanos, or None if parsing fails
"""
try:
datetime_obj = datetime.datetime.fromisoformat(timestamp_str)
seconds = int(datetime_obj.timestamp())
nanos = datetime_obj.microsecond * 1000
return {"seconds": seconds, "nanos": nanos}
except ValueError as e:
print(f"Error parsing timestamp: {e}")
return None
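        # Illustrative example (values assumed, not taken from the original code):
        #   iso_to_epoch("2024-07-15T10:30:00.123456+00:00")
        #   returns {"seconds": 1721039400, "nanos": 123456000}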

def get_sorted_files(self, files, log_type, log_format):
"""
        for each file it reads the first log with a valid timestamp and then
        arranges the files in order of those timestamps
:param files: list of file names
:param log_type: gke/gcsfuse
:param log_format: json/csv
:return: ordered list of files
"""
unordered_list = []
for file in files:
if file.find(".zip") != -1:
destination_dir = file[0:file.rfind("/")+1]
zip_ref = zipfile.ZipFile(file, "r")
zip_list = zip_ref.namelist()
zip_ref.extractall(destination_dir)
# adding the extracted files to the files
for zipf in zip_list:
files.append(destination_dir+zipf)
else:
unordered_list.append(file)
# to arrange the files sequentially
file_tuple = []
pos = 0
if log_type == "gcsfuse":
for file in unordered_list:
with open(file, "r") as handle:
for line in handle:
data = line.strip()
try:
json_object = json.loads(data)
sec = json_object["timestamp"]["seconds"]
nano = json_object["timestamp"]["nanos"]
file_tuple.append([[sec, nano], pos])
break
except json.JSONDecodeError:
print(f"Error parsing line: {line}")
pos += 1
elif log_type == "gke":
for file in unordered_list:
if log_format == "JSON":
with open(file, "r") as handle:
data = json.load(handle)
for obj in data:
timestamp = self.iso_to_epoch(obj["timestamp"])
if timestamp is not None:
file_tuple.append([[timestamp["seconds"], timestamp["nanos"]], pos])
break
else:
with open(file, 'r') as csvfile:
reader = csv.reader(csvfile)
header_row = next(reader)
fields_to_extract = ["timestamp", "textPayload"]
field_indices = {field: header_row.index(field) for field in fields_to_extract if field in header_row}

for row in reader:
timestamp = self.iso_to_epoch(row[field_indices["timestamp"]])
if timestamp is not None:
file_tuple.append([[timestamp["seconds"], timestamp["nanos"]], pos])
break
pos += 1
        else:
            # use unordered_list (zip archives excluded) so that pos indexes it correctly
            for file in unordered_list:
                with open(file, "r") as handle:
                    for line in handle:
                        time_ind = line.find("time=\"")
                        # skip lines without a time field
                        if time_ind == -1:
                            continue
                        start_ind = time_ind + len("time=\"")
                        end_ind = line.find("\"", start_ind)
                        if end_ind != -1:
                            time = line[start_ind:end_ind]
                            datetime_obj = datetime.datetime.strptime(time, "%d/%m/%Y %H:%M:%S.%f")
                            epoch_time = datetime_obj.timestamp()
                            # the fractional part of the timestamp is in microseconds
                            microseconds = int(time.split('.')[-1])
                            nanoseconds = microseconds * 1000
                            file_tuple.append([[epoch_time, nanoseconds], pos])
                            break
                pos += 1
file_tuple.sort()
ordered_list = []
for file_tup in file_tuple:
ordered_list.append(unordered_list[file_tup[1]])
return ordered_list
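    # Illustrative example (hypothetical file names): if a.log's first timestamp
    # is later than b.log's, get_sorted_files(["a.log", "b.log"], "gcsfuse", "")
    # returns ["b.log", "a.log"].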

def append_logs(self, logs, temp_logs):
"""
        extracts timestamp and message (textPayload) from each log and appends it to the list of logs,
        reversing the order when the file's logs have decreasing timestamps
        :param logs: list of logs (appended to in place)
        :param temp_logs: logs of a single file, with more fields than we need
"""
first_log = self.iso_to_epoch(temp_logs[0]["timestamp"])
last_log = self.iso_to_epoch(temp_logs[len(temp_logs) - 1]["timestamp"])
first_log_time = first_log["seconds"] + 1e-9*first_log["nanos"]
last_log_time = last_log["seconds"] + 1e-9*last_log["nanos"]
if first_log_time < last_log_time:
for obj in temp_logs:
if "timestamp" in obj.keys() and "textPayload" in obj.keys():
json_log = {"timestamp": self.iso_to_epoch(obj["timestamp"]), "message": obj["textPayload"]}
logs.append(json_log)
        else:
            # timestamps in this file are decreasing, so walk it in reverse
            for obj in reversed(temp_logs):
                if "timestamp" in obj and "textPayload" in obj:
                    json_log = {"timestamp": self.iso_to_epoch(obj["timestamp"]), "message": obj["textPayload"]}
                    logs.append(json_log)
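        # Note: the reverse walk above handles files whose logs are stored with
        # decreasing timestamps, so each file's logs are appended oldest-first.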

def get_json_logs(self, files, log_type, interval, log_format):
"""
        calls get_sorted_files to sort the files and then, depending on the format,
        opens each file and calls append_logs, or directly appends the logs (for JSON gcsfuse logs)
:param files: list of file names
:param log_type: gke/gcsfuse
:param interval: time interval for which logs are wanted
:param log_format: json/csv
:return: a list of json logs with two fields message and timestamp(epoch)
"""
ordered_files = self.get_sorted_files(files, log_type, log_format)
logs = []
for file in ordered_files:
if log_type == "gcsfuse":
with open(file, "r") as handle:
for line in handle:
data = line.strip()
try:
json_object = json.loads(data)
if json_object["timestamp"]["seconds"] < interval[0]:
continue
elif json_object["timestamp"]["seconds"] > interval[1]:
break
logs.append(json_object)
except json.JSONDecodeError:
print(f"Error parsing line: {line}")

elif log_type == "gke":
if log_format == "JSON":
with open(file, "r") as handle:
data = json.load(handle)
if not isinstance(data, list):
raise ValueError("Expected a JSON list in the file")
self.append_logs(logs, data)
else:
temp_logs = []
with open(file, 'r') as csvfile:
reader = csv.reader(csvfile)
header_row = next(reader)
fields_to_extract = ["timestamp", "textPayload"]
field_indices = {field: header_row.index(field) for field in fields_to_extract if field in header_row}
for row in reader:
json_log = {"timestamp": row[field_indices["timestamp"]], "textPayload": row[field_indices["textPayload"]]}
temp_logs.append(json_log)
self.append_logs(logs, temp_logs)
            else:
                with open(file, 'r') as handle:
                    for line in handle:
                        time_ind = line.find("time=\"")
                        message_ind = line.find("message=\"")
                        # skip lines that do not contain both a time and a message field
                        if time_ind == -1 or message_ind == -1:
                            continue
                        start_ind = time_ind + len("time=\"")
                        end_ind = line.find("\"", start_ind)
                        start_ind1 = message_ind + len("message=\"")
                        end_ind1 = line.rfind("\"")
                        if end_ind != -1 and end_ind1 != -1:
                            time = line[start_ind:end_ind]
                            message = line[start_ind1:end_ind1]
                            # unescape quotes inside the message
                            message = message.replace(r"\"", "\"")
                            datetime_obj = datetime.datetime.strptime(time, "%d/%m/%Y %H:%M:%S.%f")
                            epoch_time = int(datetime_obj.timestamp())
                            # the fractional part of the timestamp is in microseconds
                            microseconds = int(time.split('.')[-1])
                            nanoseconds = microseconds * 1000
                            log_data = {"timestamp": {"seconds": epoch_time, "nanos": nanoseconds}, "message": message}
                            logs.append(log_data)

return logs
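    # Illustrative shape of the returned list (values assumed):
    #   [{"timestamp": {"seconds": 1721039400, "nanos": 123456000}, "message": "<log text>"}, ...]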
35 changes: 35 additions & 0 deletions tools/log_analyser/inputreader/user_input.py
@@ -0,0 +1,35 @@
from inputreader.get_logs import GetLogs
import os


class UserInput:
def get_input(self):
"""
takes the directory and log information from user
and calls appropriate functions to get sorted logs
:return: list of sorted logs
"""
files = []
directory_path = input("Enter the path to the directory containing log files: ")
for filename in os.listdir(directory_path):
# Construct the full path to the file
file_path = os.path.join(directory_path, filename)

        # Check if it's a regular file (not a directory)
if os.path.isfile(file_path):
files.append(file_path)

        add_time_filter = input("Do you want the time filter (y/n): ")
        if add_time_filter.lower() == "y":
start_time = int(input("start time(epoch): "))
end_time = int(input("end time(epoch): "))
else:
start_time = 0
end_time = 1e18
get_logs_obj = GetLogs()
log_type = input("Enter the type of logs (gcsfuse/gke): ")
log_format = ""
if log_type == "gke":
log_format = input("Enter the format of the GKE logs (CSV/JSON): ")
logs = get_logs_obj.get_json_logs(files, log_type, [start_time, end_time], log_format)
return logs
7 changes: 7 additions & 0 deletions tools/log_analyser/main.py
@@ -0,0 +1,7 @@
from parser import log_parser
from outputgenerator import generate_gsheet
from inputreader.user_input import UserInput
user_input_obj = UserInput()
logs = user_input_obj.get_input()
global_data = log_parser.general_parser(logs)
generate_gsheet.main_gsheet_generator(global_data)
