From 4a0c3c78e7f688b71d97a601d5b44146fccf8b90 Mon Sep 17 00:00:00 2001 From: hariharandev1 Date: Fri, 4 Oct 2024 12:07:52 -0700 Subject: [PATCH] added pid tid and hhash column to metadata --- dfanalyzer/main.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/dfanalyzer/main.py b/dfanalyzer/main.py index e0c57b7..8e69cee 100644 --- a/dfanalyzer/main.py +++ b/dfanalyzer/main.py @@ -286,7 +286,6 @@ def io_function(json_object, current_dict, time_approximate,condition_fn): def io_columns(): conf = get_dft_configuration() return { - 'hhash': "uint64[pyarrow]", 'compute_time': "string[pyarrow]" if not conf.time_approximate else "uint64[pyarrow]", 'io_time': "string[pyarrow]" if not conf.time_approximate else "uint64[pyarrow]", 'app_io_time': "string[pyarrow]" if not conf.time_approximate else "uint64[pyarrow]", @@ -458,10 +457,10 @@ def __init__(self, file_pattern, load_fn=None, load_cols={}, load_data = {}, met 'tinterval': "string[pyarrow]" if not self.conf.time_approximate else "uint64[pyarrow]", 'trange': "uint64[pyarrow]"} columns.update(io_columns()) columns.update(load_cols) - file_hash_columns = {'name': "string[pyarrow]", 'hash':"uint64[pyarrow]"} - hostname_hash_columns = {'name': "string[pyarrow]", 'hash':"uint64[pyarrow]"} - string_hash_columns = {'name': "string[pyarrow]", 'hash':"uint64[pyarrow]"} - other_metadata_columns = { 'name':"string[pyarrow]" ,'value':"string[pyarrow]" } + file_hash_columns = {'name': "string[pyarrow]", 'hash':"uint64[pyarrow]",'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]", 'hhash': "uint64[pyarrow]"} + hostname_hash_columns = {'name': "string[pyarrow]", 'hash':"uint64[pyarrow]",'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]", 'hhash': "uint64[pyarrow]"} + string_hash_columns = {'name': "string[pyarrow]", 'hash':"uint64[pyarrow]",'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]", 'hhash': "uint64[pyarrow]"} + other_metadata_columns = { 'name':"string[pyarrow]" ,'value':"string[pyarrow]",'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]", 'hhash': "uint64[pyarrow]"} if "FH" in metadata_cols: file_hash_columns.update(metadata_cols["FH"]) if "HH" in metadata_cols: @@ -480,7 +479,7 @@ def __init__(self, file_pattern, load_fn=None, load_cols={}, load_data = {}, met self.file_hash = self.all_events.query("type == 1")[list(file_hash_columns.keys())].groupby('hash').first().persist() self.host_hash = self.all_events.query("type == 2")[list(hostname_hash_columns.keys())].groupby('hash').first().persist() self.string_hash = self.all_events.query("type == 3")[list(string_hash_columns.keys())].groupby('hash').first().persist() - self.metadata = self.all_events.query("type == 4")[list(other_metadata_columns.keys())].persist() + self.metadata = self.all_events.query("type == 4")[list(other_metadata_columns.keys())].persist() self.n_partition = math.ceil(total_size.compute() / (128 * 1024 ** 2)) logging.debug(f"Number of partitions used are {self.n_partition}") self.events = events.repartition(npartitions=self.n_partition).persist()