Added 'timings' to the file processing stats

This commit is contained in:
Rick Ross 2023-12-08 16:26:09 -08:00
parent 580c25f36e
commit 4073ca5a9f
1 changed file with 23 additions and 2 deletions

View File

@ -6,6 +6,7 @@ import json
import os
import logging
import uuid
import time
import phonenumbers
@ -54,7 +55,8 @@ class AFCMUniverseMapFile:
"incoming_count": { "total": 0 },
"removed_count": { "total": 0 },
"cleaned_count": { "total": 0 },
"removed_details": {}
"removed_details": {},
"timings": {}
}
@ -201,6 +203,7 @@ class AFCMUniverseMapFile:
:param voterset_filename: The VoterSet filename to process
"""
overall_time_start = time.time()
if 'mapping' not in self.mosaic_file_list[voterset_filename]:
logging.debug("Missing 'mapping' key, skipping %s", voterset_filename)
@ -230,6 +233,7 @@ class AFCMUniverseMapFile:
amplify_rows = []
removed_rows = []
reading_time_start = time.time()
try:
voterset_filename_s3_key = os.path.join(self.metadata.get("s3_key", ""), voterset_filename)
file_data_stream = self.voterset_s3_connection.get_object(self.voterset_s3_bucket,
@ -238,12 +242,14 @@ class AFCMUniverseMapFile:
raise Exception(f"universe_map_file.process_file: Failed to get {self.voterset_s3_bucket}/{voterset_filename_s3_key}: {ex}") from ex
rows = read_voterdata_csv_stream(file_data_stream, csv_keys_needed)
reading_time = time.time() - reading_time_start
logging.debug("rows = %s", len(rows))
# Update incoming stats after reading this file map
self.stats["incoming_count"][voterset_filename] = len(rows)
self.stats["incoming_count"]["total"] += self.stats["incoming_count"][voterset_filename]
processing_time_start = time.time()
for i,r in enumerate(rows):
if 'combine' in mapping['Cell_Phone']:
@ -342,10 +348,12 @@ class AFCMUniverseMapFile:
amplify_rows.append(new_row)
self.final_rows.append(new_row)
processing_time = time.time() - processing_time_start
(filename_prefix, extension) = os.path.splitext(voterset_filename)
# Need to write *_PREPPED.csv
writing_cleaned_time_start = time.time()
prep_file_name = f"{filename_prefix}_PREPPED.csv"
prep_full_pathname = f"/tmp/{prep_file_name}"
if len(amplify_rows) > 0:
@ -362,8 +370,10 @@ class AFCMUniverseMapFile:
# remove the temp file now
os.remove(prep_full_pathname)
writing_cleaned_time = time.time() - writing_cleaned_time_start
# Need to write *_REMOVED.csv
writing_removed_time_start = time.time()
removed_file_name = f"{filename_prefix}_REMOVED.csv"
removed_full_pathname = f"/tmp/{removed_file_name}"
if len(removed_rows) > 0:
@ -380,17 +390,28 @@ class AFCMUniverseMapFile:
# remove the temp file now
os.remove(removed_full_pathname)
writing_removed_time = time.time() - writing_removed_time_start
self.removed_row_count += len(removed_rows)
self.processed_row_count += len(rows)
# Update cleaned & removed stats for this file
# Update stats for this file
self.stats['cleaned_count'][voterset_filename] = len(amplify_rows)
self.stats['cleaned_count']["total"] += self.stats["cleaned_count"][voterset_filename]
self.stats['removed_count'][voterset_filename] = len(removed_rows)
self.stats['removed_count']["total"] += self.stats["removed_count"][voterset_filename]
overall_time = time.time() - overall_time_start
self.stats["timings"][voterset_filename] = {
"reading": reading_time,
"processing": processing_time,
"writing cleaned file": writing_cleaned_time,
"writing removed file": writing_removed_time,
"file total": overall_time
}
return None