From 580c25f36e8aeccf3d4319efef053542e010e29d Mon Sep 17 00:00:00 2001 From: Rick Ross Date: Fri, 8 Dec 2023 13:26:49 -0800 Subject: [PATCH] Modified to only read in keys needed from CSV file. --- lib_afc_mosaic/helpers.py | 8 ++++-- lib_afc_mosaic/universe_map_file.py | 39 ++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/lib_afc_mosaic/helpers.py b/lib_afc_mosaic/helpers.py index 0ae0b11..5244059 100644 --- a/lib_afc_mosaic/helpers.py +++ b/lib_afc_mosaic/helpers.py @@ -5,7 +5,7 @@ Helper functions for the Mosaic library import csv import codecs -def read_voterdata_csv_stream(input_stream) -> list: +def read_voterdata_csv_stream(input_stream, keys_needed=None) -> list: """ read_voterdata_csv_stream: Read csv stream to list @@ -16,9 +16,13 @@ def read_voterdata_csv_stream(input_stream) -> list: rows = [] # note: using 'utf-8-sig' removes the annoying \ufeff BOM signature + if keys_needed is None or len(keys_needed) > 0: + key_list = keys_needed + reader = csv.DictReader(codecs.getreader("utf-8-sig")(input_stream)) for row in reader: - key_list = row.keys() + if key_list is None: + key_list = row.keys() new_row = {} for k in key_list: new_row[k] = row[k] diff --git a/lib_afc_mosaic/universe_map_file.py b/lib_afc_mosaic/universe_map_file.py index f40aa93..7d913ad 100644 --- a/lib_afc_mosaic/universe_map_file.py +++ b/lib_afc_mosaic/universe_map_file.py @@ -202,18 +202,9 @@ class AFCMUniverseMapFile: :param voterset_filename: The VoterSet filename to process """ - amplify_rows = [] - removed_rows = [] - - try: - voterset_filename_s3_key = os.path.join(self.metadata.get("s3_key", ""), voterset_filename) - file_data_stream = self.voterset_s3_connection.get_object(self.voterset_s3_bucket, - voterset_filename_s3_key) - except Exception as ex: - raise Exception(f"universe_map_file.process_file: Failed to get {self.voterset_s3_bucket}/{voterset_filename_s3_key}: {ex}") from ex - - rows = read_voterdata_csv_stream(file_data_stream) - logging.debug("rows = %s", len(rows)) + if 'mapping' not in self.mosaic_file_list[voterset_filename]: + logging.debug("Missing 'mapping' key, skipping %s", voterset_filename) + return None mapping = self.mosaic_file_list[voterset_filename]['mapping'] @@ -225,6 +216,30 @@ class AFCMUniverseMapFile: voterset_filename) return None + # Faster & less memory to only read the keys we need from CSV + csv_keys_needed = [] + for k,v in mapping.items(): + if isinstance(v, dict): + if 'field' in v: + csv_keys_needed.append(v['field']) + elif 'combine' in v: + for k1,v1 in v['combine'].items(): + if k1.startswith('field'): + csv_keys_needed.append(v1) + + amplify_rows = [] + removed_rows = [] + + try: + voterset_filename_s3_key = os.path.join(self.metadata.get("s3_key", ""), voterset_filename) + file_data_stream = self.voterset_s3_connection.get_object(self.voterset_s3_bucket, + voterset_filename_s3_key) + except Exception as ex: + raise Exception(f"universe_map_file.process_file: Failed to get {self.voterset_s3_bucket}/{voterset_filename_s3_key}: {ex}") from ex + + rows = read_voterdata_csv_stream(file_data_stream, csv_keys_needed) + logging.debug("rows = %s", len(rows)) + # Update incoming stats after reading this file map self.stats["incoming_count"][voterset_filename] = len(rows) self.stats["incoming_count"]["total"] += self.stats["incoming_count"][voterset_filename]