diff --git a/ml/digest-score/app.py b/ml/digest-score/app.py
index 501a417be..f9c428100 100644
--- a/ml/digest-score/app.py
+++ b/ml/digest-score/app.py
@@ -90,13 +90,13 @@ def compute_interaction_score(user_id, item_features):
         'item_has_thumbnail': 1 if item_features.get('has_thumbnail') else 0,
         "item_has_site_icon": 1 if item_features.get('has_site_icon') else 0,
-        'item_word_count': item_features.get('words_count'),
+        'item_word_count': item_features.get('words_count'),
         'is_subscription': 1 if item_features.get('is_subscription') else 0,
-        'is_newsletter': 1 if item_features.get('is_newsletter') else 0,
+        'is_newsletter': 1 if item_features.get('is_newsletter') else 0,
         'is_feed': 1 if item_features.get('is_feed') else 0,
-        'days_since_subscribed': item_features.get('days_since_subscribed'),
-        'subscription_count': item_features.get('subscription_count'),
-        'subscription_auto_add_to_library': item_features.get('subscription_auto_add_to_library'),
+        'days_since_subscribed': item_features.get('days_since_subscribed'),
+        'subscription_count': item_features.get('subscription_count'),
+        'subscription_auto_add_to_library': item_features.get('subscription_auto_add_to_library'),
         'subscription_fetch_content': item_features.get('subscription_fetch_content'),
         'has_author': 1 if item_features.get('author') else 0,
diff --git a/ml/digest-score/features/user_history.py b/ml/digest-score/features/user_history.py
index 3906e6170..289ecc64d 100644
--- a/ml/digest-score/features/user_history.py
+++ b/ml/digest-score/features/user_history.py
@@ -83,10 +83,6 @@ def parquet_to_dataframe(file_path):
     df = table.to_pandas()
     return df
 
-def load_local_raw_library_items():
-    local_file_path = '/Users/jacksonh/Downloads/data_raw_library_items_2024-03-01.parquet'
-    df = parquet_to_dataframe(local_file_path)
-    return df
 
 def load_tables_from_pickle(pickle_file):
     with open(pickle_file, 'rb') as handle:
@@ -119,14 +115,6 @@ def load_feather_files(feature_directory):
     return dataframes
 
 
-# def save_tables_to_arrow_ipc(tables, output_file):
-#     with pa.OSFile(output_file, 'wb') as sink:
-#         with pa.ipc.new_stream(sink, tables[next(iter(tables))].schema) as writer:
-#             for name, table in tables.items():
-#                 print("NAME:", name, "TABLE", table)
-#                 writer.write_table(table)
-
-
 def save_tables_to_arrow_ipc_with_schemas(tables, output_file):
     with pa.OSFile(output_file, 'wb') as sink:
         with pa.ipc.new_stream(sink, pa.schema([])) as writer:
@@ -153,7 +141,6 @@ def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
 
 def generate_and_upload_user_history(execution_date, gcs_bucket_name):
     df = download_raw_library_items(execution_date, gcs_bucket_name)
-    # df = load_local_raw_library_items()
     with tempfile.TemporaryDirectory() as tmpdir:
         user_preferences = aggregate_user_preferences(df, tmpdir)
         dataframes = load_feather_files(tmpdir)