This commit is contained in:
Jackson Harper
2024-06-20 15:52:23 +08:00
parent 17e5978792
commit 0e321143f6
2 changed files with 5 additions and 18 deletions

View File

@ -90,13 +90,13 @@ def compute_interaction_score(user_id, item_features):
'item_has_thumbnail': 1 if item_features.get('has_thumbnail') else 0,
"item_has_site_icon": 1 if item_features.get('has_site_icon') else 0,
'item_word_count': item_features.get('words_count'),
'item_word_count': item_features.get('words_count'),
'is_subscription': 1 if item_features.get('is_subscription') else 0,
'is_newsletter': 1 if item_features.get('is_newsletter') else 0,
'is_newsletter': 1 if item_features.get('is_newsletter') else 0,
'is_feed': 1 if item_features.get('is_feed') else 0,
'days_since_subscribed': item_features.get('days_since_subscribed'),
'subscription_count': item_features.get('subscription_count'),
'subscription_auto_add_to_library': item_features.get('subscription_auto_add_to_library'),
'days_since_subscribed': item_features.get('days_since_subscribed'),
'subscription_count': item_features.get('subscription_count'),
'subscription_auto_add_to_library': item_features.get('subscription_auto_add_to_library'),
'subscription_fetch_content': item_features.get('subscription_fetch_content'),
'has_author': 1 if item_features.get('author') else 0,

View File

@ -83,10 +83,6 @@ def parquet_to_dataframe(file_path):
df = table.to_pandas()
return df
def load_local_raw_library_items(
    local_file_path='/Users/jacksonh/Downloads/data_raw_library_items_2024-03-01.parquet',
):
    """Load a local snapshot of the raw library-items dataset into a DataFrame.

    Development-only helper: reads a parquet export from the local filesystem
    instead of downloading it from GCS (see the commented-out call in
    ``generate_and_upload_user_history``).

    Args:
        local_file_path: Path to the parquet snapshot. Defaults to the
            original hard-coded developer path for backward compatibility;
            pass an explicit path to reuse this helper on another machine.

    Returns:
        pandas.DataFrame with the parquet file's contents.
    """
    # Delegate parquet -> DataFrame conversion to the shared helper.
    return parquet_to_dataframe(local_file_path)
def load_tables_from_pickle(pickle_file):
with open(pickle_file, 'rb') as handle:
@ -119,14 +115,6 @@ def load_feather_files(feature_directory):
return dataframes
# def save_tables_to_arrow_ipc(tables, output_file):
# with pa.OSFile(output_file, 'wb') as sink:
# with pa.ipc.new_stream(sink, tables[next(iter(tables))].schema) as writer:
# for name, table in tables.items():
# print("NAME:", name, "TABLE", table)
# writer.write_table(table)
def save_tables_to_arrow_ipc_with_schemas(tables, output_file):
with pa.OSFile(output_file, 'wb') as sink:
with pa.ipc.new_stream(sink, pa.schema([])) as writer:
@ -153,7 +141,6 @@ def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
def generate_and_upload_user_history(execution_date, gcs_bucket_name):
df = download_raw_library_items(execution_date, gcs_bucket_name)
# df = load_local_raw_library_items()
with tempfile.TemporaryDirectory() as tmpdir:
user_preferences = aggregate_user_preferences(df, tmpdir)
dataframes = load_feather_files(tmpdir)