From 49cce94b297e35c028786e0b5129daa218394f26 Mon Sep 17 00:00:00 2001 From: Jackson Harper Date: Fri, 21 Jun 2024 09:18:33 +0800 Subject: [PATCH] Linting clean ups --- ml/digest-score/train.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/ml/digest-score/train.py b/ml/digest-score/train.py index fdd4d0b52..db8deff84 100644 --- a/ml/digest-score/train.py +++ b/ml/digest-score/train.py @@ -76,14 +76,10 @@ def load_and_sample_library_items_from_parquet(raw_file_path, sample_size): def merge_user_preference_data(sampled_raw_df, feature_dict): - # Start with the sampled raw DataFrame merged_df = sampled_raw_df - # Iterate through the files in the feature directory for key in feature_dict.keys(): user_preference_df = feature_dict[key] - - # Determine the dimension to join on if 'author' in key: merge_keys = ['user_id', 'author'] elif 'site' in key: @@ -95,13 +91,8 @@ def merge_user_preference_data(sampled_raw_df, feature_dict): else: print("skipping feature: ", key) continue # Skip files that don't match expected patterns - - # Merge with the current user preference DataFrame merged_df = pd.merge(merged_df, user_preference_df, on=merge_keys, how='left') - - # Optionally, fill NaNs after each merge step to avoid growing NaNs merged_df = merged_df.fillna(0) - return merged_df def prepare_data(df): @@ -119,7 +110,7 @@ def prepare_data(df): df['days_since_subscribed'] = df['days_since_subscribed'].fillna(0).astype(int) df['is_feed'] = df['subscription_type'].apply(lambda x: 1 if x == 'RSS' else 0) - df['is_newsletter'] = df['subscription_type'].apply(lambda x: 1 if x == 'NEWSLETTER' else 0) + df['is_newsletter'] = df['subscription_type'].apply(lambda x: 1 if x == 'NEWSLETTER' else 0) df = df.dropna(subset=['user_clicked'])