Update digest-score to run new model
This commit is contained in:
0
ml/digest-score/features/__init__.py
Normal file
0
ml/digest-score/features/__init__.py
Normal file
109
ml/digest-score/features/extract.py
Normal file
109
ml/digest-score/features/extract.py
Normal file
@ -0,0 +1,109 @@
|
||||
# extract and upload raw data used for feature generation
|
||||
|
||||
import psycopg2
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sqlalchemy import create_engine, text
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import os
|
||||
from io import BytesIO
|
||||
import tempfile
|
||||
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
from google.cloud import storage
|
||||
|
||||
# Connection parameters for the Omnivore Postgres database, read from the
# environment. `or` (rather than os.getenv's default argument) is used so that
# an env var that is set but EMPTY also falls back to the default value.
DB_PARAMS = {
    'dbname': os.getenv('DB_NAME') or 'omnivore',
    'user': os.getenv('DB_USER'),        # no fallback: None if unset
    'password': os.getenv('DB_PASSWORD'),  # no fallback: None if unset
    'host': os.getenv('DB_HOST') or 'localhost',
    'port': os.getenv('DB_PORT') or '5432'
}
|
||||
|
||||
def extract_host(url):
|
||||
try:
|
||||
return urlparse(url).netloc
|
||||
except Exception as e:
|
||||
return None
|
||||
|
||||
def fetch_raw_data(date_str, num_days_history):
    """Pull raw library-item + subscription rows from Postgres and return them
    as an in-memory parquet buffer.

    The window covers `num_days_history` days ending at `date_str`
    (inclusive: 00:00:00 on the start day through 23:59:59 on the end day).
    Rows are streamed in chunks to bound memory, spilled to per-chunk parquet
    files in a temp dir, then concatenated and re-serialized into a BytesIO.

    Returns:
        io.BytesIO positioned at 0, containing a single parquet file.
    """
    end_date = pd.to_datetime(date_str)
    start_date = end_date - timedelta(days=num_days_history)
    start_date_str = start_date.strftime('%Y-%m-%d 00:00:00')
    end_date_str = end_date.strftime('%Y-%m-%d 23:59:59')

    # NOTE(review): credentials are interpolated into the URL unescaped; a
    # password containing '@' or ':' would break the DSN — confirm inputs.
    conn_str = f"postgresql://{DB_PARAMS['user']}:{DB_PARAMS['password']}@{DB_PARAMS['host']}:{DB_PARAMS['port']}/{DB_PARAMS['dbname']}"
    # conn_str = f"postgresql://{DB_PARAMS['host']}:{DB_PARAMS['port']}/{DB_PARAMS['dbname']}"
    engine = create_engine(conn_str)

    # Raw per-item rows joined with the (optional) subscription that produced
    # them, plus binary engagement targets derived from read progress.
    query = text("""
        SELECT
            li.id as library_item_id,
            li.user_id,
            li.created_at,
            li.archived_at,
            li.deleted_at,
            CASE WHEN li.folder = 'inbox' then 1 else 0 END as inbox_folder,
            li.item_type,
            li.item_language AS language,
            li.content_reader,
            li.word_count as item_word_count,
            CASE WHEN li.thumbnail IS NOT NULL then 1 else 0 END as item_has_thumbnail,
            CASE WHEN li.site_icon IS NOT NULL then 1 else 0 END as item_has_site_icon,
            li.original_url,
            li.site_name AS site,
            li.author,
            li.subscription,
            sub.type as subscription_type,
            sub.created_at as subscription_start_date,
            sub.count as subscription_count,
            sub.auto_add_to_library as subscription_auto_add_to_library,
            sub.fetch_content as subscription_fetch_content,
            sub.folder as subscription_folder,
            CASE WHEN li.read_at is not NULL then 1 else 0 END as user_clicked,
            CASE WHEN li.reading_progress_bottom_percent > 10 THEN 1 ELSE 0 END AS user_read,
            CASE WHEN li.reading_progress_bottom_percent > 50 THEN 1 ELSE 0 END AS user_long_read
        FROM omnivore.library_item AS li
        LEFT JOIN omnivore.subscriptions sub on li.subscription = sub.name AND sub.user_id = li.user_id
        WHERE li.created_at >= :start_date AND li.created_at <= :end_date;
    """)

    chunk_size = 100000  # Adjust based on available memory and performance needs

    with tempfile.TemporaryDirectory() as tmpdir:
        parquet_files = []
        with engine.connect() as conn:
            for i, chunk in enumerate(pd.read_sql(query, conn, params={'start_date': start_date_str, 'end_date': end_date_str}, chunksize=chunk_size)):
                # UUID columns arrive as non-string objects; normalize to str
                # so parquet schemas are consistent across chunks.
                chunk['library_item_id'] = chunk['library_item_id'].astype(str)
                chunk['user_id'] = chunk['user_id'].astype(str)
                chunk['original_url_host'] = chunk['original_url'].apply(extract_host)

                parquet_file = os.path.join(tmpdir, f'chunk_{i}.parquet')
                chunk.to_parquet(parquet_file)
                parquet_files.append(parquet_file)

        # Re-read all chunk files and merge into one frame (peak memory is the
        # full result set at this point).
        concatenated_df = pd.concat([pd.read_parquet(file) for file in parquet_files], ignore_index=True)

        parquet_buffer = BytesIO()
        table = pa.Table.from_pandas(concatenated_df)
        pq.write_table(table, parquet_buffer)
        parquet_buffer.seek(0)  # rewind so callers can stream it directly

        return parquet_buffer
|
||||
|
||||
|
||||
def upload_raw_databuffer(feather_buffer, execution_date, gcs_bucket_name):
    """Upload an in-memory data buffer to GCS under data/raw/.

    NOTE(review): despite the parameter name, callers pass a *parquet* buffer
    (see fetch_raw_data) and the blob is named `.parquet` — the name
    `feather_buffer` appears to be a leftover; confirm before renaming.

    Args:
        feather_buffer: file-like object positioned at 0.
        execution_date: date string used in the destination blob name.
        gcs_bucket_name: target GCS bucket.
    """
    client = storage.Client()
    bucket = client.bucket(gcs_bucket_name)
    blob = bucket.blob(f'data/raw/library_items_{execution_date}.parquet')
    blob.upload_from_file(feather_buffer, content_type='application/octet-stream')

    print("Data stored successfully.")
|
||||
|
||||
|
||||
def extract_and_upload_raw_data(execution_date, num_days_history, gcs_bucket_name):
    """Fetch `num_days_history` days of raw library-item data ending at
    `execution_date` and upload it to GCS as a parquet blob."""
    raw_buffer = fetch_raw_data(execution_date, int(num_days_history))
    upload_raw_databuffer(raw_buffer, execution_date, gcs_bucket_name)
|
||||
|
||||
235
ml/digest-score/features/user_history.py
Normal file
235
ml/digest-score/features/user_history.py
Normal file
@ -0,0 +1,235 @@
|
||||
# download raw user data, aggregate user history, and upload to GCS
|
||||
|
||||
import psycopg2
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sqlalchemy import create_engine, text
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import os
|
||||
from io import BytesIO
|
||||
import tempfile
|
||||
|
||||
import pickle
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
import pyarrow.feather as feather
|
||||
from google.cloud import storage
|
||||
|
||||
# Model feature columns. NOTE(review): the ordering of the history features is
# irregular (week buckets are not grouped consistently); confirm whether the
# downstream model depends on column ORDER before tidying this list.
FEATURE_COLUMNS=[
    # targets
    # 'user_clicked', 'user_read', 'user_long_read',

    # item attributes / user setup attributes
    'item_word_count','item_has_site_icon', 'is_subscription',
    'inbox_folder', 'has_author',

    # how the user has setup the subscription
    'is_newsletter', 'is_feed', 'days_since_subscribed',
    'subscription_count', 'subscription_auto_add_to_library',
    'subscription_fetch_content',

    # user/item interaction history
    'user_original_url_host_saved_count_week_1',
    'user_original_url_host_interaction_count_week_1',
    'user_original_url_host_rate_week_1',
    'user_original_url_host_proportion_week_1',

    'user_original_url_host_saved_count_week_2',
    'user_original_url_host_interaction_count_week_2',
    'user_original_url_host_rate_week_2',
    'user_original_url_host_proportion_week_2',
    'user_original_url_host_saved_count_week_3',
    'user_original_url_host_interaction_count_week_3',
    'user_original_url_host_rate_week_3',
    'user_original_url_host_proportion_week_3',
    'user_original_url_host_saved_count_week_4',
    'user_original_url_host_interaction_count_week_4',
    'user_original_url_host_rate_week_4',
    'user_original_url_host_proportion_week_4',

    'user_subscription_saved_count_week_1',
    'user_subscription_interaction_count_week_1',
    'user_subscription_rate_week_1', 'user_subscription_proportion_week_1',
    'user_site_saved_count_week_3', 'user_site_interaction_count_week_3',
    'user_site_rate_week_3', 'user_site_proportion_week_3',
    'user_site_saved_count_week_2', 'user_site_interaction_count_week_2',
    'user_site_rate_week_2', 'user_site_proportion_week_2',
    'user_subscription_saved_count_week_2',
    'user_subscription_interaction_count_week_2',
    'user_subscription_rate_week_2', 'user_subscription_proportion_week_2',
    'user_site_saved_count_week_1', 'user_site_interaction_count_week_1',
    'user_site_rate_week_1', 'user_site_proportion_week_1',
    'user_subscription_saved_count_week_3',
    'user_subscription_interaction_count_week_3',
    'user_subscription_rate_week_3', 'user_subscription_proportion_week_3',
    'user_author_saved_count_week_4',
    'user_author_interaction_count_week_4', 'user_author_rate_week_4',
    'user_author_proportion_week_4', 'user_author_saved_count_week_1',
    'user_author_interaction_count_week_1', 'user_author_rate_week_1',
    'user_author_proportion_week_1', 'user_site_saved_count_week_4',
    'user_site_interaction_count_week_4', 'user_site_rate_week_4',
    'user_site_proportion_week_4', 'user_author_saved_count_week_2',
    'user_author_interaction_count_week_2', 'user_author_rate_week_2',
    'user_author_proportion_week_2', 'user_author_saved_count_week_3',
    'user_author_interaction_count_week_3', 'user_author_rate_week_3',
    'user_author_proportion_week_3', 'user_subscription_saved_count_week_4',
    'user_subscription_interaction_count_week_4',
    'user_subscription_rate_week_4', 'user_subscription_proportion_week_4'
]
|
||||
|
||||
def parquet_to_dataframe(file_path):
    """Load the parquet file at *file_path* into a pandas DataFrame."""
    return pq.read_table(file_path).to_pandas()
|
||||
|
||||
def load_local_raw_library_items(local_file_path='/Users/jacksonh/Downloads/data_raw_library_items_2024-03-01.parquet'):
    """Dev helper: load a raw library-items snapshot from a local parquet file.

    Generalized from a hard-coded path: the default is the original
    developer-machine download (kept for backward compatibility); pass
    *local_file_path* to load any other snapshot.

    Returns:
        pandas.DataFrame with the raw library-item rows.
    """
    return parquet_to_dataframe(local_file_path)
|
||||
|
||||
def load_tables_from_pickle(pickle_file):
    """Deserialize and return the table dict stored at *pickle_file*.

    NOTE: pickle is only safe for trusted, internally produced files.
    """
    with open(pickle_file, 'rb') as fh:
        return pickle.load(fh)
|
||||
|
||||
|
||||
def download_raw_library_items(execution_date, gcs_bucket_name):
    """Download the raw library-items parquet for *execution_date* from GCS
    and return it as a pandas DataFrame.

    NOTE(review): downloads to a fixed relative filename in the CWD, so two
    concurrent runs in the same directory would clobber each other — confirm
    this only runs single-threaded per working dir.
    """
    local_file_path = 'raw_library_items.parquet'

    client = storage.Client()
    bucket = client.bucket(gcs_bucket_name)
    blob = bucket.blob(f'data/raw/library_items_{execution_date}.parquet')
    blob.download_to_filename(local_file_path)

    df = parquet_to_dataframe(local_file_path)

    # Clean up the temporary local copy once loaded into memory.
    os.remove(local_file_path)
    return df
|
||||
|
||||
|
||||
def load_feather_files(feature_directory):
    """Read every .feather file in *feature_directory*.

    Returns a dict keyed by file name without extension; values are pyarrow
    Tables (feather.read_table does not convert to pandas).
    """
    return {
        os.path.splitext(name)[0]: feather.read_table(os.path.join(feature_directory, name))
        for name in os.listdir(feature_directory)
        if name.endswith('.feather')
    }
|
||||
|
||||
|
||||
# def save_tables_to_arrow_ipc(tables, output_file):
|
||||
# with pa.OSFile(output_file, 'wb') as sink:
|
||||
# with pa.ipc.new_stream(sink, tables[next(iter(tables))].schema) as writer:
|
||||
# for name, table in tables.items():
|
||||
# print("NAME:", name, "TABLE", table)
|
||||
# writer.write_table(table)
|
||||
|
||||
|
||||
def save_tables_to_arrow_ipc_with_schemas(tables, output_file):
    """Write several pyarrow Tables into one Arrow IPC stream at *output_file*.

    Each table's schema metadata is augmented with a b'table_name' entry so
    the tables can be told apart when the stream is read back.

    NOTE(review): the stream is opened with an empty schema (pa.schema([]));
    Arrow stream writers expect all batches to share the declared schema, so
    this likely fails for non-empty tables — confirm against the pyarrow
    version in use (this function appears unused; see save_tables_to_pickle).
    """
    with pa.OSFile(output_file, 'wb') as sink:
        with pa.ipc.new_stream(sink, pa.schema([])) as writer:
            for name, table in tables.items():
                metadata = table.schema.metadata or {}
                # Tag the table with its name so readers can identify it.
                metadata = {**metadata, b'table_name': name.encode('utf-8')}
                # Fixed: Schema.add_metadata was deprecated and removed from
                # pyarrow (1.0+); with_metadata is the supported replacement.
                schema = table.schema.with_metadata(metadata)
                print("NAME:", name, "TABLE", table)
                writer.write_table(table.replace_schema_metadata(schema.metadata))
|
||||
|
||||
|
||||
def save_tables_to_pickle(tables, output_file):
    """Serialize *tables* to *output_file* with the highest pickle protocol."""
    with open(output_file, 'wb') as sink:
        pickle.dump(tables, sink, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a GCS bucket.

    Args:
        bucket_name: target GCS bucket.
        source_file_name: local path to upload.
        destination_blob_name: object name within the bucket.
    """
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print(f'File {source_file_name} uploaded to {destination_blob_name} in bucket {bucket_name}.')
|
||||
|
||||
|
||||
def generate_and_upload_user_history(execution_date, gcs_bucket_name):
    """Build per-user interaction-history feature tables for *execution_date*
    and upload them to GCS as a single pickle of pyarrow Tables.

    Pipeline: download the raw snapshot, write per-(dimension, week) feather
    files into a temp dir, bundle them into one pickle, sanity-check it by
    re-loading, then upload to data/features/user_features.pkl.
    """
    df = download_raw_library_items(execution_date, gcs_bucket_name)
    # df = load_local_raw_library_items()  # dev shortcut: use a local snapshot
    with tempfile.TemporaryDirectory() as tmpdir:
        # Fixed: return value was bound to an unused local; the function is
        # called for its side effect (writing feather files into tmpdir).
        aggregate_user_preferences(df, tmpdir)
        dataframes = load_feather_files(tmpdir)
        filename = os.path.join(tmpdir, 'user_features.pkl')
        save_tables_to_pickle(dataframes, filename)
        # Re-load the pickle as a sanity check before uploading.
        files = load_tables_from_pickle(filename)
        print("GENERATED FEATURE TABLES:", files.keys())
        for table in files.keys():
            print("TABLE: ", table, "LEN: ", len(files[table]))
        # Fixed: dropped the pointless f-prefix on a placeholder-free literal.
        upload_to_gcs(gcs_bucket_name, filename, 'data/features/user_features.pkl')
|
||||
|
||||
|
||||
|
||||
def compute_dimension_aggregates(df, dimension, bucket_name):
    """Aggregate per-user save/interaction history along one dimension.

    Groups rows by (user_id, *dimension*), drops groups with fewer than two
    saves, and returns one row per remaining group with: saved count,
    interaction count (sum of `user_clicked`), interaction rate, and the
    group's share of the user's total interactions. Output column names embed
    the dimension and *bucket_name*.
    """
    # Keep only rows whose (user_id, dimension) pair occurs at least twice;
    # rows whose group keys contain NaN are dropped, matching groupby.size().
    group_sizes = df.groupby(['user_id', dimension])[dimension].transform('size')
    eligible = df[group_sizes >= 2]

    stats = eligible.groupby(['user_id', dimension]).agg(
        saved_count=(dimension, 'count'),
        interaction_count=('user_clicked', 'sum')
    ).reset_index()

    rate_col = f'user_{dimension}_rate_{bucket_name}'
    prop_col = f'user_{dimension}_proportion_{bucket_name}'
    stats[rate_col] = stats['interaction_count'] / stats['saved_count']
    # Share of this user's total interactions attributable to each group.
    per_user_totals = stats.groupby('user_id')['interaction_count'].transform('sum')
    stats[prop_col] = stats['interaction_count'] / per_user_totals

    return stats.rename(columns={
        'saved_count': f'user_{dimension}_saved_count_{bucket_name}',
        'interaction_count': f'user_{dimension}_interaction_count_{bucket_name}',
    })
|
||||
|
||||
def calculate_and_save_aggregates(bucket_name, bucket_df, output_dir):
    """Compute and persist per-dimension aggregates for one week bucket.

    For each history dimension, computes (user, dimension) aggregates over
    *bucket_df* and writes them to a feather file in *output_dir* named
    user_<dimension>_<bucket_name>.feather.
    """
    # Dimensions along which per-user history features are built.
    dimensions = ['author', 'site', 'original_url_host', 'subscription']
    for dimension in dimensions:
        agg_df = compute_dimension_aggregates(bucket_df, dimension, bucket_name)

        # Save the aggregated DataFrame to a Feather file
        filename = os.path.join(output_dir, f'user_{dimension}_{bucket_name}.feather')
        save_aggregated_data(agg_df, filename)
        # Fixed: the log line contained a garbled "(unknown)" where the
        # destination path placeholder belongs.
        print(f"Saved aggregated data for {dimension} in {bucket_name} to {filename}")
|
||||
|
||||
|
||||
def save_aggregated_data(df, filename):
    """Serialize *df* to Feather format and write it to *filename*.

    Renders to an in-memory buffer first, then writes the bytes out in one go.
    """
    mem = BytesIO()
    df.to_feather(mem)
    mem.seek(0)

    with open(filename, 'wb') as out:
        out.write(mem.getbuffer())
|
||||
|
||||
|
||||
def aggregate_user_preferences(df, output_dir):
    """Split saves into four weekly buckets counting back from the most recent
    save, and write per-dimension aggregates for each bucket to *output_dir*.

    Mutates *df*: the 'created_at' column is converted to datetime in place.
    NOTE(review): each window is half-open [start, end), so the single most
    recent save is excluded even from week_1 — confirm this is intended.
    """
    df['created_at'] = pd.to_datetime(df['created_at'])

    latest = df['created_at'].max()

    # week_k covers [latest - k weeks, latest - (k-1) weeks).
    buckets = {
        f'week_{k}': (latest - timedelta(weeks=k), latest - timedelta(weeks=k - 1))
        for k in (4, 3, 2, 1)
    }

    # Compute and persist aggregates for each weekly window.
    for name, (window_start, window_end) in buckets.items():
        in_window = (df['created_at'] >= window_start) & (df['created_at'] < window_end)
        calculate_and_save_aggregates(name, df[in_window], output_dir)
|
||||
|
||||
|
||||
|
||||
def create_and_upload_user_history(execution_date, num_days_history, gcs_bucket_name):
    # NOTE(review): this function appears broken/dead as written:
    #  - `buffer` is immediately overwritten on the next line, discarding the
    #    downloaded DataFrame;
    #  - `open_raw_library_items` and `upload_raw_databuffer` are not defined
    #    in this module, so calling this raises NameError;
    #  - `num_days_history` is never used.
    # generate_and_upload_user_history looks like the working entry point —
    # confirm intent, then remove or repair this function.
    buffer = download_raw_library_items(execution_date, gcs_bucket_name)
    buffer = open_raw_library_items()
    upload_raw_databuffer(buffer, execution_date, gcs_bucket_name)
|
||||
Reference in New Issue
Block a user