Update digest-score to run new model

Jackson Harper
2024-06-20 15:49:12 +08:00
parent 99c95194d0
commit 17e5978792
9 changed files with 638 additions and 465 deletions

@@ -0,0 +1,109 @@
# extract and upload raw data used for feature generation
import psycopg2
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text
from datetime import datetime, timedelta
import os
from io import BytesIO
import tempfile
import pyarrow as pa
import pyarrow.parquet as pq
from urllib.parse import urlparse
from google.cloud import storage

DB_PARAMS = {
    'dbname': os.getenv('DB_NAME') or 'omnivore',
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'host': os.getenv('DB_HOST') or 'localhost',
    'port': os.getenv('DB_PORT') or '5432'
}

def extract_host(url):
    try:
        return urlparse(url).netloc
    except Exception:
        return None

def fetch_raw_data(date_str, num_days_history):
    end_date = pd.to_datetime(date_str)
    start_date = end_date - timedelta(days=num_days_history)
    start_date_str = start_date.strftime('%Y-%m-%d 00:00:00')
    end_date_str = end_date.strftime('%Y-%m-%d 23:59:59')
    conn_str = f"postgresql://{DB_PARAMS['user']}:{DB_PARAMS['password']}@{DB_PARAMS['host']}:{DB_PARAMS['port']}/{DB_PARAMS['dbname']}"
    # conn_str = f"postgresql://{DB_PARAMS['host']}:{DB_PARAMS['port']}/{DB_PARAMS['dbname']}"
    engine = create_engine(conn_str)
    query = text("""
        SELECT
            li.id as library_item_id,
            li.user_id,
            li.created_at,
            li.archived_at,
            li.deleted_at,
            CASE WHEN li.folder = 'inbox' then 1 else 0 END as inbox_folder,
            li.item_type,
            li.item_language AS language,
            li.content_reader,
            li.word_count as item_word_count,
            CASE WHEN li.thumbnail IS NOT NULL then 1 else 0 END as item_has_thumbnail,
            CASE WHEN li.site_icon IS NOT NULL then 1 else 0 END as item_has_site_icon,
            li.original_url,
            li.site_name AS site,
            li.author,
            li.subscription,
            sub.type as subscription_type,
            sub.created_at as subscription_start_date,
            sub.count as subscription_count,
            sub.auto_add_to_library as subscription_auto_add_to_library,
            sub.fetch_content as subscription_fetch_content,
            sub.folder as subscription_folder,
            CASE WHEN li.read_at is not NULL then 1 else 0 END as user_clicked,
            CASE WHEN li.reading_progress_bottom_percent > 10 THEN 1 ELSE 0 END AS user_read,
            CASE WHEN li.reading_progress_bottom_percent > 50 THEN 1 ELSE 0 END AS user_long_read
        FROM omnivore.library_item AS li
        LEFT JOIN omnivore.subscriptions sub on li.subscription = sub.name AND sub.user_id = li.user_id
        WHERE li.created_at >= :start_date AND li.created_at <= :end_date;
    """)
    chunk_size = 100000  # Adjust based on available memory and performance needs
    with tempfile.TemporaryDirectory() as tmpdir:
        parquet_files = []
        with engine.connect() as conn:
            for i, chunk in enumerate(pd.read_sql(query, conn, params={'start_date': start_date_str, 'end_date': end_date_str}, chunksize=chunk_size)):
                chunk['library_item_id'] = chunk['library_item_id'].astype(str)
                chunk['user_id'] = chunk['user_id'].astype(str)
                chunk['original_url_host'] = chunk['original_url'].apply(extract_host)
                parquet_file = os.path.join(tmpdir, f'chunk_{i}.parquet')
                chunk.to_parquet(parquet_file)
                parquet_files.append(parquet_file)
        concatenated_df = pd.concat([pd.read_parquet(file) for file in parquet_files], ignore_index=True)
        parquet_buffer = BytesIO()
        table = pa.Table.from_pandas(concatenated_df)
        pq.write_table(table, parquet_buffer)
        parquet_buffer.seek(0)
        return parquet_buffer

def upload_raw_databuffer(feather_buffer, execution_date, gcs_bucket_name):
    client = storage.Client()
    bucket = client.bucket(gcs_bucket_name)
    blob = bucket.blob(f'data/raw/library_items_{execution_date}.parquet')
    blob.upload_from_file(feather_buffer, content_type='application/octet-stream')
    print("Data stored successfully.")

def extract_and_upload_raw_data(execution_date, num_days_history, gcs_bucket_name):
    buffer = fetch_raw_data(execution_date, int(num_days_history))
    upload_raw_databuffer(buffer, execution_date, gcs_bucket_name)
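
A minimal invocation sketch for this extraction step, assuming the DB_* environment variables point at the Omnivore Postgres instance and Google Cloud credentials are configured; the date and bucket name are placeholders (28 days of history matches the four weekly buckets built downstream):

# hypothetical usage, not part of the committed file
if __name__ == '__main__':
    # pull the 28 days of library items ending on the execution date and
    # upload them as data/raw/library_items_<execution_date>.parquet
    extract_and_upload_raw_data('2024-06-20', 28, 'your-feature-store-bucket')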

@@ -0,0 +1,235 @@
# download raw user data, aggregate user history, and upload to GCS
import psycopg2
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text
from datetime import datetime, timedelta
import os
from io import BytesIO
import tempfile
import pickle
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.feather as feather
from google.cloud import storage

FEATURE_COLUMNS = [
    # targets
    # 'user_clicked', 'user_read', 'user_long_read',
    # item attributes / user setup attributes
    'item_word_count', 'item_has_site_icon', 'is_subscription',
    'inbox_folder', 'has_author',
    # how the user has set up the subscription
    'is_newsletter', 'is_feed', 'days_since_subscribed',
    'subscription_count', 'subscription_auto_add_to_library',
    'subscription_fetch_content',
    # user/item interaction history
    'user_original_url_host_saved_count_week_1',
    'user_original_url_host_interaction_count_week_1',
    'user_original_url_host_rate_week_1',
    'user_original_url_host_proportion_week_1',
    'user_original_url_host_saved_count_week_2',
    'user_original_url_host_interaction_count_week_2',
    'user_original_url_host_rate_week_2',
    'user_original_url_host_proportion_week_2',
    'user_original_url_host_saved_count_week_3',
    'user_original_url_host_interaction_count_week_3',
    'user_original_url_host_rate_week_3',
    'user_original_url_host_proportion_week_3',
    'user_original_url_host_saved_count_week_4',
    'user_original_url_host_interaction_count_week_4',
    'user_original_url_host_rate_week_4',
    'user_original_url_host_proportion_week_4',
    'user_subscription_saved_count_week_1',
    'user_subscription_interaction_count_week_1',
    'user_subscription_rate_week_1', 'user_subscription_proportion_week_1',
    'user_site_saved_count_week_3', 'user_site_interaction_count_week_3',
    'user_site_rate_week_3', 'user_site_proportion_week_3',
    'user_site_saved_count_week_2', 'user_site_interaction_count_week_2',
    'user_site_rate_week_2', 'user_site_proportion_week_2',
    'user_subscription_saved_count_week_2',
    'user_subscription_interaction_count_week_2',
    'user_subscription_rate_week_2', 'user_subscription_proportion_week_2',
    'user_site_saved_count_week_1', 'user_site_interaction_count_week_1',
    'user_site_rate_week_1', 'user_site_proportion_week_1',
    'user_subscription_saved_count_week_3',
    'user_subscription_interaction_count_week_3',
    'user_subscription_rate_week_3', 'user_subscription_proportion_week_3',
    'user_author_saved_count_week_4',
    'user_author_interaction_count_week_4', 'user_author_rate_week_4',
    'user_author_proportion_week_4', 'user_author_saved_count_week_1',
    'user_author_interaction_count_week_1', 'user_author_rate_week_1',
    'user_author_proportion_week_1', 'user_site_saved_count_week_4',
    'user_site_interaction_count_week_4', 'user_site_rate_week_4',
    'user_site_proportion_week_4', 'user_author_saved_count_week_2',
    'user_author_interaction_count_week_2', 'user_author_rate_week_2',
    'user_author_proportion_week_2', 'user_author_saved_count_week_3',
    'user_author_interaction_count_week_3', 'user_author_rate_week_3',
    'user_author_proportion_week_3', 'user_subscription_saved_count_week_4',
    'user_subscription_interaction_count_week_4',
    'user_subscription_rate_week_4', 'user_subscription_proportion_week_4'
]

def parquet_to_dataframe(file_path):
    table = pq.read_table(file_path)
    df = table.to_pandas()
    return df

def load_local_raw_library_items():
    local_file_path = '/Users/jacksonh/Downloads/data_raw_library_items_2024-03-01.parquet'
    df = parquet_to_dataframe(local_file_path)
    return df

def load_tables_from_pickle(pickle_file):
    with open(pickle_file, 'rb') as handle:
        tables = pickle.load(handle)
    return tables

def download_raw_library_items(execution_date, gcs_bucket_name):
    local_file_path = 'raw_library_items.parquet'
    client = storage.Client()
    bucket = client.bucket(gcs_bucket_name)
    blob = bucket.blob(f'data/raw/library_items_{execution_date}.parquet')
    blob.download_to_filename(local_file_path)
    df = parquet_to_dataframe(local_file_path)
    os.remove(local_file_path)
    return df

def load_feather_files(feature_directory):
    dataframes = {}
    for file_name in os.listdir(feature_directory):
        if file_name.endswith('.feather'):
            file_path = os.path.join(feature_directory, file_name)
            df_name = os.path.splitext(file_name)[0]  # Use the file name (without extension) as key
            table = feather.read_table(file_path)
            dataframes[df_name] = table
    return dataframes

# def save_tables_to_arrow_ipc(tables, output_file):
#     with pa.OSFile(output_file, 'wb') as sink:
#         with pa.ipc.new_stream(sink, tables[next(iter(tables))].schema) as writer:
#             for name, table in tables.items():
#                 print("NAME:", name, "TABLE", table)
#                 writer.write_table(table)

def save_tables_to_arrow_ipc_with_schemas(tables, output_file):
    with pa.OSFile(output_file, 'wb') as sink:
        with pa.ipc.new_stream(sink, pa.schema([])) as writer:
            for name, table in tables.items():
                # Tag each table's schema with its name so it can be recovered on read
                metadata = table.schema.metadata or {}
                metadata = {**metadata, b'table_name': name.encode('utf-8')}
                schema = table.schema.with_metadata(metadata)  # with_metadata replaces add_metadata, which newer pyarrow no longer provides
                print("NAME:", name, "TABLE", table)
                writer.write_table(table.replace_schema_metadata(schema.metadata))

def save_tables_to_pickle(tables, output_file):
    with open(output_file, 'wb') as handle:
        pickle.dump(tables, handle, protocol=pickle.HIGHEST_PROTOCOL)

def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print(f'File {source_file_name} uploaded to {destination_blob_name} in bucket {bucket_name}.')

def generate_and_upload_user_history(execution_date, gcs_bucket_name):
    df = download_raw_library_items(execution_date, gcs_bucket_name)
    # df = load_local_raw_library_items()
    with tempfile.TemporaryDirectory() as tmpdir:
        user_preferences = aggregate_user_preferences(df, tmpdir)
        dataframes = load_feather_files(tmpdir)
        filename = os.path.join(tmpdir, 'user_features.pkl')
        save_tables_to_pickle(dataframes, filename)
        files = load_tables_from_pickle(filename)
        print("GENERATED FEATURE TABLES:", files.keys())
        for table in files.keys():
            print("TABLE: ", table, "LEN: ", len(files[table]))
        upload_to_gcs(gcs_bucket_name, filename, 'data/features/user_features.pkl')

def compute_dimension_aggregates(df, dimension, bucket_name):
    # Compute initial aggregates to filter out items with fewer than 2 saved counts
    initial_agg = df.groupby(['user_id', dimension]).size().reset_index(name='count')
    filtered_df = df[df.set_index(['user_id', dimension]).index.isin(initial_agg[initial_agg['count'] >= 2].set_index(['user_id', dimension]).index)]
    agg = filtered_df.groupby(['user_id', dimension]).agg(
        saved_count=(dimension, 'count'),
        interaction_count=('user_clicked', 'sum')
    ).reset_index()
    agg[f'user_{dimension}_rate_{bucket_name}'] = agg['interaction_count'] / agg['saved_count']
    agg[f'user_{dimension}_proportion_{bucket_name}'] = agg.groupby('user_id')['interaction_count'].transform(lambda x: x / x.sum())
    agg = agg.rename(columns={
        'saved_count': f'user_{dimension}_saved_count_{bucket_name}',
        'interaction_count': f'user_{dimension}_interaction_count_{bucket_name}'
    })
    return agg

def calculate_and_save_aggregates(bucket_name, bucket_df, output_dir):
    # Compute aggregates for each dimension
    dimensions = ['author', 'site', 'original_url_host', 'subscription']
    for dimension in dimensions:
        agg_df = compute_dimension_aggregates(bucket_df, dimension, bucket_name)
        # Save the aggregated DataFrame to a Feather file
        filename = os.path.join(output_dir, f'user_{dimension}_{bucket_name}.feather')
        save_aggregated_data(agg_df, filename)
        print(f"Saved aggregated data for {dimension} in {bucket_name} to {filename}")

def save_aggregated_data(df, filename):
    buffer = BytesIO()
    df.to_feather(buffer)
    buffer.seek(0)
    with open(filename, 'wb') as f:
        f.write(buffer.getbuffer())

def aggregate_user_preferences(df, output_dir):
    # Convert 'created_at' to datetime
    df['created_at'] = pd.to_datetime(df['created_at'])
    end_date = df['created_at'].max()
    # Define bucket ranges for the past four weeks
    buckets = {
        'week_4': (end_date - timedelta(weeks=4), end_date - timedelta(weeks=3)),
        'week_3': (end_date - timedelta(weeks=3), end_date - timedelta(weeks=2)),
        'week_2': (end_date - timedelta(weeks=2), end_date - timedelta(weeks=1)),
        'week_1': (end_date - timedelta(weeks=1), end_date)
    }
    # Calculate aggregates for each bucket and save to file
    for bucket_name, (bucket_start, bucket_end) in buckets.items():
        bucket_df = df[(df['created_at'] >= bucket_start) & (df['created_at'] < bucket_end)]
        calculate_and_save_aggregates(bucket_name, bucket_df, output_dir)

def create_and_upload_user_history(execution_date, num_days_history, gcs_bucket_name):
    # Wrapper entry point; delegates to generate_and_upload_user_history
    # (num_days_history is currently unused here).
    generate_and_upload_user_history(execution_date, gcs_bucket_name)
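
For local inspection, the uploaded feature tables can be pulled back down and opened with load_tables_from_pickle; a minimal sketch, assuming the same placeholder bucket and configured Google Cloud credentials:

# hypothetical helper, not part of the committed file
from google.cloud import storage

def download_user_features(gcs_bucket_name, local_path='user_features.pkl'):
    # mirrors the blob path written by generate_and_upload_user_history
    client = storage.Client()
    bucket = client.bucket(gcs_bucket_name)
    bucket.blob('data/features/user_features.pkl').download_to_filename(local_path)
    return load_tables_from_pickle(local_path)

tables = download_user_features('your-feature-store-bucket')  # placeholder bucket
for name, table in tables.items():
    print(name, len(table))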