Add initial digest-score service

This commit is contained in:
Jackson Harper
2024-05-29 14:24:59 +08:00
parent e30157bffa
commit cd842393f6
5 changed files with 1021 additions and 0 deletions

341
ml/digest-score/app.py Normal file
View File

@ -0,0 +1,341 @@
import psycopg2
import logging
from flask import Flask, request, jsonify
from pydantic import BaseModel, ConfigDict, ValidationError, conlist
from typing import List
import os
import sys
import json
import pytz
import numpy as np
import pandas as pd
import joblib
from datetime import datetime, timedelta
from datetime import datetime
import dateutil.parser
from google.cloud import storage
# Flask application object; all routes below are registered on it.
app = Flask(__name__)
# Log INFO and above to stdout so container platforms capture the stream.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
# Feature columns fed to the trained pipeline, in the order the model expects.
TRAIN_FEATURES = [
    "item_has_thumbnail",
    "item_has_site_icon",
    'user_30d_interactions_author_count',
    'user_30d_interactions_site_count',
    'user_30d_interactions_subscription_count',
    'user_30d_interactions_author_rate',
    'user_30d_interactions_site_rate',
    'user_30d_interactions_subscription_rate',
    'global_30d_interactions_site_count',
    'global_30d_interactions_author_count',
    'global_30d_interactions_subscription_count',
    'global_30d_interactions_site_rate',
    'global_30d_interactions_author_rate',
    'global_30d_interactions_subscription_rate'
]
# Postgres connection settings, sourced from the environment with local-dev defaults.
DB_PARAMS = {
    'dbname': os.getenv('DB_NAME') or 'omnivore',
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'host': os.getenv('DB_HOST') or 'localhost',
    'port': os.getenv('DB_PORT') or '5432'
}
# Maps an entity column name -> per-user feature table/view holding 30-day
# interaction aggregates; the table name doubles as the feature-column prefix.
USER_FEATURES = {
    "site": "user_30d_interactions_site",
    "author": "user_30d_interactions_author",
    "subscription": "user_30d_interactions_subscription",
}
# Same mapping for the global (all-user) aggregates.
GLOBAL_FEATURES = {
    "site": "global_30d_interactions_site",
    "author": "global_30d_interactions_author",
    "subscription": "global_30d_interactions_subscription",
}
def download_from_gcs(bucket_name, gcs_path, destination_path):
    """Download a single object from Google Cloud Storage to a local file."""
    client = storage.Client()
    blob = client.bucket(bucket_name).blob(gcs_path)
    blob.download_to_filename(destination_path)
def load_pipeline():
    """Fetch the serialized model pipeline from GCS and unpickle it.

    Bucket and object path come from the GCS_BUCKET and PIPELINE_GCS_PATH
    environment variables.
    """
    local_path = '/tmp/pipeline.pkl'
    download_from_gcs(os.getenv('GCS_BUCKET'), os.getenv('PIPELINE_GCS_PATH'), local_path)
    return joblib.load(local_path)
def load_pipeline_local():
    """Load the model pipeline from a pickle in the current working directory."""
    return joblib.load('predict_user_clicked_random_forest_pipeline-v001.pkl')
def fetch_user_features(name, feature_name):
    """Load a per-user interaction feature table from Postgres into a DataFrame.

    Args:
        name: entity column to select ('site', 'author', or 'subscription').
        feature_name: table/view to read; also used as the prefix for the
            derived '<feature_name>_count' and '<feature_name>_rate' columns.

    Returns:
        DataFrame with columns [user_id, <name>, <feature_name>_count,
        <feature_name>_rate]; missing counts and rates are filled with 0.
    """
    # NOTE: name/feature_name are interpolated into SQL, so they must only
    # ever come from the trusted USER_FEATURES mapping, never request input.
    conn = psycopg2.connect(**DB_PARAMS)
    try:
        with conn.cursor() as cur:
            cur.execute(
                f"SELECT user_id, {name}, interactions, interaction_rate FROM {feature_name}"
            )
            data = cur.fetchall()
    finally:
        # Close even when the query raises, so a transient DB error does not
        # leak a connection (the original left conn open on failure).
        conn.close()
    rate_feature_name = f"{feature_name}_rate"
    count_feature_name = f"{feature_name}_count"
    df_loaded = pd.DataFrame(data, columns=["user_id", name, "interactions", "interaction_rate"])
    df_loaded = df_loaded.rename(
        columns={"interactions": count_feature_name, "interaction_rate": rate_feature_name},
        errors="raise",
    )
    df_loaded[rate_feature_name] = df_loaded[rate_feature_name].fillna(0)
    df_loaded[count_feature_name] = df_loaded[count_feature_name].fillna(0)
    return df_loaded
def fetch_global_features(name, feature_name):
    """Load a global (all-user) interaction feature table into a DataFrame.

    Args:
        name: entity column to select ('site', 'author', or 'subscription').
        feature_name: table/view to read; also used as the prefix for the
            derived '<feature_name>_count' and '<feature_name>_rate' columns.

    Returns:
        DataFrame with columns [<name>, <feature_name>_count,
        <feature_name>_rate]; missing counts and rates are filled with 0.
    """
    # NOTE: name/feature_name are interpolated into SQL, so they must only
    # ever come from the trusted GLOBAL_FEATURES mapping, never request input.
    conn = psycopg2.connect(**DB_PARAMS)
    try:
        with conn.cursor() as cur:
            cur.execute(
                f"SELECT {name}, interactions, interaction_rate FROM {feature_name}"
            )
            data = cur.fetchall()
    finally:
        # Close even when the query raises, so a transient DB error does not
        # leak a connection (the original left conn open on failure).
        conn.close()
    rate_feature_name = f"{feature_name}_rate"
    count_feature_name = f"{feature_name}_count"
    df_loaded = pd.DataFrame(data, columns=[name, "interactions", "interaction_rate"])
    df_loaded = df_loaded.rename(
        columns={"interactions": count_feature_name, "interaction_rate": rate_feature_name},
        errors="raise",
    )
    df_loaded[rate_feature_name] = df_loaded[rate_feature_name].fillna(0)
    df_loaded[count_feature_name] = df_loaded[count_feature_name].fillna(0)
    return df_loaded
def load_user_features():
    """Fetch every per-user feature table, keyed by table name.

    Returns:
        Dict mapping feature table name -> DataFrame from fetch_user_features.
    """
    result = {}
    for view_name, key_name in USER_FEATURES.items():
        result[key_name] = fetch_user_features(view_name, key_name)
        # Lazy %-style args: the string is only formatted if INFO is enabled.
        app.logger.info("loaded %d features for %s", len(result[key_name]), key_name)
    return result
def load_global_features():
    """Fetch every global feature table, keyed by table name.

    Returns:
        Dict mapping feature table name -> DataFrame from fetch_global_features.
    """
    result = {}
    for view_name, key_name in GLOBAL_FEATURES.items():
        result[key_name] = fetch_global_features(view_name, key_name)
        # Lazy %-style args: the string is only formatted if INFO is enabled.
        app.logger.info("loaded %d features for %s", len(result[key_name]), key_name)
    return result
def compute_score(user_id, item_features):
    """Blend the model's interaction probability with a recency bonus.

    Returns a dict with the blended 'score' plus the two components,
    weighted 80% interaction / 20% recency.
    """
    alpha = 0.2  # weight of the recency bonus in the blended score
    interaction = compute_interaction_score(user_id, item_features)
    recency = compute_time_bonus_score(item_features)
    blended = (1 - alpha) * interaction + alpha * recency
    return {
        'score': blended,
        'interaction_score': interaction,
        'time_bonus_score': recency
    }
def compute_time_bonus_score(item_features):
    """Score an item's recency: 1.0 when just saved, decaying linearly to 0.0 at 72h.

    Args:
        item_features: mapping with a timezone-aware datetime under 'saved_at'.

    Returns:
        Float in [0.0, 1.0]. Items older than three days score 0.0; items with
        a future saved_at (clock skew) clamp to 1.0.
    """
    from datetime import timezone  # stdlib UTC; removes the pytz dependency
    saved_at = item_features['saved_at']
    current_time = datetime.now(timezone.utc)
    time_diff_hours = (current_time - saved_at).total_seconds() / 3600
    max_diff_hours = 3 * 24  # bonus window: three days
    if time_diff_hours >= max_diff_hours:
        return 0.0
    # min/max clamp guards against negative diffs from skewed clocks.
    return max(0.0, min(1.0, 1 - (time_diff_hours / max_diff_hours)))
def compute_interaction_score(user_id, item_features):
    """Predict the probability that the user interacts with the item.

    Builds a one-row frame from the item, left-joins the cached per-user and
    global 30-day interaction features, and runs the trained pipeline.

    Args:
        user_id: id used to join the per-user feature tables.
        item_features: mapping with 'author', 'site', 'subscription',
            'has_thumbnail', 'has_site_icon' (missing keys become None/0).

    Returns:
        Float probability of the positive ("clicked") class.
    """
    # Debug-level logging instead of print() so production stdout stays clean.
    app.logger.debug("item_features: %s", item_features)
    df_test = pd.DataFrame([{
        'user_id': user_id,
        'author': item_features.get('author'),
        'site': item_features.get('site'),
        'subscription': item_features.get('subscription'),
        'item_has_thumbnail': 1 if item_features.get('has_thumbnail') else 0,
        "item_has_site_icon": 1 if item_features.get('has_site_icon') else 0,
    }])
    # Left-join each cached feature table; unseen entities get 0 counts/rates.
    for name, feature_name in USER_FEATURES.items():
        df_feature = user_features[feature_name]
        df_test = df_test.merge(df_feature, on=['user_id', name], how='left')
        df_test[f"{feature_name}_rate"] = df_test[f"{feature_name}_rate"].fillna(0)
        df_test[f"{feature_name}_count"] = df_test[f"{feature_name}_count"].fillna(0)
    for name, feature_name in GLOBAL_FEATURES.items():
        df_feature = global_features[feature_name]
        df_test = df_test.merge(df_feature, on=name, how='left')
        df_test[f"{feature_name}_rate"] = df_test[f"{feature_name}_rate"].fillna(0)
        df_test[f"{feature_name}_count"] = df_test[f"{feature_name}_count"].fillna(0)
    # Select (and order) exactly the columns the model was trained on.
    df_predict = df_test[TRAIN_FEATURES]
    interaction_score = pipeline.predict_proba(df_predict)
    return interaction_score[0][1]
def get_library_item(library_item_id):
    """Fetch the scoring-relevant columns for one library item.

    Args:
        library_item_id: primary key of omnivore.library_item.

    Returns:
        Dict keyed by column name, or None when the id is unknown.
    """
    query = """
    SELECT
        li.title,
        li.author,
        li.saved_at,
        li.site_name as site,
        li.item_language as language,
        li.subscription,
        li.word_count,
        li.directionality,
        CASE WHEN li.thumbnail IS NOT NULL then 1 else 0 END as has_thumbnail,
        CASE WHEN li.site_icon IS NOT NULL then 1 else 0 END as has_site_icon
    FROM omnivore.library_item li
    WHERE li.id = %s
    """
    conn = psycopg2.connect(**DB_PARAMS)
    try:
        with conn.cursor() as cur:
            cur.execute(query, (library_item_id,))
            data = cur.fetchone()
            columns = [desc[0] for desc in cur.description]
    finally:
        # Close even when the query raises, so a bad id / DB hiccup does not
        # leak a connection (the original left conn open on failure).
        conn.close()
    if data is None:
        return None
    return dict(zip(columns, data))
@app.route('/_ah/health', methods=['GET'])
def ready():
    """Health-check endpoint polled by the hosting platform."""
    body = jsonify({'OK': 'yes'})
    return body, 200
@app.route('/users/<user_id>/features', methods=['GET'])
def get_user_features(user_id):
    """Return the cached 30-day interaction features for one user as JSON.

    Tables with no rows for this user are omitted from the response.
    """
    result = {}
    for name, feature_name in USER_FEATURES.items():
        rate_feature_name = f"{feature_name}_rate"
        count_feature_name = f"{feature_name}_count"
        df_feature = user_features[feature_name]
        df_filtered = df_feature[df_feature['user_id'] == user_id]
        if df_filtered.empty:
            continue
        result[feature_name] = {
            'rate': df_filtered[[name, rate_feature_name]].dropna().to_dict(orient='records'),
            'count': df_filtered[[name, count_feature_name]].dropna().to_dict(orient='records'),
        }
    return jsonify(result), 200
@app.route('/users/<user_id>/library_items/<library_item_id>/score', methods=['GET'])
def get_library_item_score(user_id, library_item_id):
    """Score one stored library item for a user.

    Returns {'score': {...}} or a 404 when the item id is unknown.
    """
    item_features = get_library_item(library_item_id)
    if item_features is None:
        # Previously None flowed into compute_score and crashed with a 500.
        return jsonify({'error': 'library item not found'}), 404
    score = compute_score(user_id, item_features)
    return jsonify({'score': score})
@app.route('/predict', methods=['POST'])
def predict():
    """Score a single item from request-supplied features.

    Expects JSON {'user_id': ..., 'item_features': {..., 'saved_at': ISO-8601}}.
    Returns {'score': {...}} on success, 400 for missing fields, 500 otherwise.
    """
    try:
        data = request.get_json()
        app.logger.info("predict scoring request: %s", data)
        # Validate before touching item_features: the original parsed
        # saved_at first, turning missing-field client errors into 500s.
        user_id = data.get('user_id')
        if user_id is None:
            return jsonify({'error': 'Missing user_id'}), 400
        item_features = data.get('item_features')
        if item_features is None:
            return jsonify({'error': 'Missing item_features'}), 400
        item_features['saved_at'] = dateutil.parser.isoparse(item_features['saved_at'])
        score = compute_score(user_id, item_features)
        return jsonify({'score': score})
    except Exception as e:
        app.logger.error(f"exception in predict endpoint: {request.get_json()}\n{e}")
        return jsonify({'error': str(e)}), 500
@app.route('/batch', methods=['POST'])
def batch():
    """Score a batch of items for one user.

    Expects JSON {'user_id': ..., 'items': {key: {'library_item_id': ...,
    'saved_at': ISO-8601, ...}}}. Returns a dict keyed by library_item_id,
    400 for missing fields, 500 otherwise.
    """
    try:
        result = {}
        data = request.get_json()
        app.logger.info("batch scoring request: %s", data)
        user_id = data.get('user_id')
        items = data.get('items')
        if user_id is None:
            return jsonify({'error': 'Missing user_id'}), 400
        if items is None:
            # Previously items=None raised AttributeError and surfaced as 500.
            return jsonify({'error': 'Missing items'}), 400
        for key, item in items.items():
            # Debug logging instead of the original stray print() calls.
            app.logger.debug("batch item %s: %s", key, item)
            library_item_id = item['library_item_id']
            item['saved_at'] = dateutil.parser.isoparse(item['saved_at'])
            result[library_item_id] = compute_score(user_id, item)
        return jsonify(result)
    except Exception as e:
        app.logger.error(f"exception in batch endpoint: {request.get_json()}\n{e}")
        return jsonify({'error': str(e)}), 500
# --- Module initialization (runs at import time, e.g. under gunicorn) ---
# LOAD_LOCAL_MODEL switches to a pickle in the working directory; otherwise
# the pipeline is downloaded from GCS.
if os.getenv('LOAD_LOCAL_MODEL'):
    pipeline = load_pipeline_local()
else:
    pipeline = load_pipeline()
# Feature tables are loaded from Postgres once at startup and served from
# memory for the lifetime of the process.
user_features = load_user_features()
global_features = load_global_features()
if __name__ == '__main__':
    # Flask debug server: local development only, never production.
    app.run(debug=True, port=5000)