Feature/search highlights backend (#395)
* add highlight mappings * return highlight in resolvers * temporarily skip highlight tests * add test for getting highlights * update merge highlight * separate elastic methods * roll back merge highlight test * add highlight to elastic script * update delete highlight in elastic * migrate highlight data from postgres to elastic * rescue not found exception when page is not found in the migration script * exclude highlights in searching pages results * search pages with highlights only with has:highlight query * add search endpoint to search pages or highlights * reduce code smell in search api * fix rebase error * fix tests * add test for search highlight * add test for new search endpoint * add labels to search results * update schema * update search query * fix update/share highlights * fix rebase error * fix tests * add highlight model in elastic * add savedAt and publishedAt date range in search query * add sort by updated and recently read * fix tests * close db connection when tests are done * test github action * revert github action test * fix rebase error * add docker-compose for api-test * remove unused env * remove highlights with no page attached to * allow get_articles resolver to search for query so we can merge it without web changes
This commit is contained in:
188
packages/api/add_highlight_to_elastic.py
Normal file
188
packages/api/add_highlight_to_elastic.py
Normal file
@ -0,0 +1,188 @@
|
||||
#!/usr/bin/python
|
||||
import os
|
||||
import json
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from elasticsearch import Elasticsearch, NotFoundError
|
||||
|
||||
PG_HOST = os.getenv('PG_HOST', 'localhost')
|
||||
PG_PORT = os.getenv('PG_PORT', 5432)
|
||||
PG_USER = os.getenv('PG_USER', 'app_user')
|
||||
PG_PASSWORD = os.getenv('PG_PASSWORD', 'app_pass')
|
||||
PG_DB = os.getenv('PG_DB', 'omnivore')
|
||||
ES_URL = os.getenv('ES_URL', 'http://localhost:9200')
|
||||
ES_USERNAME = os.getenv('ES_USERNAME', 'elastic')
|
||||
ES_PASSWORD = os.getenv('ES_PASSWORD', 'password')
|
||||
UPDATE_TIME = os.getenv('UPDATE_TIME', '2019-01-01 00:00:00')
|
||||
INDEX_SETTINGS = os.getenv('INDEX_SETTINGS', 'index_settings.json')
|
||||
DATETIME_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS.MS"Z"'
|
||||
|
||||
|
||||
def update_mappings(client: Elasticsearch):
|
||||
print('updating mappings')
|
||||
try:
|
||||
with open(INDEX_SETTINGS, 'r') as f:
|
||||
settings = json.load(f)
|
||||
client.indices.put_mapping(
|
||||
index='pages_alias',
|
||||
body=settings['mappings'])
|
||||
print('mappings updated')
|
||||
except Exception as err:
|
||||
print('update mappings ERROR:', err)
|
||||
exit(1)
|
||||
|
||||
|
||||
def assertData(conn, client: Elasticsearch, pages):
|
||||
# get all users from postgres
|
||||
try:
|
||||
success = 0
|
||||
failure = 0
|
||||
skip = 0
|
||||
cursor = conn.cursor(cursor_factory=RealDictCursor)
|
||||
for page in pages:
|
||||
pageId = page['pageId']
|
||||
cursor.execute(
|
||||
f'''SELECT COUNT(*) FROM omnivore.highlight
|
||||
WHERE elastic_page_id = \'{pageId}\' AND deleted = false''')
|
||||
countInPostgres = cursor.fetchone()['count']
|
||||
try:
|
||||
countInElastic = len(client.get(
|
||||
index='pages_alias',
|
||||
id=pageId,
|
||||
_source=['highlights'])['_source']['highlights'])
|
||||
except NotFoundError as err:
|
||||
print('Elasticsearch get ERROR:', err)
|
||||
# if page is not found in elasticsearch, skip testing
|
||||
skip += 1
|
||||
continue
|
||||
|
||||
if countInPostgres == countInElastic:
|
||||
success += 1
|
||||
print(f'Page {pageId} OK')
|
||||
else:
|
||||
failure += 1
|
||||
print(
|
||||
f'Page {pageId} ERROR: postgres: {countInPostgres}, elastic: {countInElastic}')
|
||||
cursor.close()
|
||||
print(
|
||||
f'Asserted data, success: {success}, failure: {failure}, skip: {skip}')
|
||||
except Exception as err:
|
||||
print('Assert data ERROR:', err)
|
||||
exit(1)
|
||||
|
||||
|
||||
def ingest_highlights(conn, pages):
|
||||
try:
|
||||
import_count = 0
|
||||
cursor = conn.cursor(cursor_factory=RealDictCursor)
|
||||
|
||||
for page in pages:
|
||||
pageId = page['pageId']
|
||||
query = '''
|
||||
SELECT
|
||||
id,
|
||||
quote,
|
||||
prefix,
|
||||
to_char(created_at, '{DATETIME_FORMAT}') as "createdAt",
|
||||
to_char(COALESCE(updated_at, current_timestamp), '{DATETIME_FORMAT}') as "updatedAt",
|
||||
suffix,
|
||||
patch,
|
||||
annotation,
|
||||
short_id as "shortId",
|
||||
user_id as "userId",
|
||||
to_char(shared_at, '{DATETIME_FORMAT}') as "sharedAt"
|
||||
FROM omnivore.highlight
|
||||
WHERE
|
||||
elastic_page_id = \'{pageId}\'
|
||||
AND deleted = false
|
||||
AND created_at > '{UPDATE_TIME}'
|
||||
'''.format(pageId=pageId, DATETIME_FORMAT=DATETIME_FORMAT, UPDATE_TIME=UPDATE_TIME)
|
||||
|
||||
cursor.execute(query)
|
||||
result = cursor.fetchall()
|
||||
import_count += import_highlights_to_es(client, result, pageId)
|
||||
|
||||
print(f'Imported total {import_count} highlights to es')
|
||||
|
||||
cursor.close()
|
||||
except Exception as err:
|
||||
print('Export data to json ERROR:', err)
|
||||
|
||||
|
||||
def import_highlights_to_es(client, highlights, pageId) -> int:
|
||||
# import highlights to elasticsearch
|
||||
print(f'Writing {len(highlights)} highlights to page {pageId}')
|
||||
|
||||
if len(highlights) == 0:
|
||||
print('No highlights to import')
|
||||
return 0
|
||||
|
||||
try:
|
||||
resp = client.update(
|
||||
index='pages_alias',
|
||||
id=pageId,
|
||||
body={'doc': {'highlights': highlights}})
|
||||
|
||||
count = 0
|
||||
if resp['result'] == 'updated':
|
||||
count = len(highlights)
|
||||
|
||||
print(f'Added {count} highlights to page {pageId}')
|
||||
|
||||
return count
|
||||
except Exception as err:
|
||||
print('Elasticsearch update ERROR:', err)
|
||||
return 0
|
||||
|
||||
|
||||
def get_pages_with_highlights(conn):
|
||||
try:
|
||||
query = f'''
|
||||
SELECT DISTINCT
|
||||
elastic_page_id as "pageId"
|
||||
FROM omnivore.highlight
|
||||
WHERE
|
||||
elastic_page_id IS NOT NULL
|
||||
AND deleted = false
|
||||
AND created_at > '{UPDATE_TIME}'
|
||||
'''
|
||||
cursor = conn.cursor(cursor_factory=RealDictCursor)
|
||||
cursor.execute(query)
|
||||
result = cursor.fetchall()
|
||||
cursor.close()
|
||||
print('Found pages with highlights:', len(result))
|
||||
return result
|
||||
except Exception as err:
|
||||
print('Get pages with highlights ERROR:', err)
|
||||
|
||||
|
||||
print('Starting migration')
|
||||
|
||||
# test elastic client
|
||||
client = Elasticsearch(ES_URL, http_auth=(
|
||||
ES_USERNAME, ES_PASSWORD), retry_on_timeout=True)
|
||||
try:
|
||||
print('Elasticsearch client connected', client.info())
|
||||
except Exception as err:
|
||||
print('Elasticsearch client ERROR:', err)
|
||||
exit(1)
|
||||
|
||||
# test postgres client
|
||||
conn = psycopg2.connect(
|
||||
f'host={PG_HOST} port={PG_PORT} dbname={PG_DB} user={PG_USER} \
|
||||
password={PG_PASSWORD}')
|
||||
print('Postgres connection:', conn.info)
|
||||
|
||||
|
||||
update_mappings(client)
|
||||
|
||||
pages = get_pages_with_highlights(conn)
|
||||
|
||||
ingest_highlights(conn, pages)
|
||||
|
||||
assertData(conn, client, pages)
|
||||
|
||||
client.close()
|
||||
conn.close()
|
||||
|
||||
print('Migration complete')
|
||||
Reference in New Issue
Block a user