Catch error from converting string to datetime

This commit is contained in:
Hongbo Wu
2023-09-24 22:20:05 +08:00
parent 71c203cdb2
commit 475e32e859

View File

@ -111,7 +111,11 @@ def convert_string_to_uuid(val):
def convert_string_to_datetime(val):
if val is None:
return None
return datetime.strptime(val, DATE_FORMAT)
try:
return datetime.strptime(val, DATE_FORMAT)
except Exception as err:
print('Convert string to datetime ERROR:', err)
return None
async def insert_library_items(db_conn, library_items, original_ids):
@ -228,6 +232,12 @@ async def insert_recommendations(db_conn, recommendations, original_ids):
async def insert_into_postgres(insert_query, db_conn, records, original_ids):
try:
# sanitize input if input is a string
for record in records:
for i, val in enumerate(record):
if isinstance(val, str):
record[i] = sanitize_string(val)
await db_conn.executemany(insert_query, records, timeout=int(PG_TIMEOUT))
except Exception as err:
print('Batch insert into postgres ERROR:', err)
@ -258,6 +268,10 @@ async def insert_into_postgres(insert_query, db_conn, records, original_ids):
await asyncio.sleep(float(PG_COOLDOWN_TIME))
def sanitize_string(val):
return replace_surrogates(remove_null_bytes(val))
def remove_null_bytes(val):
if val is None:
return None
@ -377,8 +391,8 @@ async def main():
reading_progress_top_percent = source.get('readingProgressTopPercent', 0)
reading_progress_percent = source.get('readingProgressPercent', 0)
reading_progress_anchor = source.get('readingProgressAnchorIndex', 0)
content = replace_surrogates(remove_null_bytes(source['content']))
original_html = replace_surrogates(remove_null_bytes(source.get('originalHtml', None)))
content = source['content']
original_html = source.get('originalHtml', None)
description = source.get('description', None)
# skip item if content is larger than 1MB
@ -389,9 +403,9 @@ async def main():
library_item = (
id,
get_uuid(source['userId']),
remove_null_bytes(source['title']),
remove_null_bytes(source.get('author', None)),
remove_null_bytes(description),
source['title'],
source.get('author', None),
description,
content,
source['url'],
source.get('uploadFileId', None),
@ -408,11 +422,11 @@ async def main():
state,
updated_at,
convert_string_to_datetime(source.get('publishedAt', None)),
remove_null_bytes(source.get('language', None)),
source.get('language', None),
convert_string_to_datetime(source.get('readAt', None)),
source.get('wordsCount', None),
remove_null_bytes(source.get('siteIcon', None)),
remove_null_bytes(source.get('image', None)),
source.get('siteIcon', None),
source.get('image', None),
content_reader,
original_html,
deleted_at,
@ -457,7 +471,7 @@ async def main():
highlight_position_anchor_index if highlight_position_anchor_index is not None else 0,
highlight.get('type', 'HIGHLIGHT'),
highlight.get('color', None),
remove_null_bytes(highlight.get('html', None)),
highlight.get('html', None),
))
highlights_original_ids.append(highlight['id'])