truncate long description to 1MB

2023-09-23 15:06:25 +08:00
parent 85febfa8b6
commit 323be5376a
1 changed files with 10 additions and 7 deletions
--- a/packages/db/elastic_migrations/migrate_from_elastic.py
+++ b/packages/db/elastic_migrations/migrate_from_elastic.py
@ -315,8 +315,6 @@ async def main():
                }
            ]
        }
-        # get current time
-        migrated_at = datetime.utcnow().strftime(DATE_FORMAT)
        # Scan API for larger library
        docs = async_scan(es_client, index=ES_INDEX, query=query,
                          preserve_order=True, size=ES_SCAN_SIZE,
@ -343,19 +341,24 @@ async def main():
            reading_progress_top_percent = source.get('readingProgressTopPercent', 0)
            reading_progress_percent = source.get('readingProgressPercent', 0)
            reading_progress_anchor = source.get('readingProgressAnchorIndex', 0)
-
+            content = source['content']
            # skip item if content is larger than 1MB
-            if len(source['content']) > 1000000:
+            if len(content) > 1000000:
                print('Skipping item', doc['_id'], 'because content is larger than 1MB')
                continue

+            # truncate description to 1MB characters
+            description = source.get('description', None)
+            if description is not None and len(description) > 1000000:
+                description = description[:1000000]
+
            library_item = (
                id,
                get_uuid(source['userId']),
                source['title'],
                source.get('author', None),
-                source.get('description', None),
-                source['content'],
+                description,
+                content,
                source['url'],
                source.get('uploadFileId', None),
                page_type if page_type is not None else 'UNKNOWN',
@ -466,7 +469,7 @@ async def main():
        if len(recommendations) > 0:
            await insert_recommendations(db_conn, recommendations)

-        print('Migration complete', migrated_at)
+        print('Migration complete', END_TIME)

        await assert_data(db_conn, es_client, users, uploaded_files)
    except Exception as err: