From 5e79529e11788d6b34bd8dd53dae3a03a9ae4fe6 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 4 Oct 2022 17:40:21 +0800 Subject: [PATCH 1/5] Add sentence-level speech marks --- packages/text-to-speech/src/textToSpeech.ts | 24 ++++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/packages/text-to-speech/src/textToSpeech.ts b/packages/text-to-speech/src/textToSpeech.ts index 2c72a4852..4426c6f19 100644 --- a/packages/text-to-speech/src/textToSpeech.ts +++ b/packages/text-to-speech/src/textToSpeech.ts @@ -1,8 +1,10 @@ import { CancellationDetails, CancellationReason, + PropertyId, ResultReason, SpeechConfig, + SpeechSynthesisBoundaryType, SpeechSynthesisOutputFormat, SpeechSynthesisResult, SpeechSynthesizer, @@ -30,7 +32,7 @@ export interface SpeechMark { start?: number length?: number word: string - type: 'word' | 'bookmark' + type: 'word' | 'bookmark' | 'punctuation' | 'sentence' } export const synthesizeTextToSpeech = async ( @@ -47,6 +49,11 @@ export const synthesizeTextToSpeech = async ( ) speechConfig.speechSynthesisOutputFormat = SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3 + // Required for sentence-level WordBoundary events + speechConfig.setProperty( + PropertyId.SpeechServiceResponse_RequestSentenceBoundary, + 'true' + ) // Create the speech synthesizer. const synthesizer = new SpeechSynthesizer(speechConfig) @@ -87,13 +94,14 @@ export const synthesizeTextToSpeech = async ( // The unit of e.audioOffset is tick (1 tick = 100 nanoseconds), divide by 10,000 to convert to milliseconds. synthesizer.wordBoundary = (s, e) => { - speechMarks.push({ - word: e.text, - time: (timeOffset + e.audioOffset) / 10000, - start: wordOffset + e.textOffset, - length: e.wordLength, - type: 'word', - }) + e.boundaryType === SpeechSynthesisBoundaryType.Sentence && + speechMarks.push({ + word: e.text, + time: (timeOffset + e.audioOffset) / 10000, + start: wordOffset + e.textOffset, + length: e.wordLength, + type: 'sentence', + }) } synthesizer.bookmarkReached = (s, e) => { From 034f8335294d4b971cec20912d341cadeee3f74c Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 4 Oct 2022 19:23:26 +0800 Subject: [PATCH 2/5] Calculate the length of each sentence in speech marks --- packages/text-to-speech/src/textToSpeech.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/text-to-speech/src/textToSpeech.ts b/packages/text-to-speech/src/textToSpeech.ts index 4426c6f19..7d9dc0daf 100644 --- a/packages/text-to-speech/src/textToSpeech.ts +++ b/packages/text-to-speech/src/textToSpeech.ts @@ -99,7 +99,7 @@ export const synthesizeTextToSpeech = async ( word: e.text, time: (timeOffset + e.audioOffset) / 10000, start: wordOffset + e.textOffset, - length: e.wordLength, + length: e.text.length, type: 'sentence', }) } From edd1b908ee22f45ee9b60cc4a0414a83a20353d0 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Wed, 5 Oct 2022 09:05:46 +0800 Subject: [PATCH 3/5] Fix tests --- .../content-handler/test/newsletter.test.ts | 95 +++++++++++-------- 1 file changed, 57 insertions(+), 38 deletions(-) diff --git a/packages/content-handler/test/newsletter.test.ts b/packages/content-handler/test/newsletter.test.ts index 90197b00d..ff9396a12 100644 --- a/packages/content-handler/test/newsletter.test.ts +++ b/packages/content-handler/test/newsletter.test.ts @@ -21,29 +21,6 @@ const load = (path: string): string => { } describe('Newsletter email test', () => { - before(() => { - nock('https://email.mg2.substack.com') - .head( - '/c/eJxNkk2TojAQhn-N3KTyQfg4cGDGchdnYcsZx9K5UCE0EMVAkTiKv36iHnarupNUd7rfVJ4W3EDTj1M89No496Uw0wCxgovuwBgYnbOGsZBVjDHzKPWYU8VehUMWOlIX9Qhw4rKLzXgGZziXnRTcyF7dK0iIGMVOG_OS1aTmKPRDilgVhTQUPCQIcE0x-MFTmJ8rCUpA3KtuenR2urg1ZtAzmszI0tq_Z7m66y-ilQo0uAqMTQ7WRX8auJKg56blZg7WB-iHDuYEBzO6NP0R1IwuYFphQbbTjnTH9NBfs80nym4Zyj8uUvyKbtUyGr5eUz9fNDQ7JCxfJDo9dW1lY9lmj_JNivPbGmf2Pt_lN9tDit9b-WeTetni85Z9pDpVOd7L1E_Vy7egayNO23ZP34eSeLJeux1b0rer_xaZ7ykS78nuSjMY-nL98rparNZNcv07JCjN06_EkTFBxBqOUMACErnELUNMSxTUjLDQZwzcqa4bRjCfeejUEFefS224OLr2S5wxPtij7lVrs80d2CNseRV2P52VNFMBipcdVE-U5jkRD7hFAwpGOylVwU2Mfc9qBh7DoR89yVnWXhgQFHnIsbpVb6tU_B-hH_2yzWY' - ) - .reply(302, undefined, { - Location: - 'https://newsletter.slowchinese.net/p/companies-that-eat-people-217', - }) - .get('/p/companies-that-eat-people-217') - .reply(200, '') - - nock('https://u23463625.ct.sendgrid.net') - .head( - '/ss/c/AX1lEgEQaxtvFxLaVo0GBo_geajNrlI1TGeIcmMViR3pL3fEDZnbbkoeKcaY62QZk0KPFudUiUXc_uMLerV4nA/3k5/3TFZmreTR0qKSCgowABnVg/h30/zzLik7UXd1H_n4oyd5W8Xu639AYQQB2UXz-CsssSnno' - ) - .reply(302, undefined, { - Location: 'https://www.milkroad.com/p/talked-guy-spent-30m-beeple', - }) - .get('/p/talked-guy-spent-30m-beeple') - .reply(200, '') - }) - describe('#getNewsletterUrl()', () => { it('returns url when email is from SubStack', async () => { const rawUrl = '' @@ -162,21 +139,63 @@ describe('Newsletter email test', () => { }) describe('findNewsletterUrl', async () => { - it('gets the URL from the header if it is a substack newsletter', async () => { - const html = load('./test/data/substack-forwarded-newsletter.html') - const url = await new SubstackHandler().findNewsletterUrl(html) - // Not sure if the redirects from substack expire, this test could eventually fail - expect(url).to.startWith( - 'https://newsletter.slowchinese.net/p/companies-that-eat-people-217' - ) - }).timeout(10000) - it('gets the URL from the header if it is a beehiiv newsletter', async () => { - const html = load('./test/data/beehiiv-newsletter.html') - const url = await new BeehiivHandler().findNewsletterUrl(html) - expect(url).to.startWith( - 'https://www.milkroad.com/p/talked-guy-spent-30m-beeple' - ) - }).timeout(10000) + context('when email is from Substack', () => { + let scope: nock.Scope + + before(() => { + scope = nock('https://email.mg2.substack.com') + .head( + '/c/eJxNkk2TojAQhn-N3KTyQfg4cGDGchdnYcsZx9K5UCE0EMVAkTiKv36iHnarupNUd7rfVJ4W3EDTj1M89No496Uw0wCxgovuwBgYnbOGsZBVjDHzKPWYU8VehUMWOlIX9Qhw4rKLzXgGZziXnRTcyF7dK0iIGMVOG_OS1aTmKPRDilgVhTQUPCQIcE0x-MFTmJ8rCUpA3KtuenR2urg1ZtAzmszI0tq_Z7m66y-ilQo0uAqMTQ7WRX8auJKg56blZg7WB-iHDuYEBzO6NP0R1IwuYFphQbbTjnTH9NBfs80nym4Zyj8uUvyKbtUyGr5eUz9fNDQ7JCxfJDo9dW1lY9lmj_JNivPbGmf2Pt_lN9tDit9b-WeTetni85Z9pDpVOd7L1E_Vy7egayNO23ZP34eSeLJeux1b0rer_xaZ7ykS78nuSjMY-nL98rparNZNcv07JCjN06_EkTFBxBqOUMACErnELUNMSxTUjLDQZwzcqa4bRjCfeejUEFefS224OLr2S5wxPtij7lVrs80d2CNseRV2P52VNFMBipcdVE-U5jkRD7hFAwpGOylVwU2Mfc9qBh7DoR89yVnWXhgQFHnIsbpVb6tU_B-hH_2yzWY' + ) + .reply(302, undefined, { + Location: + 'https://newsletter.slowchinese.net/p/companies-that-eat-people-217', + }) + .get('/p/companies-that-eat-people-217') + .reply(200, '') + }) + after(() => { + scope.done() + }) + + it('gets the URL from the header', async () => { + const html = load('./test/data/substack-forwarded-newsletter.html') + const url = await new SubstackHandler().findNewsletterUrl(html) + // Not sure if the redirects from substack expire, this test could eventually fail + expect(url).to.startWith( + 'https://newsletter.slowchinese.net/p/companies-that-eat-people-217' + ) + }) + }) + + context('when email is from beehiiv', () => { + let scope: nock.Scope + + before(() => { + scope = nock('https://u23463625.ct.sendgrid.net') + .head( + '/ss/c/AX1lEgEQaxtvFxLaVo0GBo_geajNrlI1TGeIcmMViR3pL3fEDZnbbkoeKcaY62QZk0KPFudUiUXc_uMLerV4nA/3k5/3TFZmreTR0qKSCgowABnVg/h30/zzLik7UXd1H_n4oyd5W8Xu639AYQQB2UXz-CsssSnno' + ) + .reply(302, undefined, { + Location: 'https://www.milkroad.com/p/talked-guy-spent-30m-beeple', + }) + .get('/p/talked-guy-spent-30m-beeple') + .reply(200, '') + }) + + after(() => { + scope.done() + }) + + it('gets the URL from the header', async () => { + const html = load('./test/data/beehiiv-newsletter.html') + const url = await new BeehiivHandler().findNewsletterUrl(html) + expect(url).to.startWith( + 'https://www.milkroad.com/p/talked-guy-spent-30m-beeple' + ) + }) + }) + it('returns undefined if it is not a newsletter', async () => { const html = load('./test/data/substack-forwarded-welcome-email.html') const url = await new SubstackHandler().findNewsletterUrl(html) From c2a44f49911b8bfd460c810d844ee648a5827ee5 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Wed, 5 Oct 2022 10:04:41 +0800 Subject: [PATCH 4/5] Restore HTTP interceptor to the normal unmocked behaviour after testing --- packages/content-handler/test/newsletter.test.ts | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/packages/content-handler/test/newsletter.test.ts b/packages/content-handler/test/newsletter.test.ts index ff9396a12..ea97a0421 100644 --- a/packages/content-handler/test/newsletter.test.ts +++ b/packages/content-handler/test/newsletter.test.ts @@ -140,10 +140,8 @@ describe('Newsletter email test', () => { describe('findNewsletterUrl', async () => { context('when email is from Substack', () => { - let scope: nock.Scope - before(() => { - scope = nock('https://email.mg2.substack.com') + nock('https://email.mg2.substack.com') .head( '/c/eJxNkk2TojAQhn-N3KTyQfg4cGDGchdnYcsZx9K5UCE0EMVAkTiKv36iHnarupNUd7rfVJ4W3EDTj1M89No496Uw0wCxgovuwBgYnbOGsZBVjDHzKPWYU8VehUMWOlIX9Qhw4rKLzXgGZziXnRTcyF7dK0iIGMVOG_OS1aTmKPRDilgVhTQUPCQIcE0x-MFTmJ8rCUpA3KtuenR2urg1ZtAzmszI0tq_Z7m66y-ilQo0uAqMTQ7WRX8auJKg56blZg7WB-iHDuYEBzO6NP0R1IwuYFphQbbTjnTH9NBfs80nym4Zyj8uUvyKbtUyGr5eUz9fNDQ7JCxfJDo9dW1lY9lmj_JNivPbGmf2Pt_lN9tDit9b-WeTetni85Z9pDpVOd7L1E_Vy7egayNO23ZP34eSeLJeux1b0rer_xaZ7ykS78nuSjMY-nL98rparNZNcv07JCjN06_EkTFBxBqOUMACErnELUNMSxTUjLDQZwzcqa4bRjCfeejUEFefS224OLr2S5wxPtij7lVrs80d2CNseRV2P52VNFMBipcdVE-U5jkRD7hFAwpGOylVwU2Mfc9qBh7DoR89yVnWXhgQFHnIsbpVb6tU_B-hH_2yzWY' ) @@ -155,7 +153,7 @@ describe('Newsletter email test', () => { .reply(200, '') }) after(() => { - scope.done() + nock.restore() }) it('gets the URL from the header', async () => { @@ -165,14 +163,12 @@ describe('Newsletter email test', () => { expect(url).to.startWith( 'https://newsletter.slowchinese.net/p/companies-that-eat-people-217' ) - }) + }).timeout(10000) }) context('when email is from beehiiv', () => { - let scope: nock.Scope - before(() => { - scope = nock('https://u23463625.ct.sendgrid.net') + nock('https://u23463625.ct.sendgrid.net') .head( '/ss/c/AX1lEgEQaxtvFxLaVo0GBo_geajNrlI1TGeIcmMViR3pL3fEDZnbbkoeKcaY62QZk0KPFudUiUXc_uMLerV4nA/3k5/3TFZmreTR0qKSCgowABnVg/h30/zzLik7UXd1H_n4oyd5W8Xu639AYQQB2UXz-CsssSnno' ) @@ -184,7 +180,7 @@ describe('Newsletter email test', () => { }) after(() => { - scope.done() + nock.restore() }) it('gets the URL from the header', async () => { @@ -193,7 +189,7 @@ describe('Newsletter email test', () => { expect(url).to.startWith( 'https://www.milkroad.com/p/talked-guy-spent-30m-beeple' ) - }) + }).timeout(10000) }) it('returns undefined if it is not a newsletter', async () => { From 0071d88443122fcaa485f2ac23df8e657bed52cc Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Wed, 5 Oct 2022 12:36:29 +0800 Subject: [PATCH 5/5] Stop deducting length of SSML starting tags in the offset --- packages/text-to-speech/src/textToSpeech.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/text-to-speech/src/textToSpeech.ts b/packages/text-to-speech/src/textToSpeech.ts index 7d9dc0daf..00cc06c58 100644 --- a/packages/text-to-speech/src/textToSpeech.ts +++ b/packages/text-to-speech/src/textToSpeech.ts @@ -59,7 +59,7 @@ export const synthesizeTextToSpeech = async ( const synthesizer = new SpeechSynthesizer(speechConfig) const speechMarks: SpeechMark[] = [] let timeOffset = 0 - let wordOffset = 0 + // let wordOffset = 0 synthesizer.synthesizing = function (s, e) { // convert arrayBuffer to stream and write to stream @@ -98,7 +98,7 @@ export const synthesizeTextToSpeech = async ( speechMarks.push({ word: e.text, time: (timeOffset + e.audioOffset) / 10000, - start: wordOffset + e.textOffset, + start: e.textOffset, length: e.text.length, type: 'sentence', }) @@ -151,7 +151,7 @@ export const synthesizeTextToSpeech = async ( const text = _.escape(input.text) const ssml = `${startSsmlTag}${text}${endSsml()}` // set the text offset to be the end of SSML start tag - wordOffset -= startSsmlTag.length + // wordOffset -= startSsmlTag.length const result = await speakSsmlAsyncPromise(ssml) if (result.reason === ResultReason.Canceled) { throw new Error(result.errorDetails)