Spaces:
Running
Running
Joshua Lochner
committed on
Commit
·
62ea1e5
1
Parent(s):
4d4de75
Add language preference list
Browse files - src/preprocess.py +9 -2
src/preprocess.py
CHANGED
|
@@ -30,6 +30,12 @@ PROFANITY_CONVERTED = '*****' # Safer version for tokenizing
|
|
| 30 |
|
| 31 |
NUM_DECIMALS = 3
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
def parse_transcript_json(json_data, granularity):
|
| 35 |
assert json_data['wireMagic'] == 'pb3'
|
|
@@ -203,9 +209,10 @@ def get_words(video_id, process=True, transcript_type='auto', fallback='manual',
|
|
| 203 |
if transcript_list is not None:
|
| 204 |
if transcript_type == 'manual':
|
| 205 |
ts = transcript_list.find_manually_created_transcript(
|
| 206 |
-
|
| 207 |
else:
|
| 208 |
-
ts = transcript_list.find_generated_transcript(
|
|
|
|
| 209 |
|
| 210 |
raw_transcript_json = ts._http_client.get(
|
| 211 |
f'{ts._url}&fmt=json3').json()
|
|
|
|
| 30 |
|
| 31 |
NUM_DECIMALS = 3
|
| 32 |
|
| 33 |
+
# https://www.fincher.org/Utilities/CountryLanguageList.shtml
|
| 34 |
+
# https://lingohub.com/developers/supported-locales/language-designators-with-regions
|
| 35 |
+
LANGUAGE_PREFERENCE_LIST = ['en-GB', 'en-US', 'en-CA', 'en-AU', 'en-NZ', 'en-ZA',
|
| 36 |
+
'en-IE', 'en-IN', 'en-JM', 'en-BZ', 'en-TT', 'en-PH', 'en-ZW',
|
| 37 |
+
'en']
|
| 38 |
+
|
| 39 |
|
| 40 |
def parse_transcript_json(json_data, granularity):
|
| 41 |
assert json_data['wireMagic'] == 'pb3'
|
|
|
|
| 209 |
if transcript_list is not None:
|
| 210 |
if transcript_type == 'manual':
|
| 211 |
ts = transcript_list.find_manually_created_transcript(
|
| 212 |
+
LANGUAGE_PREFERENCE_LIST)
|
| 213 |
else:
|
| 214 |
+
ts = transcript_list.find_generated_transcript(
|
| 215 |
+
LANGUAGE_PREFERENCE_LIST)
|
| 216 |
|
| 217 |
raw_transcript_json = ts._http_client.get(
|
| 218 |
f'{ts._url}&fmt=json3').json()
|