Update functions.py
functions.py (+5 -3)
@@ -329,6 +329,8 @@ def chunk_and_preprocess_text(text, model_name='philschmid/flan-t5-base-samsum'
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     sentences = sent_tokenize(text)
+
+    print(f"sentences: {sentences}")
 
     # initialize
     length = 0
@@ -340,9 +342,9 @@ def chunk_and_preprocess_text(text, model_name='philschmid/flan-t5-base-samsum'
         count += 1
         combined_length = len(tokenizer.tokenize(sentence)) + length  # add the no. of sentence tokens to the length counter
 
-
-
-
+        if combined_length <= tokenizer.max_len_single_sentence:  # if it doesn't exceed
+            chunk += sentence + " "  # add the sentence to the chunk
+            length = combined_length  # update the length counter
 
         # if it is the last sentence
         if count == len(sentences) - 1:
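For context, the chunking loop these hunks touch can be reconstructed as a runnable sketch. Only the lines visible in the diff are confirmed; the chunk/chunks bookkeeping, the count initialization, the else branch that flushes a full chunk, and the return value are assumptions added for illustration.

# A minimal runnable sketch, assuming the usual shape of this kind of chunker.
# Confirmed by the diff: the tokenizer/sentence setup, the print, the
# combined_length bookkeeping, and the if-block this commit adds.
# Assumed: chunk/chunks/count initialization, the else branch, the return.
from nltk.tokenize import sent_tokenize  # requires nltk.download("punkt")
from transformers import AutoTokenizer


def chunk_and_preprocess_text(text, model_name='philschmid/flan-t5-base-samsum'):

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    sentences = sent_tokenize(text)

    print(f"sentences: {sentences}")

    # initialize
    length = 0
    chunk = ""    # assumed: accumulator for the current chunk
    chunks = []   # assumed: list of finished chunks
    count = -1    # assumed: so the first increment yields index 0

    for sentence in sentences:
        count += 1
        combined_length = len(tokenizer.tokenize(sentence)) + length  # add the no. of sentence tokens to the length counter

        if combined_length <= tokenizer.max_len_single_sentence:  # if it doesn't exceed
            chunk += sentence + " "  # add the sentence to the chunk
            length = combined_length  # update the length counter
        else:
            # assumed: the chunk is full, so store it and start a new one
            chunks.append(chunk.strip())
            length = len(tokenizer.tokenize(sentence))
            chunk = sentence + " "

        # if it is the last sentence
        if count == len(sentences) - 1:
            chunks.append(chunk.strip())  # assumed: flush the final chunk

    return chunks

The comparison against tokenizer.max_len_single_sentence is the reason this works: that property is the model's maximum input length minus the special tokens the tokenizer adds, so keeping the running token count under it ensures each chunk remains encodable in a single forward pass.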