Update functions.py
functions.py  CHANGED  +28 -16
@@ -323,29 +323,41 @@ def sentiment_pipe(earnings_text):
     return earnings_sentiment, earnings_sentences

 @st.cache_data
-def chunk_and_preprocess_text(text,
+def chunk_and_preprocess_text(text, model_name):
+
+    '''Chunk and preprocess text for summarization'''

+    tokenizer = AutoTokenizer.from_pretrained(model_name)
     sentences = sent_tokenize(clean_text(text))
-    #sentences = [i.text for i in list(article.sents)]

+    # initialize
+    length = 0
+    chunk = ""
     chunks = []
+    count = -1

     for sentence in sentences:
-                chunks[current_chunk].extend(sentence.split(" "))
-            else:
-                current_chunk += 1
-                chunks.append(sentence.split(" "))
-        else:
-            chunks.append(sentence.split(" "))
-
-    for chunk_id in range(len(chunks)):
-        chunks[chunk_id] = " ".join(chunks[chunk_id])
+        count += 1
+        combined_length = len(tokenizer.tokenize(sentence)) + length # add the no. of sentence tokens to the length counter
+
+        if combined_length <= tokenizer.max_len_single_sentence: # if it doesn't exceed
+            chunk += sentence + " " # add the sentence to the chunk
+            length = combined_length # update the length counter
+
+            # if it is the last sentence
+            if count == len(sentences) - 1:
+                chunks.append(chunk) # save the chunk
+
+        else:
+            chunks.append(chunk) # save the chunk
+            # reset
+            length = 0
+            chunk = ""
+
+            # take care of the overflow sentence
+            chunk += sentence + " "
+            length = len(tokenizer.tokenize(sentence))
+
     return chunks

 @st.cache_data
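Usage note (illustrative, not part of the commit): the sketch below shows how the updated helper might be called. Only chunk_and_preprocess_text(text, model_name) comes from the diff above; the checkpoint name "facebook/bart-large-cnn", the input file "earnings_call.txt", and the pipeline-based summarization loop are assumptions made for the example.

# Hypothetical usage of the updated helper (nothing here is part of this commit)
from transformers import pipeline
from functions import chunk_and_preprocess_text  # helper changed in this commit

model_name = "facebook/bart-large-cnn"        # assumed summarization checkpoint
raw_text = open("earnings_call.txt").read()   # hypothetical transcript file

# Split the transcript into chunks that fit the model's single-sequence limit
chunks = chunk_and_preprocess_text(raw_text, model_name)

# Summarize each chunk independently and join the partial summaries
summarizer = pipeline("summarization", model=model_name)
summaries = [summarizer(chunk, truncation=True)[0]["summary_text"] for chunk in chunks]
print(" ".join(summaries))

One behavioral detail of the new logic as shown: if the final sentence overflows tokenizer.max_len_single_sentence, it starts a fresh chunk in the else branch but the loop ends before that chunk is appended, so a caller that cares about the tail may want to append any leftover non-empty chunk itself.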