Spaces:
Running
Running
Upload files
Browse files- Dockerfile +9 -0
- README.md +3 -3
- app.py +242 -0
- config.py +1 -0
- db_operations/__init__.py +1 -0
- db_operations/db_operations.py +59 -0
- requirements.txt +12 -0
- start.sh +3 -0
- static/favicon_new.png +0 -0
- static/loader.gif +0 -0
- static/refresh_reload_icon.png +0 -0
- static/styles.css +510 -0
- static/top-icon.png +0 -0
- templates/index.html +200 -0
- word_cloud.py +653 -0
Dockerfile
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.7-slim
WORKDIR /webapp

# System packages first: this layer only changes when the package list does.
# apt-get (not apt) is the stable scripting interface; drop the package index
# afterwards so it does not bloat the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends redis-server \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the sources so that editing the
# application code does not invalidate the (slow) pip layer.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

COPY . .
RUN chmod +x /webapp/start.sh

# 7860: web app served by gunicorn, 6379: redis started by start.sh
EXPOSE 7860 6379
CMD ["/webapp/start.sh"]
|
README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
---
|
| 2 |
title: News Aggregator
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
|
|
|
| 1 |
---
|
| 2 |
title: News Aggregator
|
| 3 |
+
emoji: ⚡
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
app.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from dateutil import parser
|
| 4 |
+
from flask import Flask, render_template
|
| 5 |
+
from flask_cors import cross_origin, CORS
|
| 6 |
+
from db_operations.db_operations import DBOperations
|
| 7 |
+
import logging
|
| 8 |
+
import traceback
|
| 9 |
+
import redis
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from functools import lru_cache
|
| 12 |
+
from word_cloud import get_frequent_words_html
|
| 13 |
+
from config import NEWS_RETENTION_SECONDS
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
app = Flask(__name__)
|
| 17 |
+
CORS(app)
|
| 18 |
+
redis_client = redis.Redis(host='localhost', port=6379, decode_responses=True)
|
| 19 |
+
db = DBOperations()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
REFRESH_FREQ = 300 # 300 secs = 5 mins
|
| 23 |
+
|
| 24 |
+
def is_db_fetch_reqd():
    """
    Decide whether the news must be re-read from the database.

    The time of the last fetch is stored in Redis under ``NEWSFETCHTIME``.
    A fetch is required when that key is missing, cannot be parsed, or is
    older than ``REFRESH_FREQ`` seconds; in those cases the key is reset to
    the current time. Any unexpected error also results in a fetch.

    :return: int, 1 if the database must be queried, 0 if the cache is fresh.
    """
    time_format = '%Y-%m-%d %H:%M:%S.%f'
    try:
        env_news_time = redis_client.get('NEWSFETCHTIME')
        logging.warning(f'fetch_time_env_var: {env_news_time}')
        now = datetime.now()

        if env_news_time is not None:
            try:
                last_fetch = datetime.strptime(env_news_time, time_format)
            except ValueError:
                # An unreadable timestamp is treated as stale so that it gets
                # overwritten below instead of failing on every request.
                logging.warning(f'is_db_fetch_reqd() error: cannot parse {env_news_time}')
            else:
                # total_seconds() and not .seconds: the latter ignores whole
                # days, so a 24h+ old cache would look only seconds old.
                if (now - last_fetch).total_seconds() <= REFRESH_FREQ:
                    return 0

        # strftime always writes the microseconds; str(datetime) drops them
        # when they are zero, which would break the strptime() calls that
        # read this value back.
        redis_client.set("NEWSFETCHTIME", now.strftime(time_format))
        return 1
    except Exception:
        logging.exception('is_db_fetch_reqd() error')
        return 1
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def correct_date(x):
    """
    Return ``x`` if it looks like a date-time string, else a fixed fallback.

    A value is accepted when it is a string containing a ":" (i.e. it has a
    time component). Anything else is logged and replaced by a constant
    timestamp.
    :param x: value read from the ``parsed_date`` column.
    :return: str, ``x`` itself or the fallback timestamp.
    """
    has_time_part = isinstance(x, str) and ":" in x
    if has_time_part:
        return x
    logging.warning(f'correct_date() error: {x} is not the right date format')
    return "2020-11-07 00:36:44+05:30"
|
| 51 |
+
|
| 52 |
+
def date_time_parser(dt):
    """
    Computes the minutes elapsed since published time.

    Works for both ``datetime.datetime`` and ``pandas.Timestamp`` values,
    timezone-aware or naive: "now" is taken in the same timezone as ``dt``.
    :param dt: date
    :return: int, minutes elapsed; 100000 if ``dt`` is not a usable date.
    """
    try:
        # .tzinfo exists on datetime and Timestamp alike, whereas .tz is
        # pandas-only and made every plain datetime fall into the except.
        elapsed = dt.now(dt.tzinfo) - dt
        return int(np.round(elapsed.total_seconds() / 60, 0))
    except Exception:
        logging.warning(f'date_time_parser() error: {dt} is not the right date format')
        return 100000
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def elapsed_time_str(mins):
    """
    Return the time elapsed string from minutes passed as an argument.

    Negative values (items dated in the future, e.g. because of clock skew)
    are reported as 'Just in'.
    :param mins: int, minutes elapsed.
    :return: str, time elapsed string; "-" if ``mins`` cannot be interpreted.
    """
    try:
        if mins < 0:
            return 'Just in'

        hours = int(mins / 60)
        days = np.round(mins / (60 * 24), 1)
        remaining_mins = int(mins - (hours * 60))

        if days >= 1:
            return 'a day ago' if days == 1 else f'{str(days)} days ago'

        if mins >= 60:
            hour_part = 'an hour' if hours == 1 else f'{str(hours)} hours'
            if remaining_mins == 0:
                return f'{hour_part} ago'
            if remaining_mins == 1:
                return f'{hour_part} and a min ago'
            return f'{hour_part} and {str(remaining_mins)} mins ago'

        if mins == 0:
            return 'Just in'
        if mins == 1:
            return 'a minute ago'
        return f'{str(mins)} minutes ago'
    except Exception:
        # Non-numeric / NaN input: show a neutral placeholder.
        return "-"
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def fetch_from_db(fetch_flag):
    """
    Return the news frame and the word-cloud HTML, from cache or database.

    With ``fetch_flag == 1`` the news are read from the database, the word
    cloud is rebuilt and both are stored in Redis (``NEWSDF`` and
    ``NEWSWORDCLOUD``). Otherwise both are read from Redis; if either key is
    missing there, the database path is used instead.

    :param fetch_flag: int, 1 to force a database read.
    :return: tuple (DataFrame, str) - the news and the word-cloud HTML.
    :raises Exception: any error is logged and re-raised to the caller.
    """
    logging.warning(f'fetch_flag: {fetch_flag}')
    try:
        if fetch_flag != 1:
            cached_df = redis_client.get("NEWSDF")
            cached_tokens = redis_client.get("NEWSWORDCLOUD")
            if cached_df is not None and cached_tokens is not None:
                final_df = pd.read_json(cached_df)
                logging.warning('Fetched From Cache\n\n')
                return final_df, cached_tokens
            # The fetch timestamp can be fresh while the data keys are still
            # absent (e.g. another worker is mid-refresh); read the DB rather
            # than failing on an empty cache.
            logging.warning('Cache is empty, falling back to DB')

        final_df = db.read_news_from_db()
        freq_tokens = get_frequent_words_html(final_df)
        logging.warning('Fetched From DB\n\n')

        # '_id' is cast to str before the frame is serialised to JSON.
        final_df['_id'] = final_df['_id'].astype('str')

        redis_client.set("NEWSDF", final_df.to_json())
        redis_client.set("NEWSWORDCLOUD", freq_tokens)
        return final_df, freq_tokens
    except Exception:
        logging.exception('fetch_from_db() error')
        raise
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _placeholder_news_df():
    """Return the one-row empty frame that index() renders as 'unavailable'."""
    return pd.DataFrame({'title': '', 'url': '',
                         'description': '', 'src_time': ''}, index=[0])


def _last_update_label():
    """Return the last fetch time stored in Redis, or 'unknown' if unreadable."""
    try:
        stamp = datetime.strptime(redis_client.get('NEWSFETCHTIME'), '%Y-%m-%d %H:%M:%S.%f')
    except (TypeError, ValueError, redis.RedisError):
        # Missing key, unexpected format or Redis unreachable: the page should
        # still render.
        logging.exception('index() could not read NEWSFETCHTIME')
        return 'unknown'
    return f'{stamp.strftime("%Y-%m-%d %H:%M:%S")} UTC'


def _render_news_item(row):
    """Return the HTML card for one news row (a Series from the news frame)."""
    from html import escape

    # These values originate from the database and are placed in attribute /
    # text context, so they are escaped.
    # NOTE(review): assumes the stored text is plain, not already
    # entity-encoded - confirm against the ingestion job.
    href = escape(str(row["url"]), quote=True)
    category = escape(str(row["category"]))
    description = escape(str(row["description"]))
    url_txt = escape(str(row["title"]))
    src_time = escape(str(row["src_time"]))
    # NOTE(review): 'similar_news' is inserted as-is; presumably it is
    # pre-rendered HTML - verify that whatever produces it escapes its content.
    sim_news = row['similar_news']
    return f'''<div class="news-item"><div style="padding-top: 7px;">
            <a href="{href}" target="_blank" class="article-category">{category}
            </a>
        </div>
        <div>
            <a href="{href}" target="_blank" class="headline">{url_txt}
            </a>
        </div>
        <div>
            <a href="{href}" target="_blank" class="description">
                {description}
            </a>
        </div>
        <div>
            <a href="{href}" target="_blank" class="time">
                {src_time}
            </a>
        </div>

        <div class="container">
            <div class="content" style="display: none;">
                {sim_news}
            </div>
            <div class="show-similar-button-container">
                <button type="button" class="show-more">Show similar news</button>
                <button type="button" class="show-less">Hide similar news</button>
            </div>
        </div>

        <div>
            <p></p>
        </div></div>
        '''


@app.route("/")
@cross_origin()
def index():
    """
    Entry point

    Loads the news (from cache or database), keeps the recent ones, and
    renders them into ``index.html``. If loading or processing fails, or one
    row or fewer remains, an "unavailable" message is rendered instead.
    :return: the rendered page.
    """
    from html import escape

    src_str = ''
    freq_tokens = ''
    try:
        final_df, freq_tokens = fetch_from_db(is_db_fetch_reqd())
        if len(final_df) > 1:
            final_df["parsed_date"] = [parser.parse(correct_date(date_)) for date_ in final_df['parsed_date']]
            final_df["elapsed_time"] = [date_time_parser(date_) for date_ in final_df['parsed_date']]
            # NOTE(review): elapsed_time is in minutes while the constant is
            # named *_SECONDS - left unchanged, confirm the intended unit.
            final_df = final_df.loc[final_df["elapsed_time"] <= NEWS_RETENTION_SECONDS, :].copy()
            final_df["elapsed_time_str"] = final_df["elapsed_time"].apply(elapsed_time_str)
            final_df.sort_values(by="elapsed_time", inplace=True)
            src_str = ", ".join(sorted([*final_df['src'].unique()]))
            final_df['src_time'] = final_df['src'] + ("&nbsp;" * 5) + final_df["elapsed_time_str"]
            final_df.drop(columns=['_id', 'parsed_date', 'src', 'elapsed_time', 'elapsed_time_str'], inplace=True)
            final_df.drop_duplicates(subset='description', inplace=True)
            final_df = final_df.loc[(final_df["title"] != ""), :].copy()
        else:
            final_df = _placeholder_news_df()
    except Exception:
        final_df = _placeholder_news_df()
        # logging.exception records the traceback; traceback.print_exc()
        # returns None, so the previous call logged the string "None".
        logging.exception('index() failed to load the news')

    # Page fragments are collected and joined once at the end.
    parts = ['''
    <div class="box" id="main">
    <form>

    <div class="banner">
        <img src="../static/favicon_new.png" class="logo-img" alt="KSV Muralidhar" />
        <h1 style="display:inline-block; vertical-align: middle;">Latest News</h1>
    </div>
    ''']

    if len(final_df) <= 1:
        parts.append('''<div><p class="unavailable">This app is temporarily unavailable</p></div>''')
    else:
        parts.append(f'<p class="srctxt">News aggregated from <b>{escape(src_str)}</b>.<br><br>'
                     f'Last updated: {_last_update_label()}</p>')

        parts.append('''
        <div class="input-container">
            <input type="text" class="keyword-input" id="keywordInput" placeholder="Search" oninput="filterContent(true)">
            <div class="clear-btn" id="clearBtn" onclick="clearFilter()">&times;</div>
        </div>
        ''')

        # freq_tokens is the word-cloud markup built by get_frequent_words_html.
        parts.append(f"{freq_tokens} ")
        parts.append('<div class="show-more-word-cloud" onclick=word_cloud_display()><p class="three-dots">...</p></div>')

        parts.append('''<div style="padding-bottom: 10px; font-size: 12px; font-family: Arial, Helvetica, sans-serif;">
        News categories and similar news are AI-generated</div>''')

        # One card per remaining news row.
        parts.extend(_render_news_item(row) for _, row in final_df.iterrows())

    parts.append('</form></div>')
    return render_template("index.html", body="".join(parts))
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
if __name__ == "__main__":
    # Development entry point only. Flask's built-in server has no notion of
    # worker processes: extra keyword arguments are forwarded to Werkzeug's
    # run_simple(), which rejects `workers=` / `threads=` with a TypeError.
    # Production concurrency is configured on the gunicorn command line
    # (see start.sh); here requests are simply handled in threads.
    app.run(host="0.0.0.0", port=7860, threaded=True)
|
| 242 |
+
|
config.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Maximum age of a news item that is still displayed.
# NOTE(review): despite the name, app.py compares this value against an
# elapsed time computed in minutes (total_seconds() / 60), so 300 currently
# behaves as 300 minutes - confirm the intended unit.
NEWS_RETENTION_SECONDS = 300
|
db_operations/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from db_operations.db_operations import *
|
db_operations/db_operations.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pymongo
|
| 2 |
+
import os
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class DBOperations:
    """
    Reads news from MongoDB

    The connection URL is taken from the ``DB_URL`` environment variable.
    ``read_news_from_db()`` is the only public operation; it opens a
    connection, reads the whole news collection into a DataFrame and closes
    the connection again.
    """
    # Columns of the one-row placeholder frame returned when nothing is read.
    _PLACEHOLDER_COLUMNS = ('_id', 'title', 'url', 'description', 'parsed_date', 'src')

    def __init__(self):
        self.url = os.getenv('DB_URL')
        self.database = "rss_news_db_cat_pred_sim_news"
        self.collection = "rss_news_cat_pred_sim_news"
        self.__client = None
        # 0 = last operation succeeded, 1 = it failed.
        self.__error = 0

    @classmethod
    def _placeholder_df(cls):
        """Return a one-row frame of empty strings with the expected columns."""
        return pd.DataFrame({col: '' for col in cls._PLACEHOLDER_COLUMNS}, index=[0])

    def __connect(self):
        """Open the MongoDB client; flag the error and re-raise on failure."""
        try:
            self.__client = pymongo.MongoClient(self.url)
            # Result is discarded - presumably issued only to verify that the
            # server is reachable before any read is attempted.
            _ = self.__client.list_database_names()
        except Exception:
            self.__error = 1
            self.__client = None
            raise

    def __read(self):
        """
        Read every document of the collection into a DataFrame.

        :return: DataFrame of the documents, or the placeholder frame when
                 the read fails or the collection is empty.
        """
        try:
            coll = self.__client[self.database][self.collection]
            docs = list(coll.find())
            if not docs:
                # An empty collection would give a frame without columns,
                # which callers indexing '_id' cannot handle.
                return self._placeholder_df()
            return pd.DataFrame(docs)
        except Exception:
            import logging
            logging.exception('DBOperations: reading news failed')
            self.__error = 1
            return self._placeholder_df()

    def __close_connection(self):
        """Close the client if one is open."""
        if self.__client is not None:
            self.__client.close()
            self.__client = None

    def read_news_from_db(self):
        """
        Read the news from MongoDB.

        :return: DataFrame with the news, or a one-row placeholder frame when
                 ``DB_URL`` is not set or the read fails.
        :raises Exception: whatever the connection attempt raises.
        """
        if self.url is None:
            return self._placeholder_df()

        # Start every call with a clean state: otherwise a single failed
        # attempt would disable all later reads on this (shared) instance.
        self.__error = 0
        try:
            self.__connect()
            rss_df = self.__read()
            if self.__error == 0:
                print("Read Successful")
            return rss_df
        finally:
            self.__close_connection()
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
regex==2021.8.3
|
| 2 |
+
lxml==4.6.3
|
| 3 |
+
numpy==1.21.1
|
| 4 |
+
python-dateutil==2.8.2
|
| 5 |
+
pandas==1.3.1
|
| 6 |
+
requests==2.26.0
|
| 7 |
+
bs4==0.0.1
|
| 8 |
+
flask==2.2.2
|
| 9 |
+
flask_cors==3.0.10
|
| 10 |
+
gunicorn==20.1.0
|
| 11 |
+
pymongo==4.3.3
|
| 12 |
+
redis
|
start.sh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Container entry point: start the local redis cache, then the web app.
# Abort if redis cannot be started - the app relies on it for caching.
set -e
redis-server --daemonize yes
# exec replaces the shell so gunicorn receives the container's stop signals
# directly and can shut its workers down cleanly.
exec gunicorn -b 0.0.0.0:7860 --timeout 120 --workers 5 --threads 5 app:app
|
static/favicon_new.png
ADDED
|
|
static/loader.gif
ADDED
|
static/refresh_reload_icon.png
ADDED
|
|
static/styles.css
ADDED
|
@@ -0,0 +1,510 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
html {
|
| 2 |
+
scroll-behavior: smooth;
|
| 3 |
+
}
|
| 4 |
+
|
| 5 |
+
@media screen and (min-width: 800px) {
|
| 6 |
+
a.headline {
|
| 7 |
+
background-color: #E5E4E2;
|
| 8 |
+
display: block;
|
| 9 |
+
width: relative;
|
| 10 |
+
text-decoration: none;
|
| 11 |
+
color: black;
|
| 12 |
+
line-height: 1.2;
|
| 13 |
+
align: justify;
|
| 14 |
+
border-left: 5px solid transparent;
|
| 15 |
+
border-top: 5px solid transparent;
|
| 16 |
+
border-bottom: 5px solid transparent;
|
| 17 |
+
border-right: 0px;
|
| 18 |
+
font-weight: bold;
|
| 19 |
+
font-size: 18px;
|
| 20 |
+
padding-right: 5px;
|
| 21 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
@media screen and (max-width: 800px) {
|
| 26 |
+
a.headline {
|
| 27 |
+
background-color: #E5E4E2;
|
| 28 |
+
display: block;
|
| 29 |
+
width: relative;
|
| 30 |
+
text-decoration: none;
|
| 31 |
+
color: black;
|
| 32 |
+
line-height: 1.2;
|
| 33 |
+
align: justify;
|
| 34 |
+
border-left: 5px solid transparent;
|
| 35 |
+
border-top: 5px solid transparent;
|
| 36 |
+
border-bottom: 5px solid transparent;
|
| 37 |
+
border-right: 0px;
|
| 38 |
+
font-weight: bold;
|
| 39 |
+
font-size: 16.5px;
|
| 40 |
+
padding-right: 5px;
|
| 41 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 42 |
+
}
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
@media screen and (min-width: 800px) {
|
| 46 |
+
a.description {
|
| 47 |
+
background-color: #E5E4E2;
|
| 48 |
+
align:justify;
|
| 49 |
+
text-align: justify;
|
| 50 |
+
display: block;
|
| 51 |
+
height:100%;
|
| 52 |
+
width: relative;
|
| 53 |
+
text-decoration: none;
|
| 54 |
+
border-left: 5px solid transparent;
|
| 55 |
+
border-top: 0px;
|
| 56 |
+
border-bottom: 7px solid transparent;
|
| 57 |
+
border-right: 0px;
|
| 58 |
+
font-size: 14px;
|
| 59 |
+
padding-right: 5px;
|
| 60 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 61 |
+
color: dimgrey;
|
| 62 |
+
}
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
@media screen and (max-width: 800px) {
|
| 66 |
+
a.description {
|
| 67 |
+
background-color: #E5E4E2;
|
| 68 |
+
align:justify;
|
| 69 |
+
text-align: justify;
|
| 70 |
+
display: block;
|
| 71 |
+
height:100%;
|
| 72 |
+
width: relative;
|
| 73 |
+
text-decoration: none;
|
| 74 |
+
border-left: 5px solid transparent;
|
| 75 |
+
border-top: 0px;
|
| 76 |
+
border-bottom: 7px solid transparent;
|
| 77 |
+
border-right: 0px;
|
| 78 |
+
font-size: 12.5px;
|
| 79 |
+
padding-right: 5px;
|
| 80 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 81 |
+
color: dimgrey;
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
@media screen and (min-width: 800px) {
|
| 86 |
+
a.time {
|
| 87 |
+
background-color: #E5E4E2;
|
| 88 |
+
align:justify;
|
| 89 |
+
display: block;
|
| 90 |
+
height:100%;
|
| 91 |
+
width: relative;
|
| 92 |
+
text-decoration: none;
|
| 93 |
+
border-left: 5px solid transparent;
|
| 94 |
+
border-top: 0px;
|
| 95 |
+
border-bottom: 1px solid transparent;
|
| 96 |
+
border-right: 0px;
|
| 97 |
+
padding-right: 5px;
|
| 98 |
+
font-size: 11px;
|
| 99 |
+
padding-bottom: 5px;
|
| 100 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 101 |
+
color: green;
|
| 102 |
+
}
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
@media screen and (max-width: 800px) {
|
| 106 |
+
a.time {
|
| 107 |
+
background-color: #E5E4E2;
|
| 108 |
+
align:justify;
|
| 109 |
+
display: block;
|
| 110 |
+
height:100%;
|
| 111 |
+
width: relative;
|
| 112 |
+
text-decoration: none;
|
| 113 |
+
border-left: 5px solid transparent;
|
| 114 |
+
border-top: 0px;
|
| 115 |
+
border-bottom: 1px solid transparent;
|
| 116 |
+
border-right: 0px;
|
| 117 |
+
padding-right: 5px;
|
| 118 |
+
font-size: 10px;
|
| 119 |
+
padding-bottom: 5px;
|
| 120 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 121 |
+
color: green;
|
| 122 |
+
}
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
.box {
|
| 126 |
+
display: flex;
|
| 127 |
+
justify-content: center;
|
| 128 |
+
align-items: center;
|
| 129 |
+
height: inherit;
|
| 130 |
+
padding: 20px;
|
| 131 |
+
}
|
| 132 |
+
@media screen and (min-width: 800px) {
|
| 133 |
+
form {
|
| 134 |
+
width: 50%;
|
| 135 |
+
overflow-x: hidden;
|
| 136 |
+
padding: 20px;
|
| 137 |
+
border-radius: 10px;
|
| 138 |
+
background: #fff;
|
| 139 |
+
box-shadow: 0 0 20px 0 #095484;
|
| 140 |
+
}}
|
| 141 |
+
|
| 142 |
+
@media screen and (max-width: 800px) {
|
| 143 |
+
form {
|
| 144 |
+
width: 100%;
|
| 145 |
+
overflow-x: hidden;
|
| 146 |
+
padding: 20px;
|
| 147 |
+
border-radius: 10px;
|
| 148 |
+
background: #fff;
|
| 149 |
+
box-shadow: 0 0 15px 0 #095484;
|
| 150 |
+
}}
|
| 151 |
+
.banner {
|
| 152 |
+
position: relative;
|
| 153 |
+
height: 30px;
|
| 154 |
+
/* background-size: cover; */
|
| 155 |
+
display: flex;
|
| 156 |
+
/* justify-content: center; */
|
| 157 |
+
/* align-items: center; */
|
| 158 |
+
/* text-align: center; */
|
| 159 |
+
}
|
| 160 |
+
@media screen and (min-width: 800px) {
|
| 161 |
+
h1 {
|
| 162 |
+
position: absolute;
|
| 163 |
+
margin: 0;
|
| 164 |
+
padding-left: 50px;
|
| 165 |
+
font-size: 25px;
|
| 166 |
+
color: black;
|
| 167 |
+
z-index: 2;
|
| 168 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 169 |
+
}
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
@media screen and (max-width: 800px) {
|
| 173 |
+
h1 {
|
| 174 |
+
position: absolute;
|
| 175 |
+
margin: 0;
|
| 176 |
+
padding-left: 40px;
|
| 177 |
+
font-size: 24px;
|
| 178 |
+
color: black;
|
| 179 |
+
z-index: 2;
|
| 180 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 181 |
+
}
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
p.unavailable {
|
| 185 |
+
background-color: #E5E4E2;
|
| 186 |
+
display: block;
|
| 187 |
+
width: 100%;
|
| 188 |
+
text-decoration: none;
|
| 189 |
+
color: black;
|
| 190 |
+
line-height: 1.2;
|
| 191 |
+
align: justify;
|
| 192 |
+
border-left: 5px solid transparent;
|
| 193 |
+
border-top: 5px solid transparent;
|
| 194 |
+
border-bottom: 5px solid transparent;
|
| 195 |
+
border-right: 0px;
|
| 196 |
+
font-weight: bold;
|
| 197 |
+
font-size: 18px;
|
| 198 |
+
padding-right: 5px;
|
| 199 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 200 |
+
}
|
| 201 |
+
div.news-item{
|
| 202 |
+
background-color: #E5E4E2;
|
| 203 |
+
/*box-shadow: rgba(0, 0, 0, 0.4) -1px 0px 5px, rgba(0, 0, 0, 0.5) 0px 4px 5px -3px, rgba(0, 0, 0, 0.2) 0px -3px 0px inset;*/
|
| 204 |
+
box-shadow: rgba(0, 0, 0, 0.25) 0px 0px 5px 1px, rgba(0, 0, 0, 0.1) 0px 4px 5px -3px, rgba(0, 0, 0, 0.2) 0px -1px 0px inset;
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
}
|
| 208 |
+
div.news-item:hover{
|
| 209 |
+
box-shadow: none;
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
@media screen and (min-width: 800px) {
|
| 213 |
+
p.srctxt {
|
| 214 |
+
align:justify;
|
| 215 |
+
text-align: justify;
|
| 216 |
+
word-break: break-all;
|
| 217 |
+
font-size: 11px;
|
| 218 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 219 |
+
}
|
| 220 |
+
.logo-img{
|
| 221 |
+
margin-right: 10px;
|
| 222 |
+
vertical-align: center;
|
| 223 |
+
/* position: relative; */
|
| 224 |
+
width: 34px;
|
| 225 |
+
height: 34px;
|
| 226 |
+
|
| 227 |
+
}
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
@media screen and (max-width: 800px) {
|
| 231 |
+
p.srctxt {
|
| 232 |
+
align:justify;
|
| 233 |
+
text-align: justify;
|
| 234 |
+
word-break: break-all;
|
| 235 |
+
font-size: 9px;
|
| 236 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 237 |
+
}
|
| 238 |
+
.logo-img{
|
| 239 |
+
margin-right: 10px;
|
| 240 |
+
vertical-align: top;
|
| 241 |
+
/* position: absolute; */
|
| 242 |
+
width: 30px;
|
| 243 |
+
height: 30px;
|
| 244 |
+
}
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
.float{
|
| 248 |
+
position:fixed;
|
| 249 |
+
width:25px;
|
| 250 |
+
height:25px;
|
| 251 |
+
bottom:15px;
|
| 252 |
+
right:12px;
|
| 253 |
+
background-color: white;
|
| 254 |
+
border-radius:50%;
|
| 255 |
+
text-align:center;
|
| 256 |
+
vertical-align:center;
|
| 257 |
+
z-index: 99999998;
|
| 258 |
+
font-size:0;
|
| 259 |
+
cursor:pointer;
|
| 260 |
+
animation: beatan 0.8s infinite alternate;
|
| 261 |
+
|
| 262 |
+
}
|
| 263 |
+
.top-float{
|
| 264 |
+
position:fixed;
|
| 265 |
+
width:25px;
|
| 266 |
+
height:25px;
|
| 267 |
+
bottom:52px;
|
| 268 |
+
right:12px;
|
| 269 |
+
background-color: white;
|
| 270 |
+
border-radius:50%;
|
| 271 |
+
text-align:center;
|
| 272 |
+
vertical-align:center;
|
| 273 |
+
z-index: 99999998;
|
| 274 |
+
font-size:0;
|
| 275 |
+
cursor:pointer;
|
| 276 |
+
animation: beatan 0.8s infinite alternate;
|
| 277 |
+
|
| 278 |
+
}
|
| 279 |
+
.my-float{
|
| 280 |
+
margin-top:22px;
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
@keyframes beatan{
|
| 284 |
+
to { transform: scale(1.1); }
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
.loader {
|
| 288 |
+
position: fixed;
|
| 289 |
+
left: 0px;
|
| 290 |
+
top: 0px;
|
| 291 |
+
width: 100%;
|
| 292 |
+
height: 100%;
|
| 293 |
+
z-index: 99999999999;
|
| 294 |
+
background: url('../static/loader.gif') 50% 50% no-repeat rgb(255,255,255);
|
| 295 |
+
}
|
| 296 |
+
|
| 297 |
+
.highlight {
|
| 298 |
+
background-color: yellow;
|
| 299 |
+
font-weight: bold;
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
.input-container {
|
| 303 |
+
position: relative;
|
| 304 |
+
padding-bottom: 10px;
|
| 305 |
+
}
|
| 306 |
+
|
| 307 |
+
.keyword-input {
|
| 308 |
+
|
| 309 |
+
border-radius: 5px;
|
| 310 |
+
transition: border-color 0.3s ease;
|
| 311 |
+
border: 1px solid silver;
|
| 312 |
+
width: 10em;
|
| 313 |
+
height: 1.5em;
|
| 314 |
+
padding-left: 0.5em;
|
| 315 |
+
outline: none;
|
| 316 |
+
overflow: hidden;
|
| 317 |
+
|
| 318 |
+
}
|
| 319 |
+
|
| 320 |
+
.clear-btn {
|
| 321 |
+
position: absolute;
|
| 322 |
+
font-size: 20px;
|
| 323 |
+
left: 129px;
|
| 324 |
+
transform: translateY(-105%);
|
| 325 |
+
cursor: pointer;
|
| 326 |
+
opacity: 0;
|
| 327 |
+
transition: opacity 0.3s ease;
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
.clear-btn.show {
|
| 331 |
+
opacity: 1;
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
@media screen and (min-width: 800px) {
|
| 335 |
+
a.article-category {
|
| 336 |
+
background-color: #E5E4E2;
|
| 337 |
+
align:justify;
|
| 338 |
+
display: block;
|
| 339 |
+
height:100%;
|
| 340 |
+
width: relative;
|
| 341 |
+
text-decoration: none;
|
| 342 |
+
border-left: 5px solid transparent;
|
| 343 |
+
border-top: 0px;
|
| 344 |
+
font-weight: bold;
|
| 345 |
+
border-bottom: 1px solid transparent;
|
| 346 |
+
border-right: 0px;
|
| 347 |
+
padding-right: 5px;
|
| 348 |
+
font-size: 11px;
|
| 349 |
+
padding-bottom: 0px;
|
| 350 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 351 |
+
color: green;
|
| 352 |
+
}
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
@media screen and (max-width: 800px) {
|
| 356 |
+
a.article-category {
|
| 357 |
+
background-color: #E5E4E2;
|
| 358 |
+
align:justify;
|
| 359 |
+
display: block;
|
| 360 |
+
height:100%;
|
| 361 |
+
font-weight: bold;
|
| 362 |
+
width: relative;
|
| 363 |
+
text-decoration: none;
|
| 364 |
+
border-left: 5px solid transparent;
|
| 365 |
+
border-top: 0px;
|
| 366 |
+
border-bottom: 1px solid transparent;
|
| 367 |
+
border-right: 0px;
|
| 368 |
+
padding-right: 5px;
|
| 369 |
+
font-size: 10px;
|
| 370 |
+
padding-bottom: 0px;
|
| 371 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 372 |
+
color: green;
|
| 373 |
+
}
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
.content {
|
| 377 |
+
display: none;
|
| 378 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 379 |
+
|
| 380 |
+
padding-right: 5px;
|
| 381 |
+
|
| 382 |
+
padding-top: 5px;
|
| 383 |
+
border-left: 5px solid transparent;
|
| 384 |
+
}
|
| 385 |
+
|
| 386 |
+
.container{
|
| 387 |
+
padding-bottom:10px;
|
| 388 |
+
}
|
| 389 |
+
|
| 390 |
+
.show-similar-button-container{
|
| 391 |
+
display: flex;
|
| 392 |
+
flex-direction: column;
|
| 393 |
+
align-items: center;
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
/* Similar-news links: underlined by default, underline removed on hover.
   The :hover selector is more specific than the plain class selectors
   below, so it wins even though it appears first. */
.similar-news-item:hover {
    text-decoration: none;
}

/* Wide screens (>= 800px): larger font and bottom padding. */
@media screen and (min-width: 800px) {
    .similar-news-item {
        text-align: justify;
        text-decoration: underline;
        font-size: 14px;
        font-family: Arial, Helvetica, sans-serif;
        color: black;
        display:inline-block;
        padding-bottom: 10px;
        width:100%;
        /*white-space: nowrap;
        overflow: hidden;
        text-overflow: ellipsis;*/

    }
}

/* Narrow screens (<= 800px): smaller font and bottom padding.
   NOTE: at exactly 800px both media queries match; this later rule
   takes precedence for the properties it repeats. */
@media screen and (max-width: 800px) {
    .similar-news-item {
        text-align: justify;
        text-decoration: underline;
        font-size: 12px;
        font-family: Arial, Helvetica, sans-serif;
        color: black;
        display:inline-block;
        padding-bottom: 8px;
        width:100%;
        /*white-space: nowrap;
        overflow: hidden;
        text-overflow: ellipsis;*/
    }
}
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
/* "Show more" button.
   NOTE: the previous version declared `display: box`, which is not a
   valid display value and was ignored by browsers; it is dropped here so
   the element keeps its default display. The four equal paddings are
   folded into the shorthand. */
.show-more {
    background-color: #E5E4E2;
    font-family: Arial, Helvetica, sans-serif;
    border-radius: 4px;
    padding: 3px;
    font-size: 12px;
    border: none;
}

/* Inverted colours while the pointer is over the button. */
.show-more:hover {
    background-color: black;
    color: white;
}
|
| 453 |
+
|
| 454 |
+
/* "Show less" button. Hidden until the "show more" click handler in
   templates/index.html sets its display to block. */
.show-less {
    display: none;
    border: none;
    border-radius: 4px;
    padding: 3px;
    background-color: #E5E4E2;
    font-family: Arial, Helvetica, sans-serif;
    font-size: 12px;
}

/* Inverted colours while the pointer is over the button. */
.show-less:hover {
    color: white;
    background-color: black;
}
|
| 471 |
+
|
| 472 |
+
/* Word-cloud wrapper: long tokens may break onto the next line. */
.word-cloud-container{
    word-wrap: break-word;
    padding-bottom: 10px;

}
|
| 477 |
+
|
| 478 |
+
/* One word-cloud token; the pointer cursor marks it as clickable. */
.wc-tokens{
    font-family: Arial, Helvetica, sans-serif;
    font-size: 13.2px;
    cursor: pointer;
}

/* Underline the token under the pointer. */
.wc-tokens:hover{
    text-decoration: underline;
}
|
| 487 |
+
|
| 488 |
+
/* Extra word-cloud section, hidden by default.
   Presumably the element toggled by word_cloud_display() in
   templates/index.html (which looks it up by id) - confirm in the markup. */
.word-cloud-section{
    padding-bottom: 10px;
    display: none;
    word-wrap: break-word;
}
|
| 493 |
+
|
| 494 |
+
/* Centred row with spacing below it. */
.show-more-word-cloud{
    padding-bottom: 23px;
    text-align: center;
}
|
| 498 |
+
|
| 499 |
+
/* Large clickable glyph; zero line-height, margin and padding keep it
   from adding vertical space despite the 30px font size. */
.three-dots{
    font-size: 30px;
    margin: 0;
    line-height:0;
    vertical-align: top;
    padding: 0;
    cursor: pointer;
}

/* Shrinks slightly while hovered. */
.three-dots:hover{
    font-size: 25px;
}
|
static/top-icon.png
ADDED
|
|
templates/index.html
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<link rel="preload" href="../static/loader.gif" as="image">
|
| 7 |
+
<link rel="preload" href="../static/favicon_new.png" as="image">
|
| 8 |
+
<link rel="preload" href="../static/refresh_reload_icon.png" as="image">
|
| 9 |
+
<link rel="preload" href="../static/top-icon.png" as="image">
|
| 10 |
+
<link rel="icon" href="../static/favicon_new.png" type="image/png">
|
| 11 |
+
|
| 12 |
+
<meta charset="UTF-8">
|
| 13 |
+
<title>Latest News</title>
|
| 14 |
+
<link rel="stylesheet" href="static/styles.css">
|
| 15 |
+
<a id="top-loc"></a>
|
| 16 |
+
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
|
| 17 |
+
<script>
|
| 18 |
+
// Fade out the loading overlay once the window "load" event has fired.
// .on('load', ...) replaces the deprecated .load(handler) shorthand, which
// was removed in jQuery 3; both behave the same on the 1.12.4 build used here.
$(window).on('load', function(){
    $('.loader').fadeOut();
});
|
| 21 |
+
</script>
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
<script>
|
| 26 |
+
/**
 * Show only the .news-item elements that match the text in #keywordInput
 * and highlight the matches.
 *
 * When match_case is false the keyword is matched as a whole word,
 * ignoring case, against the headline and description. Otherwise it is
 * matched as a case-sensitive substring against headline, description,
 * category and time. An empty keyword resets the view via clearFilter().
 *
 * @param {boolean} match_case false for whole-word/case-insensitive,
 *                             true for case-sensitive substring matching.
 */
function filterContent(match_case) {
    var keyword = document.getElementById("keywordInput").value;
    var clearbtn = document.getElementById("clearBtn");

    if (keyword === "") {
        clearFilter();
        return;
    }
    clearbtn.style.opacity = 1;

    var wholeWord = (match_case == false);

    // Escape regex metacharacters so that input such as "(" or "c++" is
    // matched literally instead of throwing a SyntaxError.
    var escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // Built once for all items. No "g" flag: test() on a global regex is
    // stateful (lastIndex) and must not be reused across strings.
    var wordRegex = new RegExp("\\b" + escaped + "\\b", "i");

    var selectors = wholeWord
        ? ['.headline', '.description']
        : ['.headline', '.description', '.article-category', '.time'];

    var items = document.getElementsByClassName("news-item");
    for (var i = 0; i < items.length; i++) {
        var item = items[i];

        // Collect the sub-elements that exist; an item missing one of them
        // is still filtered on the remaining fields.
        var fields = [];
        var texts = [];
        for (var j = 0; j < selectors.length; j++) {
            var el = item.querySelector(selectors[j]);
            if (el !== null) {
                fields.push(el);
                texts.push(el.textContent);
            }
        }
        var itemText = texts.join(" ");

        var matched = wholeWord ? wordRegex.test(itemText) : itemText.includes(keyword);
        if (matched) {
            item.style.display = "block";
            for (var k = 0; k < fields.length; k++) {
                highlightKeyword(fields[k], keyword, match_case);
            }
        } else {
            item.style.display = "none";
        }
    }
}
|
| 91 |
+
|
| 92 |
+
/**
 * Reset the keyword filter: show every .news-item, strip highlight markup
 * from its text fields, hide the clear button and empty #keywordInput.
 */
function clearFilter() {
    var items = document.getElementsByClassName("news-item");
    var clearbtn = document.getElementById("clearBtn");
    clearbtn.style.opacity=0;

    var selectors = ['.headline', '.description', '.article-category', '.time'];
    for (var i = 0; i < items.length; i++) {
        items[i].style.display = "block";
        for (var j = 0; j < selectors.length; j++) {
            var el = items[i].querySelector(selectors[j]);
            if (el !== null) {
                // Assigning textContent replaces the children with a single
                // text node, which removes the highlight spans. Unlike
                // innerHTML = textContent, the text is never re-parsed as
                // HTML, so "<" and "&" in an article stay literal.
                el.textContent = el.textContent;
            }
        }
    }

    document.getElementById("keywordInput").value = ""; // Clear input field
}
|
| 110 |
+
|
| 111 |
+
function highlightKeyword(element, keyword, match_case) {
|
| 112 |
+
var regex = new RegExp(keyword);
|
| 113 |
+
if (match_case == false)
|
| 114 |
+
{
|
| 115 |
+
var regex = new RegExp("\\b" + keyword + "\\b", 'gi');
|
| 116 |
+
}
|
| 117 |
+
element.innerHTML = element.textContent.replace(regex, function(match) {
|
| 118 |
+
return '<span class="highlight">' + match + '</span>';
|
| 119 |
+
});
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
</script>
|
| 123 |
+
|
| 124 |
+
<script>
|
| 125 |
+
// Wire the "show more" / "show less" buttons of every .container once the
// DOM is ready. Each container toggles only its own .content element.
document.addEventListener('DOMContentLoaded', function() {
    const containers = document.querySelectorAll('.container');

    containers.forEach(container => {
        const content = container.querySelector('.content');
        const showMoreBtn = container.querySelector('.show-more');
        const showLessBtn = container.querySelector('.show-less');

        // Skip containers that lack any of the three parts. Without this
        // guard the null dereference would throw and leave all remaining
        // containers unwired.
        if (!content || !showMoreBtn || !showLessBtn) {
            return;
        }

        showMoreBtn.addEventListener('click', function() {
            content.style.display = 'block';
            content.style.opacity = 1;
            showMoreBtn.style.display = 'none';
            showLessBtn.style.display = 'block';
        });

        showLessBtn.addEventListener('click', function() {
            // Switch off smooth scrolling so the compensation below is applied
            // instantly instead of being animated.
            document.documentElement.style.scrollBehavior = "auto";
            var max_h = content.parentElement.parentElement.clientHeight;
            content.style.display = 'none';
            showMoreBtn.style.display = 'block';
            showLessBtn.style.display = 'none';
            var min_h = content.parentElement.parentElement.clientHeight;
            // Scroll up by the height that just collapsed so the view stays
            // anchored; "|| 0" falls back to the top if the result is NaN.
            $(window).scrollTop($(window).scrollTop() - (max_h - min_h) || 0);
            document.documentElement.style.scrollBehavior = "smooth";
        });
    });
});
|
| 162 |
+
</script>
|
| 163 |
+
|
| 164 |
+
<script>
|
| 165 |
+
/**
 * Run a keyword search from outside the input box: reset the current
 * filter, put keyword into #keywordInput and apply the whole-word,
 * case-insensitive filter (filterContent(false)).
 * Presumably invoked when a word-cloud token is clicked - confirm in the
 * generated markup.
 */
function wc_search(keyword)
{
    clearFilter();
    document.getElementById("keywordInput").value = keyword;
    filterContent(false);
}
|
| 171 |
+
|
| 172 |
+
/**
 * Toggle the element with id "word-cloud-section-id" between shown
 * (display: block) and hidden (display: none). Any inline display value
 * other than 'block', including the initial empty one, counts as hidden.
 */
function word_cloud_display()
{
    var section = document.getElementById("word-cloud-section-id");
    var isShown = (section.style.display == 'block');
    section.style.display = isShown ? 'none' : 'block';
}
|
| 184 |
+
</script>
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
</head>
|
| 189 |
+
<body>
|
| 190 |
+
<div class="loader"></div>
|
| 191 |
+
{{body | safe}}
|
| 192 |
+
|
| 193 |
+
<a id="top_theme" class="top-float" onclick="window.scrollTo(0, 0);">
|
| 194 |
+
<img id="top-theme-icon" alt="_" src="../static/top-icon.png" style="border-radius:50%; border: 3px solid #b9bfc4; background-color: white;" width="25px" height="25px" border="0">
|
| 195 |
+
</a>
|
| 196 |
+
<a href="javascript:window.location.reload(true)" id="theme" class="float">
|
| 197 |
+
<img id="theme-icon" alt="_" src="../static/refresh_reload_icon.png" style="border-radius:50%; border: 3px solid #b9bfc4; background-color: white;" width=25px height=25px border="0" />
|
| 198 |
+
</a>
|
| 199 |
+
</body>
|
| 200 |
+
</html>
|
word_cloud.py
ADDED
|
@@ -0,0 +1,653 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import string
|
| 4 |
+
from unidecode import unidecode
|
| 5 |
+
from collections import Counter
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class TextPreprocessor:
|
| 9 |
+
def __init__(self, remove_punct: bool = True, remove_digits: bool = True,
|
| 10 |
+
remove_stop_words: bool = True,
|
| 11 |
+
remove_short_words: bool = False, minlen: int = 1, maxlen: int = 1, top_p: float = None,
|
| 12 |
+
bottom_p: float = None):
|
| 13 |
+
self.remove_punct = remove_punct
|
| 14 |
+
self.remove_digits = remove_digits
|
| 15 |
+
self.remove_stop_words = remove_stop_words
|
| 16 |
+
self.remove_short_words = remove_short_words
|
| 17 |
+
self.minlen = minlen
|
| 18 |
+
self.maxlen = maxlen
|
| 19 |
+
self.top_p = top_p
|
| 20 |
+
self.bottom_p = bottom_p
|
| 21 |
+
self.words_to_remove = []
|
| 22 |
+
self.stop_words = ["'d", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
|
| 23 |
+
'about',
|
| 24 |
+
'above',
|
| 25 |
+
'across',
|
| 26 |
+
'after',
|
| 27 |
+
'afterwards',
|
| 28 |
+
'again',
|
| 29 |
+
'against',
|
| 30 |
+
'ain',
|
| 31 |
+
'all',
|
| 32 |
+
'almost',
|
| 33 |
+
'alone',
|
| 34 |
+
'along',
|
| 35 |
+
'already',
|
| 36 |
+
'also',
|
| 37 |
+
'although',
|
| 38 |
+
'always',
|
| 39 |
+
'am',
|
| 40 |
+
'among',
|
| 41 |
+
'amongst',
|
| 42 |
+
'amount',
|
| 43 |
+
'an',
|
| 44 |
+
'and',
|
| 45 |
+
'another',
|
| 46 |
+
'any',
|
| 47 |
+
'anyhow',
|
| 48 |
+
'anyone',
|
| 49 |
+
'anything',
|
| 50 |
+
'anyway',
|
| 51 |
+
'anywhere',
|
| 52 |
+
'are',
|
| 53 |
+
'around',
|
| 54 |
+
'as',
|
| 55 |
+
'at',
|
| 56 |
+
'back',
|
| 57 |
+
'be',
|
| 58 |
+
'became',
|
| 59 |
+
'because',
|
| 60 |
+
'become',
|
| 61 |
+
'becomes',
|
| 62 |
+
'becoming',
|
| 63 |
+
'been',
|
| 64 |
+
'before',
|
| 65 |
+
'beforehand',
|
| 66 |
+
'behind',
|
| 67 |
+
'being',
|
| 68 |
+
'below',
|
| 69 |
+
'beside',
|
| 70 |
+
'besides',
|
| 71 |
+
'between',
|
| 72 |
+
'beyond',
|
| 73 |
+
'both',
|
| 74 |
+
'bottom',
|
| 75 |
+
'but',
|
| 76 |
+
'by',
|
| 77 |
+
'ca',
|
| 78 |
+
'call',
|
| 79 |
+
'can',
|
| 80 |
+
'cannot',
|
| 81 |
+
'could',
|
| 82 |
+
'couldn',
|
| 83 |
+
"couldn't",
|
| 84 |
+
'd',
|
| 85 |
+
'did',
|
| 86 |
+
'do',
|
| 87 |
+
'does',
|
| 88 |
+
'doing',
|
| 89 |
+
'done',
|
| 90 |
+
'down',
|
| 91 |
+
'due',
|
| 92 |
+
'during',
|
| 93 |
+
'each',
|
| 94 |
+
'eight',
|
| 95 |
+
'either',
|
| 96 |
+
'eleven',
|
| 97 |
+
'else',
|
| 98 |
+
'elsewhere',
|
| 99 |
+
'empty',
|
| 100 |
+
'enough',
|
| 101 |
+
'even',
|
| 102 |
+
'ever',
|
| 103 |
+
'every',
|
| 104 |
+
'everyone',
|
| 105 |
+
'everything',
|
| 106 |
+
'everywhere',
|
| 107 |
+
'except',
|
| 108 |
+
'few',
|
| 109 |
+
'fifteen',
|
| 110 |
+
'fifty',
|
| 111 |
+
'first',
|
| 112 |
+
'five',
|
| 113 |
+
'for',
|
| 114 |
+
'former',
|
| 115 |
+
'formerly',
|
| 116 |
+
'forty',
|
| 117 |
+
'four',
|
| 118 |
+
'from',
|
| 119 |
+
'front',
|
| 120 |
+
'full',
|
| 121 |
+
'further',
|
| 122 |
+
'get',
|
| 123 |
+
'give',
|
| 124 |
+
'go',
|
| 125 |
+
'had',
|
| 126 |
+
'has',
|
| 127 |
+
'have',
|
| 128 |
+
'having',
|
| 129 |
+
'he',
|
| 130 |
+
'hence',
|
| 131 |
+
'her',
|
| 132 |
+
'here',
|
| 133 |
+
'hereafter',
|
| 134 |
+
'hereby',
|
| 135 |
+
'herein',
|
| 136 |
+
'hereupon',
|
| 137 |
+
'hers',
|
| 138 |
+
'herself',
|
| 139 |
+
'him',
|
| 140 |
+
'himself',
|
| 141 |
+
'his',
|
| 142 |
+
'how',
|
| 143 |
+
'however',
|
| 144 |
+
'hundred',
|
| 145 |
+
'i',
|
| 146 |
+
'if',
|
| 147 |
+
'in',
|
| 148 |
+
'indeed',
|
| 149 |
+
'into',
|
| 150 |
+
'is',
|
| 151 |
+
'it',
|
| 152 |
+
"it's",
|
| 153 |
+
'its',
|
| 154 |
+
'itself',
|
| 155 |
+
'just',
|
| 156 |
+
'keep',
|
| 157 |
+
'last',
|
| 158 |
+
'latter',
|
| 159 |
+
'latterly',
|
| 160 |
+
'least',
|
| 161 |
+
'less',
|
| 162 |
+
'll',
|
| 163 |
+
'm',
|
| 164 |
+
'ma',
|
| 165 |
+
'made',
|
| 166 |
+
'make',
|
| 167 |
+
'many',
|
| 168 |
+
'say',
|
| 169 |
+
'said',
|
| 170 |
+
'says',
|
| 171 |
+
'told',
|
| 172 |
+
'tell',
|
| 173 |
+
'may',
|
| 174 |
+
'me',
|
| 175 |
+
'meanwhile',
|
| 176 |
+
'might',
|
| 177 |
+
'mine',
|
| 178 |
+
'more',
|
| 179 |
+
'moreover',
|
| 180 |
+
'most',
|
| 181 |
+
'mostly',
|
| 182 |
+
'move',
|
| 183 |
+
'much',
|
| 184 |
+
'must',
|
| 185 |
+
'my',
|
| 186 |
+
'myself',
|
| 187 |
+
'name',
|
| 188 |
+
'namely',
|
| 189 |
+
'neither',
|
| 190 |
+
'never',
|
| 191 |
+
'nevertheless',
|
| 192 |
+
'next',
|
| 193 |
+
'nine',
|
| 194 |
+
'no',
|
| 195 |
+
'nobody',
|
| 196 |
+
'none',
|
| 197 |
+
'noone',
|
| 198 |
+
'nor',
|
| 199 |
+
'not',
|
| 200 |
+
'nothing',
|
| 201 |
+
'now',
|
| 202 |
+
'nowhere',
|
| 203 |
+
'o',
|
| 204 |
+
'of',
|
| 205 |
+
'off',
|
| 206 |
+
'often',
|
| 207 |
+
'on',
|
| 208 |
+
'once',
|
| 209 |
+
'one',
|
| 210 |
+
'only',
|
| 211 |
+
'onto',
|
| 212 |
+
'or',
|
| 213 |
+
'other',
|
| 214 |
+
'others',
|
| 215 |
+
'otherwise',
|
| 216 |
+
'our',
|
| 217 |
+
'ours',
|
| 218 |
+
'ourselves',
|
| 219 |
+
'out',
|
| 220 |
+
'over',
|
| 221 |
+
'own',
|
| 222 |
+
'part',
|
| 223 |
+
'per',
|
| 224 |
+
'perhaps',
|
| 225 |
+
'please',
|
| 226 |
+
'put',
|
| 227 |
+
'quite',
|
| 228 |
+
'rather',
|
| 229 |
+
're',
|
| 230 |
+
'rs',
|
| 231 |
+
'really',
|
| 232 |
+
'regarding',
|
| 233 |
+
's',
|
| 234 |
+
'same',
|
| 235 |
+
'say',
|
| 236 |
+
'see',
|
| 237 |
+
'seem',
|
| 238 |
+
'seemed',
|
| 239 |
+
'seeming',
|
| 240 |
+
'seems',
|
| 241 |
+
'serious',
|
| 242 |
+
'several',
|
| 243 |
+
'shan',
|
| 244 |
+
"shan't",
|
| 245 |
+
'she',
|
| 246 |
+
"she's",
|
| 247 |
+
'should',
|
| 248 |
+
"should've",
|
| 249 |
+
'shouldn',
|
| 250 |
+
"shouldn't",
|
| 251 |
+
'show',
|
| 252 |
+
'side',
|
| 253 |
+
'since',
|
| 254 |
+
'six',
|
| 255 |
+
'sixty',
|
| 256 |
+
'so',
|
| 257 |
+
'some',
|
| 258 |
+
'somehow',
|
| 259 |
+
'someone',
|
| 260 |
+
'something',
|
| 261 |
+
'sometime',
|
| 262 |
+
'sometimes',
|
| 263 |
+
'somewhere',
|
| 264 |
+
'still',
|
| 265 |
+
'such',
|
| 266 |
+
't',
|
| 267 |
+
'take',
|
| 268 |
+
'ten',
|
| 269 |
+
'than',
|
| 270 |
+
'that',
|
| 271 |
+
"that'll",
|
| 272 |
+
'the',
|
| 273 |
+
'their',
|
| 274 |
+
'theirs',
|
| 275 |
+
'them',
|
| 276 |
+
'themselves',
|
| 277 |
+
'then',
|
| 278 |
+
'thence',
|
| 279 |
+
'there',
|
| 280 |
+
'thereafter',
|
| 281 |
+
'thereby',
|
| 282 |
+
'therefore',
|
| 283 |
+
'therein',
|
| 284 |
+
'thereupon',
|
| 285 |
+
'these',
|
| 286 |
+
'they',
|
| 287 |
+
'third',
|
| 288 |
+
'this',
|
| 289 |
+
'those',
|
| 290 |
+
'though',
|
| 291 |
+
'three',
|
| 292 |
+
'through',
|
| 293 |
+
'throughout',
|
| 294 |
+
'thru',
|
| 295 |
+
'thus',
|
| 296 |
+
'to',
|
| 297 |
+
'together',
|
| 298 |
+
'too',
|
| 299 |
+
'top',
|
| 300 |
+
'toward',
|
| 301 |
+
'towards',
|
| 302 |
+
'twelve',
|
| 303 |
+
'twenty',
|
| 304 |
+
'two',
|
| 305 |
+
'under',
|
| 306 |
+
'unless',
|
| 307 |
+
'until',
|
| 308 |
+
'up',
|
| 309 |
+
'upon',
|
| 310 |
+
'us',
|
| 311 |
+
'used',
|
| 312 |
+
'using',
|
| 313 |
+
'various',
|
| 314 |
+
've',
|
| 315 |
+
'very',
|
| 316 |
+
'via',
|
| 317 |
+
'was',
|
| 318 |
+
'wasn',
|
| 319 |
+
"wasn't",
|
| 320 |
+
'we',
|
| 321 |
+
'well',
|
| 322 |
+
'were',
|
| 323 |
+
'weren',
|
| 324 |
+
"weren't",
|
| 325 |
+
'what',
|
| 326 |
+
'whatever',
|
| 327 |
+
'when',
|
| 328 |
+
'whence',
|
| 329 |
+
'whenever',
|
| 330 |
+
'where',
|
| 331 |
+
'whereafter',
|
| 332 |
+
'whereas',
|
| 333 |
+
'whereby',
|
| 334 |
+
'wherein',
|
| 335 |
+
'whereupon',
|
| 336 |
+
'wherever',
|
| 337 |
+
'whether',
|
| 338 |
+
'which',
|
| 339 |
+
'while',
|
| 340 |
+
'whither',
|
| 341 |
+
'who',
|
| 342 |
+
'whoever',
|
| 343 |
+
'whole',
|
| 344 |
+
'whom',
|
| 345 |
+
'whose',
|
| 346 |
+
'why',
|
| 347 |
+
'will',
|
| 348 |
+
'with',
|
| 349 |
+
'within',
|
| 350 |
+
'without',
|
| 351 |
+
'won',
|
| 352 |
+
"won't",
|
| 353 |
+
'would',
|
| 354 |
+
'wouldn',
|
| 355 |
+
"wouldn't",
|
| 356 |
+
'y',
|
| 357 |
+
'yet',
|
| 358 |
+
'you',
|
| 359 |
+
"you'd",
|
| 360 |
+
"you'll",
|
| 361 |
+
"you're",
|
| 362 |
+
"you've",
|
| 363 |
+
'your',
|
| 364 |
+
'yours',
|
| 365 |
+
'yourself',
|
| 366 |
+
'yourselves',
|
| 367 |
+
'‘d',
|
| 368 |
+
'‘ll',
|
| 369 |
+
'‘m',
|
| 370 |
+
'‘re',
|
| 371 |
+
'‘s',
|
| 372 |
+
'‘ve',
|
| 373 |
+
'’d',
|
| 374 |
+
'’ll',
|
| 375 |
+
'’m',
|
| 376 |
+
'’re',
|
| 377 |
+
'new',
|
| 378 |
+
'old',
|
| 379 |
+
'’s',
|
| 380 |
+
'’ve']
|
| 381 |
+
|
| 382 |
+
self.contraction_to_expansion = {"ain't": "am not",
|
| 383 |
+
"aren't": "are not",
|
| 384 |
+
"can't": "cannot",
|
| 385 |
+
"can't've": "cannot have",
|
| 386 |
+
"'cause": "because",
|
| 387 |
+
"could've": "could have",
|
| 388 |
+
"couldn't": "could not",
|
| 389 |
+
"couldn't've": "could not have",
|
| 390 |
+
"didn't": "did not",
|
| 391 |
+
"doesn't": "does not",
|
| 392 |
+
"don't": "do not",
|
| 393 |
+
"hadn't": "had not",
|
| 394 |
+
"hadn't've": "had not have",
|
| 395 |
+
"hasn't": "has not",
|
| 396 |
+
"haven't": "have not",
|
| 397 |
+
"he'd": "he would",
|
| 398 |
+
"he'd've": "he would have",
|
| 399 |
+
"he'll": "he will",
|
| 400 |
+
"he'll've": "he will have",
|
| 401 |
+
"he's": "he is",
|
| 402 |
+
"how'd": "how did",
|
| 403 |
+
"how'd'y": "how do you",
|
| 404 |
+
"how'll": "how will",
|
| 405 |
+
"how's": "how is",
|
| 406 |
+
"i'd": "i would",
|
| 407 |
+
"i'd've": "i would have",
|
| 408 |
+
"i'll": "i will",
|
| 409 |
+
"i'll've": "i will have",
|
| 410 |
+
"i'm": "i am",
|
| 411 |
+
"i've": "i have",
|
| 412 |
+
"isn't": "is not",
|
| 413 |
+
"it'd": "it had",
|
| 414 |
+
"it'd've": "it would have",
|
| 415 |
+
"it'll": "it will",
|
| 416 |
+
"it'll've": "it will have",
|
| 417 |
+
"it's": "it is",
|
| 418 |
+
"let's": "let us",
|
| 419 |
+
"ma'am": "madam",
|
| 420 |
+
"mayn't": "may not",
|
| 421 |
+
"might've": "might have",
|
| 422 |
+
"mightn't": "might not",
|
| 423 |
+
"mightn't've": "might not have",
|
| 424 |
+
"must've": "must have",
|
| 425 |
+
"mustn't": "must not",
|
| 426 |
+
"mustn't've": "must not have",
|
| 427 |
+
"needn't": "need not",
|
| 428 |
+
"needn't've": "need not have",
|
| 429 |
+
"o'clock": "of the clock",
|
| 430 |
+
"oughtn't": "ought not",
|
| 431 |
+
"oughtn't've": "ought not have",
|
| 432 |
+
"shan't": "shall not",
|
| 433 |
+
"sha'n't": "shall not",
|
| 434 |
+
"shan't've": "shall not have",
|
| 435 |
+
"she'd": "she would",
|
| 436 |
+
"she'd've": "she would have",
|
| 437 |
+
"she'll": "she will",
|
| 438 |
+
"she'll've": "she will have",
|
| 439 |
+
"she's": "she is",
|
| 440 |
+
"should've": "should have",
|
| 441 |
+
"shouldn't": "should not",
|
| 442 |
+
"shouldn't've": "should not have",
|
| 443 |
+
"so've": "so have",
|
| 444 |
+
"so's": "so is",
|
| 445 |
+
"that'd": "that would",
|
| 446 |
+
"that'd've": "that would have",
|
| 447 |
+
"that's": "that is",
|
| 448 |
+
"there'd": "there had",
|
| 449 |
+
"there'd've": "there would have",
|
| 450 |
+
"there's": "there is",
|
| 451 |
+
"they'd": "they would",
|
| 452 |
+
"they'd've": "they would have",
|
| 453 |
+
"they'll": "they will",
|
| 454 |
+
"they'll've": "they will have",
|
| 455 |
+
"they're": "they are",
|
| 456 |
+
"they've": "they have",
|
| 457 |
+
"to've": "to have",
|
| 458 |
+
"wasn't": "was not",
|
| 459 |
+
"we'd": "we had",
|
| 460 |
+
"we'd've": "we would have",
|
| 461 |
+
"we'll": "we will",
|
| 462 |
+
"we'll've": "we will have",
|
| 463 |
+
"we're": "we are",
|
| 464 |
+
"we've": "we have",
|
| 465 |
+
"weren't": "were not",
|
| 466 |
+
"what'll": "what will",
|
| 467 |
+
"what'll've": "what will have",
|
| 468 |
+
"what're": "what are",
|
| 469 |
+
"what's": "what is",
|
| 470 |
+
"what've": "what have",
|
| 471 |
+
"when's": "when is",
|
| 472 |
+
"when've": "when have",
|
| 473 |
+
"where'd": "where did",
|
| 474 |
+
"where's": "where is",
|
| 475 |
+
"where've": "where have",
|
| 476 |
+
"who'll": "who will",
|
| 477 |
+
"who'll've": "who will have",
|
| 478 |
+
"who's": "who is",
|
| 479 |
+
"who've": "who have",
|
| 480 |
+
"why's": "why is",
|
| 481 |
+
"why've": "why have",
|
| 482 |
+
"will've": "will have",
|
| 483 |
+
"won't": "will not",
|
| 484 |
+
"won't've": "will not have",
|
| 485 |
+
"would've": "would have",
|
| 486 |
+
"wouldn't": "would not",
|
| 487 |
+
"wouldn't've": "would not have",
|
| 488 |
+
"y'all": "you all",
|
| 489 |
+
"y'alls": "you alls",
|
| 490 |
+
"y'all'd": "you all would",
|
| 491 |
+
"y'all'd've": "you all would have",
|
| 492 |
+
"y'all're": "you all are",
|
| 493 |
+
"y'all've": "you all have",
|
| 494 |
+
"you'd": "you had",
|
| 495 |
+
"you'd've": "you would have",
|
| 496 |
+
"you'll": "you you will",
|
| 497 |
+
"you'll've": "you you will have",
|
| 498 |
+
"you're": "you are",
|
| 499 |
+
"you've": "you have"
|
| 500 |
+
}
|
| 501 |
+
|
| 502 |
+
@staticmethod
|
| 503 |
+
def __remove_double_whitespaces(string: str):
|
| 504 |
+
return " ".join(string.split())
|
| 505 |
+
|
| 506 |
+
def __remove_url(self, string_series: pd.Series):
|
| 507 |
+
"""
|
| 508 |
+
Removes URLs m text
|
| 509 |
+
:param string_series: pd.Series, input string series
|
| 510 |
+
:return: pd.Series, cleaned string series
|
| 511 |
+
"""
|
| 512 |
+
clean_string_series = string_series.str.replace(
|
| 513 |
+
pat=r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})",
|
| 514 |
+
repl=" ", regex=True).copy()
|
| 515 |
+
return clean_string_series.map(self.__remove_double_whitespaces)
|
| 516 |
+
|
| 517 |
+
def __expand(self, string_series: pd.Series):
|
| 518 |
+
"""
|
| 519 |
+
Replaces contractions with expansions. eg. don't wit do not.
|
| 520 |
+
:param string_series: pd.Series, input string series
|
| 521 |
+
:return: pd.Series, cleaned string series
|
| 522 |
+
"""
|
| 523 |
+
clean_string_series = string_series.copy()
|
| 524 |
+
for c, e in self.contraction_to_expansion.items():
|
| 525 |
+
clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False).copy()
|
| 526 |
+
return clean_string_series.map(self.__remove_double_whitespaces)
|
| 527 |
+
|
| 528 |
+
def __remove_punct(self, string_series: pd.Series):
|
| 529 |
+
"""
|
| 530 |
+
Removes punctuations from the input string.
|
| 531 |
+
:param string_series: pd.Series, input string series
|
| 532 |
+
:return: pd.Series, cleaned string series
|
| 533 |
+
"""
|
| 534 |
+
clean_string_series = string_series.copy()
|
| 535 |
+
puncts = [r'\n', r'\r', r'\t']
|
| 536 |
+
puncts.extend(list(string.punctuation))
|
| 537 |
+
for i in puncts:
|
| 538 |
+
clean_string_series = clean_string_series.str.replace(pat=i, repl=" ", regex=False).copy()
|
| 539 |
+
return clean_string_series.map(self.__remove_double_whitespaces)
|
| 540 |
+
|
| 541 |
+
def __remove_digits(self, string_series: pd.Series):
|
| 542 |
+
"""
|
| 543 |
+
Removes digits from the input string.
|
| 544 |
+
:param string_series: pd.Series, input string series
|
| 545 |
+
:return: pd.Series, cleaned string series
|
| 546 |
+
"""
|
| 547 |
+
clean_string_series = string_series.str.replace(pat=r'\d', repl=" ", regex=True).copy()
|
| 548 |
+
return clean_string_series.map(self.__remove_double_whitespaces)
|
| 549 |
+
|
| 550 |
+
@staticmethod
|
| 551 |
+
def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
|
| 552 |
+
"""
|
| 553 |
+
Reomves words/tokens where minlen <= len <= maxlen.
|
| 554 |
+
:param string_series: pd.Series, input string series
|
| 555 |
+
:param minlen: int, minimum length of token to be removed.
|
| 556 |
+
:param maxlen: int, maximum length of token to be removed.
|
| 557 |
+
:return: pd.Series, cleaned string series
|
| 558 |
+
"""
|
| 559 |
+
clean_string_series = string_series.map(lambda string: " ".join([word for word in string.split() if
|
| 560 |
+
(len(word) > maxlen) or (len(word) < minlen)]))
|
| 561 |
+
return clean_string_series
|
| 562 |
+
|
| 563 |
+
def __remove_stop_words(self, string_series: pd.Series):
    """
    Removes stop words from the input string.

    Each string is split on whitespace and every token found in
    ``self.stop_words`` is dropped; the remaining tokens are re-joined with
    single spaces.

    :param string_series: pd.Series, input string series
    :return: pd.Series, cleaned string series
    """
    # Build the lookup once for the whole series: a frozenset gives constant
    # time membership tests even when self.stop_words is a plain list.
    stops = frozenset(self.stop_words)

    def str_remove_stop_words(string: str):
        return " ".join([token for token in string.split() if token not in stops])

    return string_series.map(str_remove_stop_words)
|
| 574 |
+
|
| 575 |
+
def __remove_top_bottom_words(self, string_series: pd.Series, top_p: float = None,
                              bottom_p: float = None, dataset: str = 'train'):
    """
    Removes top_p percent (frequent) words and bottom_p percent (rare) words.

    With ``dataset == 'train'`` the vocabulary of ``string_series`` is ranked by
    frequency and the selected words are added to ``self.words_to_remove``.
    For any other ``dataset`` value nothing is learned. In both cases every
    token currently listed in ``self.words_to_remove`` is dropped from each string.

    :param string_series: pd.Series, input string series
    :param top_p: float, fraction of the distinct words (most frequent first) to remove.
    :param bottom_p: float, fraction of the distinct words (rarest last) to remove.
    :param dataset: str, "train" for training set, "test" for val/dev/test set.
    :return: pd.Series, cleaned string series
    """
    if dataset == 'train':
        if top_p is None:
            top_p = 0
        if bottom_p is None:
            bottom_p = 0

        if top_p > 0 or bottom_p > 0:
            # value_counts() sorts by descending frequency, so the head of the
            # index holds the most frequent words and the tail the rarest.
            word_freq = pd.Series(" ".join(string_series).split()).value_counts()
            n_words = len(word_freq)

            candidates = []
            if top_p > 0:
                candidates.extend(word_freq.index[: int(np.ceil(top_p * n_words))])
            if bottom_p > 0:
                candidates.extend(word_freq.index[-int(np.ceil(bottom_p * n_words)):])

            # Append only unseen words so repeated "train" calls (or overlapping
            # top/bottom slices) do not pile up duplicates in the list.
            already_listed = set(self.words_to_remove)
            for word in candidates:
                if word not in already_listed:
                    already_listed.add(word)
                    self.words_to_remove.append(word)

    if not self.words_to_remove:
        return string_series

    # Set lookup keeps the per-token filter constant time.
    blocked = set(self.words_to_remove)
    return string_series.map(lambda text: " ".join([word for word in text.split()
                                                    if word not in blocked]))
|
| 607 |
+
|
| 608 |
+
def preprocess(self, string_series: pd.Series, dataset: str = "train"):
    """
    Entry point.

    Runs the cleaning steps in a fixed order: lower-casing, ``unidecode``,
    URL removal and expansion always run; punctuation, digit, stop-word and
    short-word removal run only when the matching ``self.remove_*`` flag is
    set; frequent/rare word removal runs last. Strings that end up empty are
    replaced by the placeholder ``"this is an empty message"``.

    :param string_series: pd.Series, input string series
    :param dataset: str, "train" for training set, "test" for val/dev/test set.
    :return: pd.Series, cleaned string series
    """
    # Missing values would survive .str.lower() as NaN and then make
    # unidecode fail on a non-string; treat them as empty text instead so
    # they end up as the placeholder below.
    string_series = string_series.fillna("").astype(str)

    string_series = string_series.str.lower()
    string_series = string_series.map(unidecode)
    string_series = self.__remove_url(string_series=string_series)
    string_series = self.__expand(string_series=string_series)

    if self.remove_punct:
        string_series = self.__remove_punct(string_series=string_series)
    if self.remove_digits:
        string_series = self.__remove_digits(string_series=string_series)
    if self.remove_stop_words:
        string_series = self.__remove_stop_words(string_series=string_series)
    if self.remove_short_words:
        string_series = self.__remove_short_words(string_series=string_series,
                                                  minlen=self.minlen,
                                                  maxlen=self.maxlen)
    string_series = self.__remove_top_bottom_words(string_series=string_series,
                                                   top_p=self.top_p,
                                                   bottom_p=self.bottom_p, dataset=dataset)

    string_series = string_series.str.strip()
    # Reassign instead of mutating in place: the previous step may have
    # handed back the very object it received.
    string_series = string_series.replace(to_replace="", value="this is an empty message")

    return string_series
|
| 638 |
+
|
| 639 |
+
|
| 640 |
+
def get_frequent_words_html(df):
    """
    Builds the HTML snippet for the word cloud of the 25 most frequent tokens.

    The ``title`` and ``description`` columns of ``df`` are concatenated,
    cleaned with a default ``TextPreprocessor`` and tokenised on whitespace.
    Each token becomes an ``<a class="wc-tokens">`` element that calls
    ``wc_search`` with the token on click, followed by 3 to 6 random spaces.
    The first five tokens sit directly in the container; any further tokens go
    into a nested ``word-cloud-section`` div.

    :param df: pd.DataFrame with ``title`` and ``description`` columns
    :return: str, HTML markup of the word cloud
    """
    import html
    import json

    text_preprocess = TextPreprocessor()
    # A missing title or description would turn the whole concatenation into
    # NaN, so blank them out first.
    combined_text = df['title'].fillna('') + ' ' + df['description'].fillna('')
    preprocessed_txt = text_preprocess.preprocess(combined_text)
    counter = Counter(' '.join(preprocessed_txt).split())

    parts = ['<div class="word-cloud-container">']
    section_open = False
    for position, (token, _count) in enumerate(counter.most_common(25), start=1):
        # Tokens originate from external text: escape them for the JS string
        # literal first, then for HTML. Plain alphanumeric tokens come out
        # unchanged, so the markup for ordinary words stays the same.
        js_token = html.escape(json.dumps(token)[1:-1], quote=True)
        label = html.escape(token, quote=False)
        padding = " " * int(np.random.randint(3, 7))
        parts.append(f'<a class="wc-tokens" onclick=wc_search("{js_token}")>{label}</a>{padding}')
        if position == 5:
            parts.append('<div class="word-cloud-section" id="word-cloud-section-id">')
            section_open = True

    # Close the nested section only when it was actually opened; with fewer
    # than five tokens a second closing tag would be unbalanced.
    parts.append('</div></div>' if section_open else '</div>')
    return ''.join(parts)
|