Spaces:
Running
Running
Upload 23 files
Browse files- .gitattributes +35 -35
- Dockerfile +9 -9
- README.md +11 -11
- app.py +258 -262
- config.py +2 -2
- db_operations/db_operations.py +69 -69
- indian_news_app_load_tests/indian_news_app_100_conc_users.html +0 -0
- indian_news_app_load_tests/indian_news_app_1_user.html +0 -0
- indian_news_app_load_tests/indian_news_app_25_conc_users.html +0 -0
- indian_news_app_load_tests/indian_news_app_50_conc_users.html +0 -0
- indian_news_app_load_tests/indian_news_app_5_conc_users.html +0 -0
- newrelic.ini +255 -0
- requirements.txt +16 -15
- start.sh +5 -5
- static/styles.css +509 -509
- templates/index.html +205 -205
- word_cloud.py +652 -652
.gitattributes
CHANGED
|
@@ -1,35 +1,35 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
-
FROM python:3.9-slim
|
| 2 |
-
WORKDIR /webapp
|
| 3 |
-
COPY . .
|
| 4 |
-
RUN chmod +x /webapp/start.sh
|
| 5 |
-
RUN pip install --upgrade pip
|
| 6 |
-
RUN pip install --no-cache-dir -r requirements.txt
|
| 7 |
-
RUN apt update && apt install -y redis-server
|
| 8 |
-
EXPOSE 7860 6379
|
| 9 |
-
CMD ["/webapp/start.sh"]
|
|
|
|
| 1 |
+
FROM python:3.9-slim
|
| 2 |
+
WORKDIR /webapp
|
| 3 |
+
COPY . .
|
| 4 |
+
RUN chmod +x /webapp/start.sh
|
| 5 |
+
RUN pip install --upgrade pip
|
| 6 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 7 |
+
RUN apt update && apt install -y redis-server
|
| 8 |
+
EXPOSE 7860 6379
|
| 9 |
+
CMD ["newrelic-admin", "run-program", "/webapp/start.sh"]
|
README.md
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: News Aggregator
|
| 3 |
-
emoji: ⚡
|
| 4 |
-
colorFrom: indigo
|
| 5 |
-
colorTo: blue
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
license: mit
|
| 9 |
-
---
|
| 10 |
-
|
| 11 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: News Aggregator
|
| 3 |
+
emoji: ⚡
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
|
@@ -1,262 +1,258 @@
|
|
| 1 |
-
import numpy as np
|
| 2 |
-
import pandas as pd
|
| 3 |
-
from dateutil import parser
|
| 4 |
-
from quart_cors import cors
|
| 5 |
-
from quart import Quart
|
| 6 |
-
from quart import render_template
|
| 7 |
-
from db_operations.db_operations import DBOperations
|
| 8 |
-
import logging
|
| 9 |
-
import traceback
|
| 10 |
-
import redis
|
| 11 |
-
import uuid
|
| 12 |
-
from datetime import datetime
|
| 13 |
-
from functools import lru_cache
|
| 14 |
-
import gc
|
| 15 |
-
from word_cloud import get_frequent_words_html
|
| 16 |
-
from config import NEWS_RETENTION_SECONDS, UK_EDITION_URL
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
app = Quart(__name__)
|
| 20 |
-
app = cors(app, allow_origin="*")
|
| 21 |
-
redis_client = redis.Redis(host='localhost', port=6379, decode_responses=True)
|
| 22 |
-
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
|
| 23 |
-
logging.warning(f'Is Redis available?: {redis_client.ping()}')
|
| 24 |
-
db = DBOperations()
|
| 25 |
-
session_id = None
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
REFRESH_FREQ = 300 # 300 secs = 5 mins
|
| 29 |
-
|
| 30 |
-
def is_db_fetch_reqd():
|
| 31 |
-
try:
|
| 32 |
-
env_news_time = redis_client.get('NEWSFETCHTIME')
|
| 33 |
-
logging.warning(f'[session_id: {session_id}] fetch_time_env_var: {env_news_time}')
|
| 34 |
-
fetch_flag = 1
|
| 35 |
-
if env_news_time is None:
|
| 36 |
-
redis_client.set("NEWSFETCHTIME", str(datetime.now()))
|
| 37 |
-
fetch_flag = 1
|
| 38 |
-
|
| 39 |
-
if env_news_time is not None:
|
| 40 |
-
fetch_time_lapse_seconds = (datetime.now() - datetime.strptime(env_news_time, '%Y-%m-%d %H:%M:%S.%f')).seconds
|
| 41 |
-
if fetch_time_lapse_seconds <= REFRESH_FREQ:
|
| 42 |
-
fetch_flag = 0
|
| 43 |
-
else:
|
| 44 |
-
redis_client.set("NEWSFETCHTIME", str(datetime.now()))
|
| 45 |
-
fetch_flag = 1
|
| 46 |
-
except Exception as e:
|
| 47 |
-
print(e)
|
| 48 |
-
fetch_flag = 1
|
| 49 |
-
return fetch_flag
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
def correct_date(x):
|
| 53 |
-
if (not isinstance(x, str)) or (str(x).find(":") == -1):
|
| 54 |
-
logging.error(f'[session_id: {session_id}] correct_date() error: {x} is not the right date format')
|
| 55 |
-
return "2020-11-07 00:36:44+05:30"
|
| 56 |
-
return x
|
| 57 |
-
|
| 58 |
-
def date_time_parser(dt):
|
| 59 |
-
"""
|
| 60 |
-
Computes the minutes elapsed since published time.
|
| 61 |
-
:param dt: date
|
| 62 |
-
:return: int, minutes elapsed.
|
| 63 |
-
"""
|
| 64 |
-
try:
|
| 65 |
-
return int(np.round((dt.now(dt.tz) - dt).total_seconds() / 60, 0))
|
| 66 |
-
except:
|
| 67 |
-
logging.error(f'[session_id: {session_id}] date_time_parser() error: {dt} is not the right date format')
|
| 68 |
-
return 100000
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
def elapsed_time_str(mins):
|
| 72 |
-
"""
|
| 73 |
-
Return the time elapsed string from minutes passed as an argument.
|
| 74 |
-
:param mins: int, minutes elapsed.
|
| 75 |
-
:return: str, time elapsed string
|
| 76 |
-
"""
|
| 77 |
-
try:
|
| 78 |
-
time_str = ''
|
| 79 |
-
hours = int(mins / 60)
|
| 80 |
-
days = np.round(mins / (60 * 24), 1)
|
| 81 |
-
remaining_mins = int(mins - (hours * 60))
|
| 82 |
-
if days >= 1:
|
| 83 |
-
time_str = f'{str(days)} days ago'
|
| 84 |
-
if days == 1:
|
| 85 |
-
time_str = 'a day ago'
|
| 86 |
-
elif (days < 1) & (hours < 24) & (mins >= 60):
|
| 87 |
-
time_str = f'{str(hours)} hours and {str(remaining_mins)} mins ago'
|
| 88 |
-
if (hours == 1) & (remaining_mins > 1):
|
| 89 |
-
time_str = f'an hour and {str(remaining_mins)} mins ago'
|
| 90 |
-
if (hours == 1) & (remaining_mins == 1):
|
| 91 |
-
time_str = f'an hour and a min ago'
|
| 92 |
-
if (hours > 1) & (remaining_mins == 1):
|
| 93 |
-
time_str = f'{str(hours)} hours and a min ago'
|
| 94 |
-
if (hours > 1) & (remaining_mins == 0):
|
| 95 |
-
time_str = f'{str(hours)} hours ago'
|
| 96 |
-
if ((mins / 60) == 1) & (remaining_mins == 0):
|
| 97 |
-
time_str = 'an hour ago'
|
| 98 |
-
elif (days < 1) & (hours < 24) & (mins == 0):
|
| 99 |
-
time_str = 'Just in'
|
| 100 |
-
else:
|
| 101 |
-
time_str = f'{str(mins)} minutes ago'
|
| 102 |
-
if mins == 1:
|
| 103 |
-
time_str = 'a minute ago'
|
| 104 |
-
return time_str
|
| 105 |
-
except:
|
| 106 |
-
return "-"
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
async def fetch_from_db(fetch_flag):
|
| 111 |
-
try:
|
| 112 |
-
logging.warning(f'[session_id: {session_id}] fetch_flag: {fetch_flag}')
|
| 113 |
-
if fetch_flag == 1:
|
| 114 |
-
final_df = await db.read_news_from_db()
|
| 115 |
-
freq_tokens = await get_frequent_words_html(final_df)
|
| 116 |
-
logging.warning(f'[session_id: {session_id}] Fetched From DB')
|
| 117 |
-
|
| 118 |
-
final_df['_id'] = final_df['_id'].astype('str')
|
| 119 |
-
|
| 120 |
-
redis_client.set("NEWSDF", final_df.to_json())
|
| 121 |
-
redis_client.set("NEWSWORDCLOUD", freq_tokens)
|
| 122 |
-
else:
|
| 123 |
-
final_df = pd.read_json(redis_client.get("NEWSDF"))
|
| 124 |
-
freq_tokens = redis_client.get("NEWSWORDCLOUD")
|
| 125 |
-
logging.warning(f'[session_id: {session_id}] Fetched From Cache')
|
| 126 |
-
|
| 127 |
-
except Exception as e:
|
| 128 |
-
print(e)
|
| 129 |
-
final_df = []
|
| 130 |
-
freq_tokens = ""
|
| 131 |
-
|
| 132 |
-
return final_df, freq_tokens
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
@app.route("/")
|
| 136 |
-
async def index():
|
| 137 |
-
"""
|
| 138 |
-
Entry point
|
| 139 |
-
"""
|
| 140 |
-
try:
|
| 141 |
-
global session_id
|
| 142 |
-
session_id = uuid.uuid4().hex
|
| 143 |
-
src_str = ''
|
| 144 |
-
status_code = 200
|
| 145 |
-
logging.warning(f'[session_id: {session_id}] Entering the application')
|
| 146 |
-
final_df, freq_tokens = await fetch_from_db(is_db_fetch_reqd())
|
| 147 |
-
if len(final_df)
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
final_df["
|
| 154 |
-
final_df
|
| 155 |
-
|
| 156 |
-
final_df = final_df
|
| 157 |
-
final_df[
|
| 158 |
-
final_df.
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
final_df.
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
<
|
| 175 |
-
<
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
<a href="{href}" target="_blank" class="
|
| 219 |
-
</a>
|
| 220 |
-
</div>
|
| 221 |
-
<div>
|
| 222 |
-
<a href="{href}" target="_blank" class="
|
| 223 |
-
|
| 224 |
-
</
|
| 225 |
-
|
| 226 |
-
<
|
| 227 |
-
{
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
<
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
if __name__ == "__main__":
|
| 261 |
-
app.run(host="0.0.0.0", port=7860, workers=5, threads=5) # workers=(2*ncores) + 1, threads= (2 to 4*ncores) + 1
|
| 262 |
-
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from dateutil import parser
|
| 4 |
+
from quart_cors import cors
|
| 5 |
+
from quart import Quart
|
| 6 |
+
from quart import render_template
|
| 7 |
+
from db_operations.db_operations import DBOperations
|
| 8 |
+
import logging
|
| 9 |
+
import traceback
|
| 10 |
+
import redis
|
| 11 |
+
import uuid
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from functools import lru_cache
|
| 14 |
+
import gc
|
| 15 |
+
from word_cloud import get_frequent_words_html
|
| 16 |
+
from config import NEWS_RETENTION_SECONDS, UK_EDITION_URL
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
app = Quart(__name__)
|
| 20 |
+
app = cors(app, allow_origin="*")
|
| 21 |
+
redis_client = redis.Redis(host='localhost', port=6379, decode_responses=True)
|
| 22 |
+
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
|
| 23 |
+
logging.warning(f'Is Redis available?: {redis_client.ping()}')
|
| 24 |
+
db = DBOperations()
|
| 25 |
+
session_id = None
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
REFRESH_FREQ = 300 # 300 secs = 5 mins
|
| 29 |
+
|
| 30 |
+
def is_db_fetch_reqd():
    """
    Decide whether the news must be re-read from the DB or can be served from the Redis cache.

    The time of the last DB fetch is kept in Redis under ``NEWSFETCHTIME``.
    A fetch is required when that key is missing, when more than
    ``REFRESH_FREQ`` seconds have elapsed since the stored time, or when the
    check itself fails. Whenever a fetch is required and Redis is reachable,
    ``NEWSFETCHTIME`` is reset to the current time.

    :return: int, 1 if the news should be fetched from the DB, 0 if the cache is still fresh.
    """
    # Explicit format so the stored value always carries microseconds;
    # str(datetime.now()) omits them when they happen to be 0, which would
    # make the strptime() below fail.
    time_format = '%Y-%m-%d %H:%M:%S.%f'
    try:
        env_news_time = redis_client.get('NEWSFETCHTIME')
        logging.warning(f'[session_id: {session_id}] fetch_time_env_var: {env_news_time}')

        if env_news_time is not None:
            last_fetch = datetime.strptime(env_news_time, time_format)
            # total_seconds(), not .seconds: .seconds discards whole days, so a
            # cache that is e.g. 24h01m old would look only one minute old.
            if (datetime.now() - last_fetch).total_seconds() <= REFRESH_FREQ:
                return 0

        redis_client.set("NEWSFETCHTIME", datetime.now().strftime(time_format))
        return 1
    except Exception as e:
        # Best effort: if Redis or the stored value is unusable, fall back to the DB.
        logging.error(f'[session_id: {session_id}] is_db_fetch_reqd() error: {e}')
        return 1
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def correct_date(x):
    """
    Replace an unusable published-date value with a fixed placeholder date.

    A value is accepted only when it is a string that contains a ":";
    anything else is logged and replaced.

    :param x: raw date value.
    :return: str, ``x`` unchanged when accepted, otherwise "2020-11-07 00:36:44+05:30".
    """
    if isinstance(x, str) and ":" in x:
        return x
    logging.error(f'[session_id: {session_id}] correct_date() error: {x} is not the right date format')
    return "2020-11-07 00:36:44+05:30"
|
| 57 |
+
|
| 58 |
+
def date_time_parser(dt):
    """
    Computes the minutes elapsed since published time.

    :param dt: published time, a ``datetime.datetime`` or ``pandas.Timestamp``
               (timezone-aware or naive).
    :return: int, minutes elapsed (rounded); 100000 if ``dt`` cannot be processed.
    """
    try:
        # tzinfo is available on both datetime and pandas.Timestamp, whereas
        # .tz exists only on Timestamp. "now" is taken in dt's own timezone so
        # aware and naive values are never mixed in the subtraction.
        elapsed = dt.now(dt.tzinfo) - dt
        return int(np.round(elapsed.total_seconds() / 60, 0))
    except Exception:
        logging.error(f'[session_id: {session_id}] date_time_parser() error: {dt} is not the right date format')
        return 100000
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def elapsed_time_str(mins):
    """
    Return the time elapsed string from minutes passed as an argument.

    Examples: 0 -> 'Just in', 1 -> 'a minute ago', 45 -> '45 minutes ago',
    60 -> 'an hour ago', 150 -> '2 hours and 30 mins ago',
    1440 -> 'a day ago', 2160 -> '1.5 days ago'.

    :param mins: int, minutes elapsed.
    :return: str, time elapsed string; "-" if ``mins`` cannot be processed.
    """
    try:
        # Zero, or a negative value from a publish time slightly in the
        # future, is shown as fresh rather than as "-5 minutes ago".
        if mins <= 0:
            return 'Just in'

        hours = int(mins / 60)
        days = np.round(mins / (60 * 24), 1)  # days are reported to one decimal place
        remaining_mins = int(mins - (hours * 60))

        if days >= 1:
            return 'a day ago' if days == 1 else f'{str(days)} days ago'

        if hours >= 1:
            hour_part = 'an hour' if hours == 1 else f'{str(hours)} hours'
            if remaining_mins == 0:
                return f'{hour_part} ago'
            if remaining_mins == 1:
                return f'{hour_part} and a min ago'
            return f'{hour_part} and {str(remaining_mins)} mins ago'

        if mins == 1:
            return 'a minute ago'
        return f'{str(mins)} minutes ago'
    except Exception:
        return "-"
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
async def fetch_from_db(fetch_flag):
    """
    Return the news DataFrame and the word-cloud HTML, from the DB or from the Redis cache.

    With ``fetch_flag == 1`` the news is read from the DB, the word cloud is
    regenerated, and both are written to Redis under ``NEWSDF`` and
    ``NEWSWORDCLOUD``. Otherwise both are read from those keys; if either
    key is missing the DB path is used instead of failing.

    :param fetch_flag: int, 1 to force a DB read, anything else to prefer the cache.
    :return: tuple (final_df, freq_tokens).
    :raises Exception: any error from the DB, the word cloud or Redis is logged and re-raised.
    """
    from io import StringIO  # local import: only needed to hand cached JSON to pandas

    try:
        logging.warning(f'[session_id: {session_id}] fetch_flag: {fetch_flag}')

        cached_df = None
        cached_tokens = None
        if fetch_flag != 1:
            cached_df = redis_client.get("NEWSDF")
            cached_tokens = redis_client.get("NEWSWORDCLOUD")

        if cached_df is not None and cached_tokens is not None:
            # Wrap in StringIO: passing a literal JSON string to read_json is deprecated.
            final_df = pd.read_json(StringIO(cached_df))
            freq_tokens = cached_tokens
            logging.warning(f'[session_id: {session_id}] Fetched From Cache')
        else:
            final_df = await db.read_news_from_db()
            freq_tokens = await get_frequent_words_html(final_df)
            logging.warning(f'[session_id: {session_id}] Fetched From DB')

            # Stringify _id before to_json() serialises the frame.
            final_df['_id'] = final_df['_id'].astype('str')

            redis_client.set("NEWSDF", final_df.to_json())
            redis_client.set("NEWSWORDCLOUD", freq_tokens)
    except Exception as e:
        logging.error(f'[session_id: {session_id}] fetch_from_db() error: {e}')
        raise

    return final_df, freq_tokens
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
@app.route("/")
async def index():
    """
    Entry point

    Loads the news (from the DB or the Redis cache), computes a
    human-readable age for every article, drops articles older than the
    retention threshold, and renders them as an HTML fragment that is passed
    to ``index.html`` as ``body``.

    :return: tuple (rendered template, status code); the status code is 500
             when at most one article is available, 200 otherwise.
    """
    # NOTE(review): session_id is a module-level global that is overwritten on
    # every request, so log lines of concurrent requests may carry each
    # other's id.
    global session_id
    session_id = uuid.uuid4().hex
    src_str = ''
    freq_tokens = ''
    status_code = 200

    try:
        logging.warning(f'[session_id: {session_id}] Entering the application')
        final_df, freq_tokens = await fetch_from_db(is_db_fetch_reqd())
        if len(final_df) > 1:
            final_df["parsed_date"] = [parser.parse(correct_date(date_)) for date_ in final_df['parsed_date']]
            final_df["elapsed_time"] = [date_time_parser(date_) for date_ in final_df['parsed_date']]
            # NOTE(review): elapsed_time is in minutes but is compared with a
            # constant named NEWS_RETENTION_SECONDS — confirm the intended unit.
            final_df = final_df.loc[final_df["elapsed_time"] <= NEWS_RETENTION_SECONDS, :].copy()
            final_df["elapsed_time_str"] = final_df["elapsed_time"].apply(elapsed_time_str)
            final_df.sort_values(by="elapsed_time", inplace=True)
            src_str = ", ".join(sorted([*final_df['src'].unique()]))
            final_df['src_time'] = final_df['src'] + (" " * 5) + final_df["elapsed_time_str"]
            final_df.drop(columns=['_id', 'parsed_date', 'src', 'elapsed_time', 'elapsed_time_str'], inplace=True)
            final_df.drop_duplicates(subset='description', inplace=True)
            final_df = final_df.loc[(final_df["title"] != ""), :].copy()
        else:
            final_df = pd.DataFrame({'title': '', 'url': '',
                                     'description': '', 'src_time': ''}, index=[0])
    except Exception:
        final_df = pd.DataFrame({'title': '', 'url': '',
                                 'description': '', 'src_time': ''}, index=[0])
        # format_exc() returns the traceback text; print_exc() returns None.
        logging.error(f'[session_id: {session_id}] {traceback.format_exc()}')

    # HTML fragments are collected in a list and joined once at the end.
    parts = ['''
    <div class="box" id="main">
    <form>

    <div class="banner">
    <img src="../static/favicon_new.png" class="logo-img" alt="KSV Muralidhar" />
    <h1 style="display:inline-block; vertical-align: middle;">Latest Indian News</h1>
    </div>
    ''']

    if len(final_df) <= 1:
        parts.append('''<div><p class="unavailable">This app is temporarily unavailable</p></div>''')
        status_code = 500
    else:
        # NOTE(review): indentation was lost in the reviewed source. The search
        # box, word cloud and article list are assumed to belong to this
        # branch, because the one-row placeholder frame has no 'category' or
        # 'similar_news' column — confirm.
        last_update_str = ''
        fetch_time = redis_client.get('NEWSFETCHTIME')
        if fetch_time is not None:
            last_update_utc = datetime.strptime(fetch_time, '%Y-%m-%d %H:%M:%S.%f')
            # total_seconds(), not .seconds, so gaps longer than a day do not wrap around.
            last_update_mins = int(np.ceil((datetime.now() - last_update_utc).total_seconds() / 60))
            last_update_str = f'Updated {last_update_mins} {"minutes" if last_update_mins > 1 else "minute"} ago'
        parts.append(f'<p class="srctxt">News aggregated from <b>{src_str}</b>.<br><br>{last_update_str} <a href="{UK_EDITION_URL}"><b>Switch to UK edition</b></a></p>')

        parts.append('''
        <div class="input-container">
        <input type="text" class="keyword-input" id="keywordInput" placeholder="Search" oninput="filterContent(true)">
        <div class="clear-btn" id="clearBtn" onclick="clearFilter()">×</div>
        <img src="static/info.png" alt="info" width="18" height="18" align="center" onclick="showSearchInfo()" style="cursor: pointer;">
        </div>
        ''')

        parts.append(f"{freq_tokens} ")
        parts.append('<div class="show-more-word-cloud" onclick=word_cloud_display()><p class="three-dots">...</p></div>')

        parts.append(f'''<div style="padding-bottom: 6px; font-size: 12px; font-family: Arial, Helvetica, sans-serif;">
        News categories and similar news are AI-generated</div>
        <div style="padding-bottom: 10px; font-size: 12px; font-family: Arial, Helvetica, sans-serif; font-weight: bold;">
        {len(final_df)} news articles available</div>
        ''')

        # NOTE(review): article fields are interpolated into HTML without
        # escaping; confirm they are sanitised before they reach the DB.
        for _, row in final_df.iterrows():  # one card per article
            href = row["url"]
            category = row["category"]
            description = row["description"]
            url_txt = row["title"]
            src_time = row["src_time"]
            sim_news = row['similar_news']
            parts.append(f'''<div class="news-item"><div style="padding-top: 7px;">
            <a href="{href}" target="_blank" class="article-category">{category}
            </a>
            </div>
            <div>
            <a href="{href}" target="_blank" class="headline">{url_txt}
            </a>
            </div>
            <div>
            <a href="{href}" target="_blank" class="description">
            {description}
            </a>
            </div>
            <div>
            <a href="{href}" target="_blank" class="time">
            {src_time}
            </a>
            </div>


            <div class="container">
            <div class="content" style="display: none;">
            {sim_news}
            </div>
            <div class="show-similar-button-container">
            <button type="button" class="show-more">Show similar news</button>
            <button type="button" class="show-less">Hide similar news</button>
            </div>
            </div>



            <div>
            <p></p>
            </div></div>
            ''')

    parts.append('</form></div>')
    result_str = "".join(parts)
    logging.warning(f'[session_id: {session_id}] Successfully rendered template')
    gc.collect()
    return await render_template("index.html", body=result_str), status_code
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
if __name__ == "__main__":
|
| 257 |
+
app.run(host="0.0.0.0", port=7860, workers=5, threads=5) # workers=(2*ncores) + 1, threads= (2 to 4*ncores) + 1
|
| 258 |
+
|
|
|
|
|
|
|
|
|
|
|
|
config.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
| 1 |
-
NEWS_RETENTION_SECONDS = 300
|
| 2 |
-
UK_EDITION_URL = "https://ksvmuralidhar-uk-news-aggregator.hf.space"
|
|
|
|
| 1 |
+
NEWS_RETENTION_SECONDS = 300
|
| 2 |
+
UK_EDITION_URL = "https://ksvmuralidhar-uk-news-aggregator.hf.space"
|
db_operations/db_operations.py
CHANGED
|
@@ -1,69 +1,69 @@
|
|
| 1 |
-
import pymongo
|
| 2 |
-
import os
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import logging
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
class DBOperations:
|
| 8 |
-
"""
|
| 9 |
-
Reads news from MongoDB
|
| 10 |
-
"""
|
| 11 |
-
def __init__(self):
|
| 12 |
-
self.url = os.getenv('DB_URL')
|
| 13 |
-
self.database = "rss_news_db_cat_pred_sim_news"
|
| 14 |
-
self.collection = "rss_news_cat_pred_sim_news"
|
| 15 |
-
self.__client = None
|
| 16 |
-
self.__error = 0
|
| 17 |
-
|
| 18 |
-
async def __connect(self):
|
| 19 |
-
try:
|
| 20 |
-
self.__client = pymongo.MongoClient(self.url)
|
| 21 |
-
_ = self.__client.list_database_names()
|
| 22 |
-
except Exception as conn_exception:
|
| 23 |
-
self.__error = 1
|
| 24 |
-
logging.critical(f"Error in DBOperations.connect(): {conn_exception}")
|
| 25 |
-
self.__client = None
|
| 26 |
-
raise
|
| 27 |
-
|
| 28 |
-
async def __read(self):
|
| 29 |
-
try:
|
| 30 |
-
db = self.__client[self.database]
|
| 31 |
-
coll = db[self.collection]
|
| 32 |
-
docs = []
|
| 33 |
-
maxtries = 5
|
| 34 |
-
ntry = 0
|
| 35 |
-
|
| 36 |
-
while (len(docs) == 0) and (ntry < maxtries):
|
| 37 |
-
for doc in coll.find():
|
| 38 |
-
docs.append(doc)
|
| 39 |
-
ntry += 1
|
| 40 |
-
logging.info(f"DB Read try: {ntry}")
|
| 41 |
-
|
| 42 |
-
rss_df = pd.DataFrame(docs)
|
| 43 |
-
except Exception as insert_err:
|
| 44 |
-
self.__error = 1
|
| 45 |
-
logging.critical(f"Error in DBOperations.read(): {insert_err}")
|
| 46 |
-
rss_df = pd.DataFrame({'_id': '', 'title': '', 'url': '',
|
| 47 |
-
'description': '', 'parsed_date': '',
|
| 48 |
-
'src': ''}, index=[0])
|
| 49 |
-
return rss_df
|
| 50 |
-
|
| 51 |
-
def __close_connection(self):
|
| 52 |
-
if self.__client is not None:
|
| 53 |
-
self.__client.close()
|
| 54 |
-
self.__client = None
|
| 55 |
-
|
| 56 |
-
async def read_news_from_db(self):
|
| 57 |
-
rss_df = pd.DataFrame({'_id': '', 'title': '', 'url': '',
|
| 58 |
-
'description': '', 'parsed_date': '',
|
| 59 |
-
'src': ''}, index=[0])
|
| 60 |
-
if self.url is not None:
|
| 61 |
-
if self.__error == 0:
|
| 62 |
-
await self.__connect()
|
| 63 |
-
if self.__error == 0:
|
| 64 |
-
rss_df = await self.__read()
|
| 65 |
-
if self.__error == 0:
|
| 66 |
-
logging.info("Read Successful")
|
| 67 |
-
if self.__client is not None:
|
| 68 |
-
self.__close_connection()
|
| 69 |
-
return rss_df
|
|
|
|
| 1 |
+
import pymongo
|
| 2 |
+
import os
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import logging
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DBOperations:
    """
    Reads news from MongoDB

    The connection string is taken from the ``DB_URL`` environment variable.
    A connection is opened for each call to ``read_news_from_db`` and closed
    again before that call returns.
    """
    def __init__(self):
        self.url = os.getenv('DB_URL')
        self.database = "rss_news_db_cat_pred_sim_news"
        self.collection = "rss_news_cat_pred_sim_news"
        self.__client = None
        # 1 when the most recent connect/read attempt failed, 0 otherwise
        self.__error = 0

    @staticmethod
    def __empty_frame():
        """
        Build the one-row placeholder frame returned when no news could be read.

        :return: pandas.DataFrame with empty strings in all expected columns.
        """
        return pd.DataFrame({'_id': '', 'title': '', 'url': '',
                             'description': '', 'parsed_date': '',
                             'src': ''}, index=[0])

    async def __connect(self):
        """
        Open the MongoDB client and verify that the server is reachable.

        :raises Exception: the connection error is logged and re-raised.
        """
        try:
            self.__client = pymongo.MongoClient(self.url)
            # Forces a round trip so connection problems surface here.
            _ = self.__client.list_database_names()
        except Exception as conn_exception:
            self.__error = 1
            logging.critical(f"Error in DBOperations.connect(): {conn_exception}")
            self.__client = None
            raise

    async def __read(self):
        """
        Read every document of the collection, retrying up to 5 times while the result is empty.

        :return: pandas.DataFrame of the documents, or the placeholder frame if reading failed.
        """
        try:
            coll = self.__client[self.database][self.collection]
            docs = []
            maxtries = 5
            ntry = 0

            while (not docs) and (ntry < maxtries):
                docs = list(coll.find())
                ntry += 1
                logging.info(f"DB Read try: {ntry}")

            rss_df = pd.DataFrame(docs)
        except Exception as read_err:
            self.__error = 1
            logging.critical(f"Error in DBOperations.read(): {read_err}")
            rss_df = self.__empty_frame()
        return rss_df

    def __close_connection(self):
        """
        Close the MongoDB client if one is open.
        """
        if self.__client is not None:
            self.__client.close()
            self.__client = None

    async def read_news_from_db(self):
        """
        Connect to MongoDB, read the news collection and close the connection.

        :return: pandas.DataFrame of news documents; the one-row placeholder
                 frame when ``DB_URL`` is not set or the read failed.
        :raises Exception: if the connection cannot be established.
        """
        rss_df = self.__empty_frame()
        if self.url is None:
            return rss_df

        # Start every call with a clean slate: a failure during an earlier
        # call must not disable all later reads.
        self.__error = 0
        try:
            await self.__connect()
            rss_df = await self.__read()
            if self.__error == 0:
                logging.info("Read Successful")
        finally:
            self.__close_connection()
        return rss_df
|
indian_news_app_load_tests/indian_news_app_100_conc_users.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
indian_news_app_load_tests/indian_news_app_1_user.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
indian_news_app_load_tests/indian_news_app_25_conc_users.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
indian_news_app_load_tests/indian_news_app_50_conc_users.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
indian_news_app_load_tests/indian_news_app_5_conc_users.html
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
newrelic.ini
ADDED
|
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ---------------------------------------------------------------------------
|
| 2 |
+
|
| 3 |
+
#
|
| 4 |
+
# This file configures the New Relic Python Agent.
|
| 5 |
+
#
|
| 6 |
+
# The path to the configuration file should be supplied to the function
|
| 7 |
+
# newrelic.agent.initialize() when the agent is being initialized.
|
| 8 |
+
#
|
| 9 |
+
# The configuration file follows a structure similar to what you would
|
| 10 |
+
# find for Microsoft Windows INI files. For further information on the
|
| 11 |
+
# configuration file format see the Python ConfigParser documentation at:
|
| 12 |
+
#
|
| 13 |
+
# http://docs.python.org/library/configparser.html
|
| 14 |
+
#
|
| 15 |
+
# For further discussion on the behaviour of the Python agent that can
|
| 16 |
+
# be configured via this configuration file see:
|
| 17 |
+
#
|
| 18 |
+
# https://docs.newrelic.com/docs/apm/agents/python-agent/configuration/python-agent-configuration/
|
| 19 |
+
#
|
| 20 |
+
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
|
| 23 |
+
# Here are the settings that are common to all environments.
|
| 24 |
+
|
| 25 |
+
[newrelic]
|
| 26 |
+
|
| 27 |
+
# You must specify the license key associated with your New
|
| 28 |
+
# Relic account. This may also be set using the NEW_RELIC_LICENSE_KEY
|
| 29 |
+
# environment variable. This key binds the Python Agent's data to
|
| 30 |
+
# your account in the New Relic service. For more information on
|
| 31 |
+
# storing and generating license keys, see
|
| 32 |
+
# https://docs.newrelic.com/docs/apis/intro-apis/new-relic-api-keys/#ingest-license-key
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# The application name. Set this to be the name of your
|
| 36 |
+
# application as you would like it to show up in New Relic UI.
|
| 37 |
+
# You may also set this using the NEW_RELIC_APP_NAME environment variable.
|
| 38 |
+
# The UI will then auto-map instances of your application into an
|
| 39 |
+
# entry on your home dashboard page. You can also specify multiple
|
| 40 |
+
# app names to group your aggregated data. For further details,
|
| 41 |
+
# please see:
|
| 42 |
+
# https://docs.newrelic.com/docs/apm/agents/manage-apm-agents/app-naming/use-multiple-names-app/
|
| 43 |
+
app_name = news_aggregator
|
| 44 |
+
|
| 45 |
+
# When "true", the agent collects performance data about your
|
| 46 |
+
# application and reports this data to the New Relic UI at
|
| 47 |
+
# newrelic.com. This global switch is normally overridden for
|
| 48 |
+
# each environment below. It may also be set using the
|
| 49 |
+
# NEW_RELIC_MONITOR_MODE environment variable.
|
| 50 |
+
monitor_mode = true
|
| 51 |
+
|
| 52 |
+
# Sets the name of a file to log agent messages to. Whatever you
|
| 53 |
+
# set this to, you must ensure that the permissions for the
|
| 54 |
+
# containing directory and the file itself are correct, and
|
| 55 |
+
# that the user that your web application runs as can write out
|
| 56 |
+
# to the file. If it is not possible to write out a log file, it is also
|
| 57 |
+
# possible to say "stderr" and output to standard error output.
|
| 58 |
+
# This would normally result in output appearing in your web
|
| 59 |
+
# server log. It can also be set using the NEW_RELIC_LOG
|
| 60 |
+
# environment variable.
|
| 61 |
+
log_file = stdout
|
| 62 |
+
|
| 63 |
+
# Sets the level of detail of messages sent to the log file, if
|
| 64 |
+
# a log file location has been provided. Possible values, in
|
| 65 |
+
# increasing order of detail, are: "critical", "error", "warning",
|
| 66 |
+
# "info" and "debug". When reporting any agent issues to New
|
| 67 |
+
# Relic technical support, the most useful setting for the
|
| 68 |
+
# support engineers is "debug". However, this can generate a lot
|
| 69 |
+
# of information very quickly, so it is best not to keep the
|
| 70 |
+
# agent at this level for longer than it takes to reproduce the
|
| 71 |
+
# problem you are experiencing. This may also be set using the
|
| 72 |
+
# NEW_RELIC_LOG_LEVEL environment variable.
|
| 73 |
+
log_level = info
|
| 74 |
+
|
| 75 |
+
# High Security Mode enforces certain security settings, and prevents
|
| 76 |
+
# them from being overridden, so that no sensitive data is sent to New
|
| 77 |
+
# Relic. Enabling High Security Mode means that request parameters are
|
| 78 |
+
# not collected and SQL can not be sent to New Relic in its raw form.
|
| 79 |
+
# To activate High Security Mode, it must be set to 'true' in this
|
| 80 |
+
# local .ini configuration file AND be set to 'true' in the
|
| 81 |
+
# server-side configuration in the New Relic user interface. It can
|
| 82 |
+
# also be set using the NEW_RELIC_HIGH_SECURITY environment variable.
|
| 83 |
+
# For details, see
|
| 84 |
+
# https://docs.newrelic.com/docs/subscriptions/high-security
|
| 85 |
+
high_security = false
|
| 86 |
+
|
| 87 |
+
# The Python Agent will attempt to connect directly to the New
|
| 88 |
+
# Relic service. If there is an intermediate firewall between
|
| 89 |
+
# your host and the New Relic service that requires you to use a
|
| 90 |
+
# HTTP proxy, then you should set both the "proxy_host" and
|
| 91 |
+
# "proxy_port" settings to the required values for the HTTP
|
| 92 |
+
# proxy. The "proxy_user" and "proxy_pass" settings should
|
| 93 |
+
# additionally be set if proxy authentication is implemented by
|
| 94 |
+
# the HTTP proxy. The "proxy_scheme" setting dictates what
|
| 95 |
+
# protocol scheme is used in talking to the HTTP proxy. This
|
| 96 |
+
# would normally always be set as "http" which will result in the
|
| 97 |
+
# agent then using an SSL tunnel through the HTTP proxy for end to
|
| 98 |
+
# end encryption.
|
| 99 |
+
# See https://docs.newrelic.com/docs/apm/agents/python-agent/configuration/python-agent-configuration/#proxy
|
| 100 |
+
# for information on proxy configuration via environment variables.
|
| 101 |
+
# proxy_scheme = http
|
| 102 |
+
# proxy_host = hostname
|
| 103 |
+
# proxy_port = 8080
|
| 104 |
+
# proxy_user =
|
| 105 |
+
# proxy_pass =
|
| 106 |
+
|
| 107 |
+
# Capturing request parameters is off by default. To enable the
|
| 108 |
+
# capturing of request parameters, first ensure that the setting
|
| 109 |
+
# "attributes.enabled" is set to "true" (the default value), and
|
| 110 |
+
# then add "request.parameters.*" to the "attributes.include"
|
| 111 |
+
# setting. For details about attributes configuration, please
|
| 112 |
+
# consult the documentation.
|
| 113 |
+
# attributes.include = request.parameters.*
|
| 114 |
+
|
| 115 |
+
# The transaction tracer captures deep information about slow
|
| 116 |
+
# transactions and sends this to the UI on a periodic basis. The
|
| 117 |
+
# transaction tracer is enabled by default. Set this to "false"
|
| 118 |
+
# to turn it off.
|
| 119 |
+
transaction_tracer.enabled = true
|
| 120 |
+
|
| 121 |
+
# Threshold in seconds for when to collect a transaction trace.
|
| 122 |
+
# When the response time of a controller action exceeds this
|
| 123 |
+
# threshold, a transaction trace will be recorded and sent to
|
| 124 |
+
# the UI. Valid values are any positive float value, or (default)
|
| 125 |
+
# "apdex_f", which will use the threshold for a dissatisfying
|
| 126 |
+
# Apdex controller action - four times the Apdex T value.
|
| 127 |
+
transaction_tracer.transaction_threshold = apdex_f
|
| 128 |
+
|
| 129 |
+
# When the transaction tracer is on, SQL statements can
|
| 130 |
+
# optionally be recorded. The recorder has three modes, "off"
|
| 131 |
+
# which sends no SQL, "raw" which sends the SQL statement in its
|
| 132 |
+
# original form, and "obfuscated", which strips out numeric and
|
| 133 |
+
# string literals.
|
| 134 |
+
transaction_tracer.record_sql = obfuscated
|
| 135 |
+
|
| 136 |
+
# Threshold in seconds for when to collect stack trace for a SQL
|
| 137 |
+
# call. In other words, when SQL statements exceed this
|
| 138 |
+
# threshold, then capture and send to the UI the current stack
|
| 139 |
+
# trace. This is helpful for pinpointing where long SQL calls
|
| 140 |
+
# originate from in an application.
|
| 141 |
+
transaction_tracer.stack_trace_threshold = 0.5
|
| 142 |
+
|
| 143 |
+
# Determines whether the agent will capture query plans for slow
|
| 144 |
+
# SQL queries. Only supported in MySQL and PostgreSQL. Set this
|
| 145 |
+
# to "false" to turn it off.
|
| 146 |
+
transaction_tracer.explain_enabled = true
|
| 147 |
+
|
| 148 |
+
# Threshold for query execution time below which query plans
|
| 149 |
+
# will not be captured. Relevant only when "explain_enabled"
|
| 150 |
+
# is true.
|
| 151 |
+
transaction_tracer.explain_threshold = 0.5
|
| 152 |
+
|
| 153 |
+
# Space separated list of function or method names in form
|
| 154 |
+
# 'module:function' or 'module:class.function' for which
|
| 155 |
+
# additional function timing instrumentation will be added.
|
| 156 |
+
transaction_tracer.function_trace =
|
| 157 |
+
|
| 158 |
+
# The error collector captures information about uncaught
|
| 159 |
+
# exceptions or logged exceptions and sends them to UI for
|
| 160 |
+
# viewing. The error collector is enabled by default. Set this
|
| 161 |
+
# to "false" to turn it off. For more details on errors, see
|
| 162 |
+
# https://docs.newrelic.com/docs/apm/agents/manage-apm-agents/agent-data/manage-errors-apm-collect-ignore-or-mark-expected/
|
| 163 |
+
error_collector.enabled = true
|
| 164 |
+
|
| 165 |
+
# To stop specific errors from reporting to the UI, set this to
|
| 166 |
+
# a space separated list of the Python exception type names to
|
| 167 |
+
# ignore. The exception name should be of the form 'module:class'.
|
| 168 |
+
error_collector.ignore_classes =
|
| 169 |
+
|
| 170 |
+
# Expected errors are reported to the UI but will not affect the
|
| 171 |
+
# Apdex or error rate. To mark specific errors as expected, set this
|
| 172 |
+
# to a space separated list of the Python exception type names to
|
| 173 |
+
# expect. The exception name should be of the form 'module:class'.
|
| 174 |
+
error_collector.expected_classes =
|
| 175 |
+
|
| 176 |
+
# Browser monitoring is the Real User Monitoring feature of the UI.
|
| 177 |
+
# For those Python web frameworks that are supported, this
|
| 178 |
+
# setting enables the auto-insertion of the browser monitoring
|
| 179 |
+
# JavaScript fragments.
|
| 180 |
+
browser_monitoring.auto_instrument = true
|
| 181 |
+
|
| 182 |
+
# A thread profiling session can be scheduled via the UI when
|
| 183 |
+
# this option is enabled. The thread profiler will periodically
|
| 184 |
+
# capture a snapshot of the call stack for each active thread in
|
| 185 |
+
# the application to construct a statistically representative
|
| 186 |
+
# call tree. For more details on the thread profiler tool, see
|
| 187 |
+
# https://docs.newrelic.com/docs/apm/apm-ui-pages/events/thread-profiler-tool/
|
| 188 |
+
thread_profiler.enabled = true
|
| 189 |
+
|
| 190 |
+
# Your application deployments can be recorded through the
|
| 191 |
+
# New Relic REST API. To use this feature provide your API key
|
| 192 |
+
# below then use the `newrelic-admin record-deploy` command.
|
| 193 |
+
# This can also be set using the NEW_RELIC_API_KEY
|
| 194 |
+
# environment variable.
|
| 195 |
+
# api_key =
|
| 196 |
+
|
| 197 |
+
# Distributed tracing lets you see the path that a request takes
|
| 198 |
+
# through your distributed system. For more information, please
|
| 199 |
+
# consult our distributed tracing planning guide.
|
| 200 |
+
# https://docs.newrelic.com/docs/transition-guide-distributed-tracing
|
| 201 |
+
distributed_tracing.enabled = true
|
| 202 |
+
|
| 203 |
+
# This setting enables log decoration, the forwarding of log events,
|
| 204 |
+
# and the collection of logging metrics if these sub-feature
|
| 205 |
+
# configurations are also enabled. If this setting is false, no
|
| 206 |
+
# logging instrumentation features are enabled. This can also be
|
| 207 |
+
# set using the NEW_RELIC_APPLICATION_LOGGING_ENABLED environment
|
| 208 |
+
# variable.
|
| 209 |
+
# application_logging.enabled = true
|
| 210 |
+
|
| 211 |
+
# If true, the agent captures log records emitted by your application
|
| 212 |
+
# and forwards them to New Relic. `application_logging.enabled` must
|
| 213 |
+
# also be true for this setting to take effect. You can also set
|
| 214 |
+
# this using the NEW_RELIC_APPLICATION_LOGGING_FORWARDING_ENABLED
|
| 215 |
+
# environment variable.
|
| 216 |
+
# application_logging.forwarding.enabled = true
|
| 217 |
+
|
| 218 |
+
# If true, the agent decorates logs with metadata to link to entities,
|
| 219 |
+
# hosts, traces, and spans. `application_logging.enabled` must also
|
| 220 |
+
# be true for this setting to take effect. This can also be set
|
| 221 |
+
# using the NEW_RELIC_APPLICATION_LOGGING_LOCAL_DECORATING_ENABLED
|
| 222 |
+
# environment variable.
|
| 223 |
+
# application_logging.local_decorating.enabled = true
|
| 224 |
+
|
| 225 |
+
# If true, the agent captures metrics related to the log lines
|
| 226 |
+
# being sent up by your application. This can also be set
|
| 227 |
+
# using the NEW_RELIC_APPLICATION_LOGGING_METRICS_ENABLED
|
| 228 |
+
# environment variable.
|
| 229 |
+
# application_logging.metrics.enabled = true
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
# ---------------------------------------------------------------------------
|
| 233 |
+
|
| 234 |
+
#
|
| 235 |
+
# The application environments. These are specific settings which
|
| 236 |
+
# override the common environment settings. The settings related to a
|
| 237 |
+
# specific environment will be used when the environment argument to the
|
| 238 |
+
# newrelic.agent.initialize() function has been defined to be either
|
| 239 |
+
# "development", "test", "staging" or "production".
|
| 240 |
+
#
|
| 241 |
+
|
| 242 |
+
[newrelic:development]
|
| 243 |
+
monitor_mode = false
|
| 244 |
+
|
| 245 |
+
[newrelic:test]
|
| 246 |
+
monitor_mode = false
|
| 247 |
+
|
| 248 |
+
[newrelic:staging]
|
| 249 |
+
app_name = (Staging)
|
| 250 |
+
monitor_mode = true
|
| 251 |
+
|
| 252 |
+
[newrelic:production]
|
| 253 |
+
monitor_mode = true
|
| 254 |
+
|
| 255 |
+
# ---------------------------------------------------------------------------
|
requirements.txt
CHANGED
|
@@ -1,15 +1,16 @@
|
|
| 1 |
-
regex==2021.8.3
|
| 2 |
-
lxml==4.6.3
|
| 3 |
-
numpy==1.21.1
|
| 4 |
-
python-dateutil==2.8.2
|
| 5 |
-
pandas==1.3.1
|
| 6 |
-
requests==2.26.0
|
| 7 |
-
bs4==0.0.1
|
| 8 |
-
gunicorn
|
| 9 |
-
pymongo==4.3.3
|
| 10 |
-
unidecode
|
| 11 |
-
redis
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
| 1 |
+
regex==2021.8.3
|
| 2 |
+
lxml==4.6.3
|
| 3 |
+
numpy==1.21.1
|
| 4 |
+
python-dateutil==2.8.2
|
| 5 |
+
pandas==1.3.1
|
| 6 |
+
requests==2.26.0
|
| 7 |
+
bs4==0.0.1
|
| 8 |
+
gunicorn
|
| 9 |
+
pymongo==4.3.3
|
| 10 |
+
unidecode
|
| 11 |
+
redis
|
| 12 |
+
newrelic
|
| 13 |
+
asyncio
|
| 14 |
+
uvicorn
|
| 15 |
+
Quart
|
| 16 |
+
quart-cors
|
start.sh
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
#!/bin/bash
|
| 2 |
-
redis-server --daemonize yes
|
| 3 |
-
redis-cli config set save ""
|
| 4 |
-
redis-cli config set appendonly no
|
| 5 |
-
redis-cli config set stop-writes-on-bgsave-error no
|
| 6 |
gunicorn -b 0.0.0.0:7860 --timeout 120 --worker-class uvicorn.workers.UvicornWorker --workers 5 --threads 5 app:app
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
redis-server --daemonize yes
|
| 3 |
+
redis-cli config set save ""
|
| 4 |
+
redis-cli config set appendonly no
|
| 5 |
+
redis-cli config set stop-writes-on-bgsave-error no
|
| 6 |
gunicorn -b 0.0.0.0:7860 --timeout 120 --worker-class uvicorn.workers.UvicornWorker --workers 5 --threads 5 app:app
|
static/styles.css
CHANGED
|
@@ -1,510 +1,510 @@
|
|
| 1 |
-
html {
|
| 2 |
-
scroll-behavior: smooth;
|
| 3 |
-
}
|
| 4 |
-
|
| 5 |
-
@media screen and (min-width: 800px) {
|
| 6 |
-
a.headline {
|
| 7 |
-
background-color: #E5E4E2;
|
| 8 |
-
display: block;
|
| 9 |
-
width: relative;
|
| 10 |
-
text-decoration: none;
|
| 11 |
-
color: black;
|
| 12 |
-
line-height: 1.2;
|
| 13 |
-
align: justify;
|
| 14 |
-
border-left: 5px solid transparent;
|
| 15 |
-
border-top: 5px solid transparent;
|
| 16 |
-
border-bottom: 5px solid transparent;
|
| 17 |
-
border-right: 0px;
|
| 18 |
-
font-weight: bold;
|
| 19 |
-
font-size: 18px;
|
| 20 |
-
padding-right: 5px;
|
| 21 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 22 |
-
}
|
| 23 |
-
}
|
| 24 |
-
|
| 25 |
-
@media screen and (max-width: 800px) {
|
| 26 |
-
a.headline {
|
| 27 |
-
background-color: #E5E4E2;
|
| 28 |
-
display: block;
|
| 29 |
-
width: relative;
|
| 30 |
-
text-decoration: none;
|
| 31 |
-
color: black;
|
| 32 |
-
line-height: 1.2;
|
| 33 |
-
align: justify;
|
| 34 |
-
border-left: 5px solid transparent;
|
| 35 |
-
border-top: 5px solid transparent;
|
| 36 |
-
border-bottom: 5px solid transparent;
|
| 37 |
-
border-right: 0px;
|
| 38 |
-
font-weight: bold;
|
| 39 |
-
font-size: 16.5px;
|
| 40 |
-
padding-right: 5px;
|
| 41 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 42 |
-
}
|
| 43 |
-
}
|
| 44 |
-
|
| 45 |
-
@media screen and (min-width: 800px) {
|
| 46 |
-
a.description {
|
| 47 |
-
background-color: #E5E4E2;
|
| 48 |
-
align:justify;
|
| 49 |
-
text-align: justify;
|
| 50 |
-
display: block;
|
| 51 |
-
height:100%;
|
| 52 |
-
width: relative;
|
| 53 |
-
text-decoration: none;
|
| 54 |
-
border-left: 5px solid transparent;
|
| 55 |
-
border-top: 0px;
|
| 56 |
-
border-bottom: 7px solid transparent;
|
| 57 |
-
border-right: 0px;
|
| 58 |
-
font-size: 14px;
|
| 59 |
-
padding-right: 5px;
|
| 60 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 61 |
-
color: dimgrey;
|
| 62 |
-
}
|
| 63 |
-
}
|
| 64 |
-
|
| 65 |
-
@media screen and (max-width: 800px) {
|
| 66 |
-
a.description {
|
| 67 |
-
background-color: #E5E4E2;
|
| 68 |
-
align:justify;
|
| 69 |
-
text-align: justify;
|
| 70 |
-
display: block;
|
| 71 |
-
height:100%;
|
| 72 |
-
width: relative;
|
| 73 |
-
text-decoration: none;
|
| 74 |
-
border-left: 5px solid transparent;
|
| 75 |
-
border-top: 0px;
|
| 76 |
-
border-bottom: 7px solid transparent;
|
| 77 |
-
border-right: 0px;
|
| 78 |
-
font-size: 12.5px;
|
| 79 |
-
padding-right: 5px;
|
| 80 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 81 |
-
color: dimgrey;
|
| 82 |
-
}
|
| 83 |
-
}
|
| 84 |
-
|
| 85 |
-
@media screen and (min-width: 800px) {
|
| 86 |
-
a.time {
|
| 87 |
-
background-color: #E5E4E2;
|
| 88 |
-
align:justify;
|
| 89 |
-
display: block;
|
| 90 |
-
height:100%;
|
| 91 |
-
width: relative;
|
| 92 |
-
text-decoration: none;
|
| 93 |
-
border-left: 5px solid transparent;
|
| 94 |
-
border-top: 0px;
|
| 95 |
-
border-bottom: 1px solid transparent;
|
| 96 |
-
border-right: 0px;
|
| 97 |
-
padding-right: 5px;
|
| 98 |
-
font-size: 11px;
|
| 99 |
-
padding-bottom: 5px;
|
| 100 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 101 |
-
color: green;
|
| 102 |
-
}
|
| 103 |
-
}
|
| 104 |
-
|
| 105 |
-
@media screen and (max-width: 800px) {
|
| 106 |
-
a.time {
|
| 107 |
-
background-color: #E5E4E2;
|
| 108 |
-
align:justify;
|
| 109 |
-
display: block;
|
| 110 |
-
height:100%;
|
| 111 |
-
width: relative;
|
| 112 |
-
text-decoration: none;
|
| 113 |
-
border-left: 5px solid transparent;
|
| 114 |
-
border-top: 0px;
|
| 115 |
-
border-bottom: 1px solid transparent;
|
| 116 |
-
border-right: 0px;
|
| 117 |
-
padding-right: 5px;
|
| 118 |
-
font-size: 10px;
|
| 119 |
-
padding-bottom: 5px;
|
| 120 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 121 |
-
color: green;
|
| 122 |
-
}
|
| 123 |
-
}
|
| 124 |
-
|
| 125 |
-
.box {
|
| 126 |
-
display: flex;
|
| 127 |
-
justify-content: center;
|
| 128 |
-
align-items: center;
|
| 129 |
-
height: inherit;
|
| 130 |
-
padding: 20px;
|
| 131 |
-
}
|
| 132 |
-
@media screen and (min-width: 800px) {
|
| 133 |
-
form {
|
| 134 |
-
width: 50%;
|
| 135 |
-
overflow-x: hidden;
|
| 136 |
-
padding: 20px;
|
| 137 |
-
border-radius: 10px;
|
| 138 |
-
background: #fff;
|
| 139 |
-
box-shadow: 0 0 20px 0 #095484;
|
| 140 |
-
}}
|
| 141 |
-
|
| 142 |
-
@media screen and (max-width: 800px) {
|
| 143 |
-
form {
|
| 144 |
-
width: 100%;
|
| 145 |
-
overflow-x: hidden;
|
| 146 |
-
padding: 20px;
|
| 147 |
-
border-radius: 10px;
|
| 148 |
-
background: #fff;
|
| 149 |
-
box-shadow: 0 0 15px 0 #095484;
|
| 150 |
-
}}
|
| 151 |
-
.banner {
|
| 152 |
-
position: relative;
|
| 153 |
-
height: 30px;
|
| 154 |
-
/* background-size: cover; */
|
| 155 |
-
display: flex;
|
| 156 |
-
/* justify-content: center; */
|
| 157 |
-
/* align-items: center; */
|
| 158 |
-
/* text-align: center; */
|
| 159 |
-
}
|
| 160 |
-
@media screen and (min-width: 800px) {
|
| 161 |
-
h1 {
|
| 162 |
-
position: absolute;
|
| 163 |
-
margin: 0;
|
| 164 |
-
padding-left: 50px;
|
| 165 |
-
font-size: 25px;
|
| 166 |
-
color: black;
|
| 167 |
-
z-index: 2;
|
| 168 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 169 |
-
}
|
| 170 |
-
}
|
| 171 |
-
|
| 172 |
-
@media screen and (max-width: 800px) {
|
| 173 |
-
h1 {
|
| 174 |
-
position: absolute;
|
| 175 |
-
margin: 0;
|
| 176 |
-
padding-left: 40px;
|
| 177 |
-
font-size: 24px;
|
| 178 |
-
color: black;
|
| 179 |
-
z-index: 2;
|
| 180 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 181 |
-
}
|
| 182 |
-
}
|
| 183 |
-
|
| 184 |
-
p.unavailable {
|
| 185 |
-
background-color: #E5E4E2;
|
| 186 |
-
display: block;
|
| 187 |
-
width: 100%;
|
| 188 |
-
text-decoration: none;
|
| 189 |
-
color: black;
|
| 190 |
-
line-height: 1.2;
|
| 191 |
-
align: justify;
|
| 192 |
-
border-left: 5px solid transparent;
|
| 193 |
-
border-top: 5px solid transparent;
|
| 194 |
-
border-bottom: 5px solid transparent;
|
| 195 |
-
border-right: 0px;
|
| 196 |
-
font-weight: bold;
|
| 197 |
-
font-size: 18px;
|
| 198 |
-
padding-right: 5px;
|
| 199 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 200 |
-
}
|
| 201 |
-
div.news-item{
|
| 202 |
-
background-color: #E5E4E2;
|
| 203 |
-
/*box-shadow: rgba(0, 0, 0, 0.4) -1px 0px 5px, rgba(0, 0, 0, 0.5) 0px 4px 5px -3px, rgba(0, 0, 0, 0.2) 0px -3px 0px inset;*/
|
| 204 |
-
box-shadow: rgba(0, 0, 0, 0.25) 0px 0px 5px 1px, rgba(0, 0, 0, 0.1) 0px 4px 5px -3px, rgba(0, 0, 0, 0.2) 0px -1px 0px inset;
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
}
|
| 208 |
-
div.news-item:hover{
|
| 209 |
-
box-shadow: none;
|
| 210 |
-
}
|
| 211 |
-
|
| 212 |
-
@media screen and (min-width: 800px) {
|
| 213 |
-
p.srctxt {
|
| 214 |
-
align:justify;
|
| 215 |
-
text-align: justify;
|
| 216 |
-
word-break: break-all;
|
| 217 |
-
font-size: 11px;
|
| 218 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 219 |
-
}
|
| 220 |
-
.logo-img{
|
| 221 |
-
margin-right: 10px;
|
| 222 |
-
vertical-align: center;
|
| 223 |
-
/* position: relative; */
|
| 224 |
-
width: 34px;
|
| 225 |
-
height: 34px;
|
| 226 |
-
|
| 227 |
-
}
|
| 228 |
-
}
|
| 229 |
-
|
| 230 |
-
@media screen and (max-width: 800px) {
|
| 231 |
-
p.srctxt {
|
| 232 |
-
align:justify;
|
| 233 |
-
text-align: justify;
|
| 234 |
-
word-break: break-all;
|
| 235 |
-
font-size: 9px;
|
| 236 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 237 |
-
}
|
| 238 |
-
.logo-img{
|
| 239 |
-
margin-right: 10px;
|
| 240 |
-
vertical-align: top;
|
| 241 |
-
/* position: absolute; */
|
| 242 |
-
width: 30px;
|
| 243 |
-
height: 30px;
|
| 244 |
-
}
|
| 245 |
-
}
|
| 246 |
-
|
| 247 |
-
.float{
|
| 248 |
-
position:fixed;
|
| 249 |
-
width:25px;
|
| 250 |
-
height:25px;
|
| 251 |
-
bottom:15px;
|
| 252 |
-
right:12px;
|
| 253 |
-
background-color: white;
|
| 254 |
-
border-radius:50%;
|
| 255 |
-
text-align:center;
|
| 256 |
-
vertical-align:center;
|
| 257 |
-
z-index: 99999998;
|
| 258 |
-
font-size:0;
|
| 259 |
-
cursor:pointer;
|
| 260 |
-
animation: beatan 0.8s infinite alternate;
|
| 261 |
-
|
| 262 |
-
}
|
| 263 |
-
.top-float{
|
| 264 |
-
position:fixed;
|
| 265 |
-
width:25px;
|
| 266 |
-
height:25px;
|
| 267 |
-
bottom:52px;
|
| 268 |
-
right:12px;
|
| 269 |
-
background-color: white;
|
| 270 |
-
border-radius:50%;
|
| 271 |
-
text-align:center;
|
| 272 |
-
vertical-align:center;
|
| 273 |
-
z-index: 99999998;
|
| 274 |
-
font-size:0;
|
| 275 |
-
cursor:pointer;
|
| 276 |
-
animation: beatan 0.8s infinite alternate;
|
| 277 |
-
|
| 278 |
-
}
|
| 279 |
-
.my-float{
|
| 280 |
-
margin-top:22px;
|
| 281 |
-
}
|
| 282 |
-
|
| 283 |
-
@keyframes beatan{
|
| 284 |
-
to { transform: scale(1.1); }
|
| 285 |
-
}
|
| 286 |
-
|
| 287 |
-
.loader {
|
| 288 |
-
position: fixed;
|
| 289 |
-
left: 0px;
|
| 290 |
-
top: 0px;
|
| 291 |
-
width: 100%;
|
| 292 |
-
height: 100%;
|
| 293 |
-
z-index: 99999999999;
|
| 294 |
-
background: url('../static/loader.gif') 50% 50% no-repeat rgb(255,255,255);
|
| 295 |
-
}
|
| 296 |
-
|
| 297 |
-
.highlight {
|
| 298 |
-
background-color: yellow;
|
| 299 |
-
font-weight: bold;
|
| 300 |
-
}
|
| 301 |
-
|
| 302 |
-
.input-container {
|
| 303 |
-
position: relative;
|
| 304 |
-
padding-bottom: 10px;
|
| 305 |
-
}
|
| 306 |
-
|
| 307 |
-
.keyword-input {
|
| 308 |
-
|
| 309 |
-
border-radius: 5px;
|
| 310 |
-
transition: border-color 0.3s ease;
|
| 311 |
-
border: 1px solid silver;
|
| 312 |
-
width: 10em;
|
| 313 |
-
height: 1.5em;
|
| 314 |
-
padding-left: 0.5em;
|
| 315 |
-
outline: none;
|
| 316 |
-
overflow: hidden;
|
| 317 |
-
|
| 318 |
-
}
|
| 319 |
-
|
| 320 |
-
.clear-btn {
|
| 321 |
-
position: absolute;
|
| 322 |
-
font-size: 20px;
|
| 323 |
-
left: 129px;
|
| 324 |
-
transform: translateY(-105%);
|
| 325 |
-
cursor: pointer;
|
| 326 |
-
opacity: 0;
|
| 327 |
-
transition: opacity 0.3s ease;
|
| 328 |
-
}
|
| 329 |
-
|
| 330 |
-
.clear-btn.show {
|
| 331 |
-
opacity: 1;
|
| 332 |
-
}
|
| 333 |
-
|
| 334 |
-
@media screen and (min-width: 800px) {
|
| 335 |
-
a.article-category {
|
| 336 |
-
background-color: #E5E4E2;
|
| 337 |
-
align:justify;
|
| 338 |
-
display: block;
|
| 339 |
-
height:100%;
|
| 340 |
-
width: relative;
|
| 341 |
-
text-decoration: none;
|
| 342 |
-
border-left: 5px solid transparent;
|
| 343 |
-
border-top: 0px;
|
| 344 |
-
font-weight: bold;
|
| 345 |
-
border-bottom: 1px solid transparent;
|
| 346 |
-
border-right: 0px;
|
| 347 |
-
padding-right: 5px;
|
| 348 |
-
font-size: 11px;
|
| 349 |
-
padding-bottom: 0px;
|
| 350 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 351 |
-
color: green;
|
| 352 |
-
}
|
| 353 |
-
}
|
| 354 |
-
|
| 355 |
-
@media screen and (max-width: 800px) {
|
| 356 |
-
a.article-category {
|
| 357 |
-
background-color: #E5E4E2;
|
| 358 |
-
align:justify;
|
| 359 |
-
display: block;
|
| 360 |
-
height:100%;
|
| 361 |
-
font-weight: bold;
|
| 362 |
-
width: relative;
|
| 363 |
-
text-decoration: none;
|
| 364 |
-
border-left: 5px solid transparent;
|
| 365 |
-
border-top: 0px;
|
| 366 |
-
border-bottom: 1px solid transparent;
|
| 367 |
-
border-right: 0px;
|
| 368 |
-
padding-right: 5px;
|
| 369 |
-
font-size: 10px;
|
| 370 |
-
padding-bottom: 0px;
|
| 371 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 372 |
-
color: green;
|
| 373 |
-
}
|
| 374 |
-
}
|
| 375 |
-
|
| 376 |
-
.content {
|
| 377 |
-
display: none;
|
| 378 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 379 |
-
|
| 380 |
-
padding-right: 5px;
|
| 381 |
-
|
| 382 |
-
padding-top: 5px;
|
| 383 |
-
border-left: 5px solid transparent;
|
| 384 |
-
}
|
| 385 |
-
|
| 386 |
-
.container{
|
| 387 |
-
padding-bottom:10px;
|
| 388 |
-
}
|
| 389 |
-
|
| 390 |
-
.show-similar-button-container{
|
| 391 |
-
display: flex;
|
| 392 |
-
flex-direction: column;
|
| 393 |
-
align-items: center;
|
| 394 |
-
}
|
| 395 |
-
|
| 396 |
-
.similar-news-item:hover {
|
| 397 |
-
text-decoration: none;
|
| 398 |
-
}
|
| 399 |
-
|
| 400 |
-
@media screen and (min-width: 800px) {
|
| 401 |
-
.similar-news-item {
|
| 402 |
-
text-align: justify;
|
| 403 |
-
text-decoration: underline;
|
| 404 |
-
font-size: 14px;
|
| 405 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 406 |
-
color: black;
|
| 407 |
-
display:inline-block;
|
| 408 |
-
padding-bottom: 10px;
|
| 409 |
-
width:100%;
|
| 410 |
-
/*white-space: nowrap;
|
| 411 |
-
overflow: hidden;
|
| 412 |
-
text-overflow: ellipsis;*/
|
| 413 |
-
|
| 414 |
-
}
|
| 415 |
-
}
|
| 416 |
-
|
| 417 |
-
@media screen and (max-width: 800px) {
|
| 418 |
-
.similar-news-item {
|
| 419 |
-
text-align: justify;
|
| 420 |
-
text-decoration: underline;
|
| 421 |
-
font-size: 12px;
|
| 422 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 423 |
-
color: black;
|
| 424 |
-
display:inline-block;
|
| 425 |
-
padding-bottom: 8px;
|
| 426 |
-
width:100%;
|
| 427 |
-
/*white-space: nowrap;
|
| 428 |
-
overflow: hidden;
|
| 429 |
-
text-overflow: ellipsis;*/
|
| 430 |
-
}
|
| 431 |
-
}
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
.show-more {
|
| 436 |
-
background-color: #E5E4E2;
|
| 437 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 438 |
-
border-radius:4px;
|
| 439 |
-
padding-top:3px;
|
| 440 |
-
padding-bottom:3px;
|
| 441 |
-
padding-left:3px;
|
| 442 |
-
padding-right:3px;
|
| 443 |
-
font-size: 12px;
|
| 444 |
-
display: box;
|
| 445 |
-
border: none;
|
| 446 |
-
|
| 447 |
-
}
|
| 448 |
-
|
| 449 |
-
.show-more:hover {
|
| 450 |
-
background-color: black;
|
| 451 |
-
color: white;
|
| 452 |
-
}
|
| 453 |
-
|
| 454 |
-
.show-less {
|
| 455 |
-
background-color: #E5E4E2;
|
| 456 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 457 |
-
border-radius:4px;
|
| 458 |
-
padding-top:3px;
|
| 459 |
-
padding-bottom:3px;
|
| 460 |
-
padding-left:3px;
|
| 461 |
-
padding-right:3px;
|
| 462 |
-
font-size: 12px;
|
| 463 |
-
border: none;
|
| 464 |
-
display: none;
|
| 465 |
-
}
|
| 466 |
-
|
| 467 |
-
.show-less:hover {
|
| 468 |
-
background-color: black;
|
| 469 |
-
color: white;
|
| 470 |
-
}
|
| 471 |
-
|
| 472 |
-
.word-cloud-container{
|
| 473 |
-
word-wrap: break-word;
|
| 474 |
-
padding-bottom: 10px;
|
| 475 |
-
|
| 476 |
-
}
|
| 477 |
-
|
| 478 |
-
.wc-tokens{
|
| 479 |
-
font-family: Arial, Helvetica, sans-serif;
|
| 480 |
-
font-size: 13.2px;
|
| 481 |
-
cursor: pointer;
|
| 482 |
-
}
|
| 483 |
-
|
| 484 |
-
.wc-tokens:hover{
|
| 485 |
-
text-decoration: underline;
|
| 486 |
-
}
|
| 487 |
-
|
| 488 |
-
.word-cloud-section{
|
| 489 |
-
padding-bottom: 10px;
|
| 490 |
-
display: none;
|
| 491 |
-
word-wrap: break-word;
|
| 492 |
-
}
|
| 493 |
-
|
| 494 |
-
.show-more-word-cloud{
|
| 495 |
-
padding-bottom: 23px;
|
| 496 |
-
text-align: center;
|
| 497 |
-
}
|
| 498 |
-
|
| 499 |
-
.three-dots{
|
| 500 |
-
font-size: 30px;
|
| 501 |
-
margin: 0;
|
| 502 |
-
line-height:0;
|
| 503 |
-
vertical-align: top;
|
| 504 |
-
padding: 0;
|
| 505 |
-
cursor: pointer;
|
| 506 |
-
}
|
| 507 |
-
|
| 508 |
-
.three-dots:hover{
|
| 509 |
-
font-size: 25px;
|
| 510 |
}
|
|
|
|
| 1 |
+
html {
|
| 2 |
+
scroll-behavior: smooth;
|
| 3 |
+
}
|
| 4 |
+
|
| 5 |
+
@media screen and (min-width: 800px) {
|
| 6 |
+
a.headline {
|
| 7 |
+
background-color: #E5E4E2;
|
| 8 |
+
display: block;
|
| 9 |
+
width: relative;
|
| 10 |
+
text-decoration: none;
|
| 11 |
+
color: black;
|
| 12 |
+
line-height: 1.2;
|
| 13 |
+
align: justify;
|
| 14 |
+
border-left: 5px solid transparent;
|
| 15 |
+
border-top: 5px solid transparent;
|
| 16 |
+
border-bottom: 5px solid transparent;
|
| 17 |
+
border-right: 0px;
|
| 18 |
+
font-weight: bold;
|
| 19 |
+
font-size: 18px;
|
| 20 |
+
padding-right: 5px;
|
| 21 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
@media screen and (max-width: 800px) {
|
| 26 |
+
a.headline {
|
| 27 |
+
background-color: #E5E4E2;
|
| 28 |
+
display: block;
|
| 29 |
+
width: relative;
|
| 30 |
+
text-decoration: none;
|
| 31 |
+
color: black;
|
| 32 |
+
line-height: 1.2;
|
| 33 |
+
align: justify;
|
| 34 |
+
border-left: 5px solid transparent;
|
| 35 |
+
border-top: 5px solid transparent;
|
| 36 |
+
border-bottom: 5px solid transparent;
|
| 37 |
+
border-right: 0px;
|
| 38 |
+
font-weight: bold;
|
| 39 |
+
font-size: 16.5px;
|
| 40 |
+
padding-right: 5px;
|
| 41 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 42 |
+
}
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
@media screen and (min-width: 800px) {
|
| 46 |
+
a.description {
|
| 47 |
+
background-color: #E5E4E2;
|
| 48 |
+
align:justify;
|
| 49 |
+
text-align: justify;
|
| 50 |
+
display: block;
|
| 51 |
+
height:100%;
|
| 52 |
+
width: relative;
|
| 53 |
+
text-decoration: none;
|
| 54 |
+
border-left: 5px solid transparent;
|
| 55 |
+
border-top: 0px;
|
| 56 |
+
border-bottom: 7px solid transparent;
|
| 57 |
+
border-right: 0px;
|
| 58 |
+
font-size: 14px;
|
| 59 |
+
padding-right: 5px;
|
| 60 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 61 |
+
color: dimgrey;
|
| 62 |
+
}
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
@media screen and (max-width: 800px) {
|
| 66 |
+
a.description {
|
| 67 |
+
background-color: #E5E4E2;
|
| 68 |
+
align:justify;
|
| 69 |
+
text-align: justify;
|
| 70 |
+
display: block;
|
| 71 |
+
height:100%;
|
| 72 |
+
width: relative;
|
| 73 |
+
text-decoration: none;
|
| 74 |
+
border-left: 5px solid transparent;
|
| 75 |
+
border-top: 0px;
|
| 76 |
+
border-bottom: 7px solid transparent;
|
| 77 |
+
border-right: 0px;
|
| 78 |
+
font-size: 12.5px;
|
| 79 |
+
padding-right: 5px;
|
| 80 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 81 |
+
color: dimgrey;
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
@media screen and (min-width: 800px) {
|
| 86 |
+
a.time {
|
| 87 |
+
background-color: #E5E4E2;
|
| 88 |
+
align:justify;
|
| 89 |
+
display: block;
|
| 90 |
+
height:100%;
|
| 91 |
+
width: relative;
|
| 92 |
+
text-decoration: none;
|
| 93 |
+
border-left: 5px solid transparent;
|
| 94 |
+
border-top: 0px;
|
| 95 |
+
border-bottom: 1px solid transparent;
|
| 96 |
+
border-right: 0px;
|
| 97 |
+
padding-right: 5px;
|
| 98 |
+
font-size: 11px;
|
| 99 |
+
padding-bottom: 5px;
|
| 100 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 101 |
+
color: green;
|
| 102 |
+
}
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
@media screen and (max-width: 800px) {
|
| 106 |
+
a.time {
|
| 107 |
+
background-color: #E5E4E2;
|
| 108 |
+
align:justify;
|
| 109 |
+
display: block;
|
| 110 |
+
height:100%;
|
| 111 |
+
width: relative;
|
| 112 |
+
text-decoration: none;
|
| 113 |
+
border-left: 5px solid transparent;
|
| 114 |
+
border-top: 0px;
|
| 115 |
+
border-bottom: 1px solid transparent;
|
| 116 |
+
border-right: 0px;
|
| 117 |
+
padding-right: 5px;
|
| 118 |
+
font-size: 10px;
|
| 119 |
+
padding-bottom: 5px;
|
| 120 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 121 |
+
color: green;
|
| 122 |
+
}
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
.box {
|
| 126 |
+
display: flex;
|
| 127 |
+
justify-content: center;
|
| 128 |
+
align-items: center;
|
| 129 |
+
height: inherit;
|
| 130 |
+
padding: 20px;
|
| 131 |
+
}
|
| 132 |
+
@media screen and (min-width: 800px) {
|
| 133 |
+
form {
|
| 134 |
+
width: 50%;
|
| 135 |
+
overflow-x: hidden;
|
| 136 |
+
padding: 20px;
|
| 137 |
+
border-radius: 10px;
|
| 138 |
+
background: #fff;
|
| 139 |
+
box-shadow: 0 0 20px 0 #095484;
|
| 140 |
+
}}
|
| 141 |
+
|
| 142 |
+
@media screen and (max-width: 800px) {
|
| 143 |
+
form {
|
| 144 |
+
width: 100%;
|
| 145 |
+
overflow-x: hidden;
|
| 146 |
+
padding: 20px;
|
| 147 |
+
border-radius: 10px;
|
| 148 |
+
background: #fff;
|
| 149 |
+
box-shadow: 0 0 15px 0 #095484;
|
| 150 |
+
}}
|
| 151 |
+
.banner {
|
| 152 |
+
position: relative;
|
| 153 |
+
height: 30px;
|
| 154 |
+
/* background-size: cover; */
|
| 155 |
+
display: flex;
|
| 156 |
+
/* justify-content: center; */
|
| 157 |
+
/* align-items: center; */
|
| 158 |
+
/* text-align: center; */
|
| 159 |
+
}
|
| 160 |
+
@media screen and (min-width: 800px) {
|
| 161 |
+
h1 {
|
| 162 |
+
position: absolute;
|
| 163 |
+
margin: 0;
|
| 164 |
+
padding-left: 50px;
|
| 165 |
+
font-size: 25px;
|
| 166 |
+
color: black;
|
| 167 |
+
z-index: 2;
|
| 168 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 169 |
+
}
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
@media screen and (max-width: 800px) {
|
| 173 |
+
h1 {
|
| 174 |
+
position: absolute;
|
| 175 |
+
margin: 0;
|
| 176 |
+
padding-left: 40px;
|
| 177 |
+
font-size: 24px;
|
| 178 |
+
color: black;
|
| 179 |
+
z-index: 2;
|
| 180 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 181 |
+
}
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
p.unavailable {
|
| 185 |
+
background-color: #E5E4E2;
|
| 186 |
+
display: block;
|
| 187 |
+
width: 100%;
|
| 188 |
+
text-decoration: none;
|
| 189 |
+
color: black;
|
| 190 |
+
line-height: 1.2;
|
| 191 |
+
align: justify;
|
| 192 |
+
border-left: 5px solid transparent;
|
| 193 |
+
border-top: 5px solid transparent;
|
| 194 |
+
border-bottom: 5px solid transparent;
|
| 195 |
+
border-right: 0px;
|
| 196 |
+
font-weight: bold;
|
| 197 |
+
font-size: 18px;
|
| 198 |
+
padding-right: 5px;
|
| 199 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 200 |
+
}
|
| 201 |
+
div.news-item{
|
| 202 |
+
background-color: #E5E4E2;
|
| 203 |
+
/*box-shadow: rgba(0, 0, 0, 0.4) -1px 0px 5px, rgba(0, 0, 0, 0.5) 0px 4px 5px -3px, rgba(0, 0, 0, 0.2) 0px -3px 0px inset;*/
|
| 204 |
+
box-shadow: rgba(0, 0, 0, 0.25) 0px 0px 5px 1px, rgba(0, 0, 0, 0.1) 0px 4px 5px -3px, rgba(0, 0, 0, 0.2) 0px -1px 0px inset;
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
}
|
| 208 |
+
div.news-item:hover{
|
| 209 |
+
box-shadow: none;
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
@media screen and (min-width: 800px) {
|
| 213 |
+
p.srctxt {
|
| 214 |
+
align:justify;
|
| 215 |
+
text-align: justify;
|
| 216 |
+
word-break: break-all;
|
| 217 |
+
font-size: 11px;
|
| 218 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 219 |
+
}
|
| 220 |
+
.logo-img{
|
| 221 |
+
margin-right: 10px;
|
| 222 |
+
vertical-align: center;
|
| 223 |
+
/* position: relative; */
|
| 224 |
+
width: 34px;
|
| 225 |
+
height: 34px;
|
| 226 |
+
|
| 227 |
+
}
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
@media screen and (max-width: 800px) {
|
| 231 |
+
p.srctxt {
|
| 232 |
+
align:justify;
|
| 233 |
+
text-align: justify;
|
| 234 |
+
word-break: break-all;
|
| 235 |
+
font-size: 9px;
|
| 236 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 237 |
+
}
|
| 238 |
+
.logo-img{
|
| 239 |
+
margin-right: 10px;
|
| 240 |
+
vertical-align: top;
|
| 241 |
+
/* position: absolute; */
|
| 242 |
+
width: 30px;
|
| 243 |
+
height: 30px;
|
| 244 |
+
}
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
.float{
|
| 248 |
+
position:fixed;
|
| 249 |
+
width:25px;
|
| 250 |
+
height:25px;
|
| 251 |
+
bottom:15px;
|
| 252 |
+
right:12px;
|
| 253 |
+
background-color: white;
|
| 254 |
+
border-radius:50%;
|
| 255 |
+
text-align:center;
|
| 256 |
+
vertical-align:center;
|
| 257 |
+
z-index: 99999998;
|
| 258 |
+
font-size:0;
|
| 259 |
+
cursor:pointer;
|
| 260 |
+
animation: beatan 0.8s infinite alternate;
|
| 261 |
+
|
| 262 |
+
}
|
| 263 |
+
.top-float{
|
| 264 |
+
position:fixed;
|
| 265 |
+
width:25px;
|
| 266 |
+
height:25px;
|
| 267 |
+
bottom:52px;
|
| 268 |
+
right:12px;
|
| 269 |
+
background-color: white;
|
| 270 |
+
border-radius:50%;
|
| 271 |
+
text-align:center;
|
| 272 |
+
vertical-align:center;
|
| 273 |
+
z-index: 99999998;
|
| 274 |
+
font-size:0;
|
| 275 |
+
cursor:pointer;
|
| 276 |
+
animation: beatan 0.8s infinite alternate;
|
| 277 |
+
|
| 278 |
+
}
|
| 279 |
+
.my-float{
|
| 280 |
+
margin-top:22px;
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
@keyframes beatan{
|
| 284 |
+
to { transform: scale(1.1); }
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
.loader {
|
| 288 |
+
position: fixed;
|
| 289 |
+
left: 0px;
|
| 290 |
+
top: 0px;
|
| 291 |
+
width: 100%;
|
| 292 |
+
height: 100%;
|
| 293 |
+
z-index: 99999999999;
|
| 294 |
+
background: url('../static/loader.gif') 50% 50% no-repeat rgb(255,255,255);
|
| 295 |
+
}
|
| 296 |
+
|
| 297 |
+
.highlight {
|
| 298 |
+
background-color: yellow;
|
| 299 |
+
font-weight: bold;
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
.input-container {
|
| 303 |
+
position: relative;
|
| 304 |
+
padding-bottom: 10px;
|
| 305 |
+
}
|
| 306 |
+
|
| 307 |
+
.keyword-input {
|
| 308 |
+
|
| 309 |
+
border-radius: 5px;
|
| 310 |
+
transition: border-color 0.3s ease;
|
| 311 |
+
border: 1px solid silver;
|
| 312 |
+
width: 10em;
|
| 313 |
+
height: 1.5em;
|
| 314 |
+
padding-left: 0.5em;
|
| 315 |
+
outline: none;
|
| 316 |
+
overflow: hidden;
|
| 317 |
+
|
| 318 |
+
}
|
| 319 |
+
|
| 320 |
+
.clear-btn {
|
| 321 |
+
position: absolute;
|
| 322 |
+
font-size: 20px;
|
| 323 |
+
left: 129px;
|
| 324 |
+
transform: translateY(-105%);
|
| 325 |
+
cursor: pointer;
|
| 326 |
+
opacity: 0;
|
| 327 |
+
transition: opacity 0.3s ease;
|
| 328 |
+
}
|
| 329 |
+
|
| 330 |
+
.clear-btn.show {
|
| 331 |
+
opacity: 1;
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
@media screen and (min-width: 800px) {
|
| 335 |
+
a.article-category {
|
| 336 |
+
background-color: #E5E4E2;
|
| 337 |
+
align:justify;
|
| 338 |
+
display: block;
|
| 339 |
+
height:100%;
|
| 340 |
+
width: relative;
|
| 341 |
+
text-decoration: none;
|
| 342 |
+
border-left: 5px solid transparent;
|
| 343 |
+
border-top: 0px;
|
| 344 |
+
font-weight: bold;
|
| 345 |
+
border-bottom: 1px solid transparent;
|
| 346 |
+
border-right: 0px;
|
| 347 |
+
padding-right: 5px;
|
| 348 |
+
font-size: 11px;
|
| 349 |
+
padding-bottom: 0px;
|
| 350 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 351 |
+
color: green;
|
| 352 |
+
}
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
@media screen and (max-width: 800px) {
|
| 356 |
+
a.article-category {
|
| 357 |
+
background-color: #E5E4E2;
|
| 358 |
+
align:justify;
|
| 359 |
+
display: block;
|
| 360 |
+
height:100%;
|
| 361 |
+
font-weight: bold;
|
| 362 |
+
width: relative;
|
| 363 |
+
text-decoration: none;
|
| 364 |
+
border-left: 5px solid transparent;
|
| 365 |
+
border-top: 0px;
|
| 366 |
+
border-bottom: 1px solid transparent;
|
| 367 |
+
border-right: 0px;
|
| 368 |
+
padding-right: 5px;
|
| 369 |
+
font-size: 10px;
|
| 370 |
+
padding-bottom: 0px;
|
| 371 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 372 |
+
color: green;
|
| 373 |
+
}
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
.content {
|
| 377 |
+
display: none;
|
| 378 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 379 |
+
|
| 380 |
+
padding-right: 5px;
|
| 381 |
+
|
| 382 |
+
padding-top: 5px;
|
| 383 |
+
border-left: 5px solid transparent;
|
| 384 |
+
}
|
| 385 |
+
|
| 386 |
+
.container{
|
| 387 |
+
padding-bottom:10px;
|
| 388 |
+
}
|
| 389 |
+
|
| 390 |
+
.show-similar-button-container{
|
| 391 |
+
display: flex;
|
| 392 |
+
flex-direction: column;
|
| 393 |
+
align-items: center;
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
.similar-news-item:hover {
|
| 397 |
+
text-decoration: none;
|
| 398 |
+
}
|
| 399 |
+
|
| 400 |
+
@media screen and (min-width: 800px) {
|
| 401 |
+
.similar-news-item {
|
| 402 |
+
text-align: justify;
|
| 403 |
+
text-decoration: underline;
|
| 404 |
+
font-size: 14px;
|
| 405 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 406 |
+
color: black;
|
| 407 |
+
display:inline-block;
|
| 408 |
+
padding-bottom: 10px;
|
| 409 |
+
width:100%;
|
| 410 |
+
/*white-space: nowrap;
|
| 411 |
+
overflow: hidden;
|
| 412 |
+
text-overflow: ellipsis;*/
|
| 413 |
+
|
| 414 |
+
}
|
| 415 |
+
}
|
| 416 |
+
|
| 417 |
+
@media screen and (max-width: 800px) {
|
| 418 |
+
.similar-news-item {
|
| 419 |
+
text-align: justify;
|
| 420 |
+
text-decoration: underline;
|
| 421 |
+
font-size: 12px;
|
| 422 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 423 |
+
color: black;
|
| 424 |
+
display:inline-block;
|
| 425 |
+
padding-bottom: 8px;
|
| 426 |
+
width:100%;
|
| 427 |
+
/*white-space: nowrap;
|
| 428 |
+
overflow: hidden;
|
| 429 |
+
text-overflow: ellipsis;*/
|
| 430 |
+
}
|
| 431 |
+
}
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
.show-more {
|
| 436 |
+
background-color: #E5E4E2;
|
| 437 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 438 |
+
border-radius:4px;
|
| 439 |
+
padding-top:3px;
|
| 440 |
+
padding-bottom:3px;
|
| 441 |
+
padding-left:3px;
|
| 442 |
+
padding-right:3px;
|
| 443 |
+
font-size: 12px;
|
| 444 |
+
display: box;
|
| 445 |
+
border: none;
|
| 446 |
+
|
| 447 |
+
}
|
| 448 |
+
|
| 449 |
+
.show-more:hover {
|
| 450 |
+
background-color: black;
|
| 451 |
+
color: white;
|
| 452 |
+
}
|
| 453 |
+
|
| 454 |
+
.show-less {
|
| 455 |
+
background-color: #E5E4E2;
|
| 456 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 457 |
+
border-radius:4px;
|
| 458 |
+
padding-top:3px;
|
| 459 |
+
padding-bottom:3px;
|
| 460 |
+
padding-left:3px;
|
| 461 |
+
padding-right:3px;
|
| 462 |
+
font-size: 12px;
|
| 463 |
+
border: none;
|
| 464 |
+
display: none;
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
.show-less:hover {
|
| 468 |
+
background-color: black;
|
| 469 |
+
color: white;
|
| 470 |
+
}
|
| 471 |
+
|
| 472 |
+
.word-cloud-container{
|
| 473 |
+
word-wrap: break-word;
|
| 474 |
+
padding-bottom: 10px;
|
| 475 |
+
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
.wc-tokens{
|
| 479 |
+
font-family: Arial, Helvetica, sans-serif;
|
| 480 |
+
font-size: 13.2px;
|
| 481 |
+
cursor: pointer;
|
| 482 |
+
}
|
| 483 |
+
|
| 484 |
+
.wc-tokens:hover{
|
| 485 |
+
text-decoration: underline;
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
.word-cloud-section{
|
| 489 |
+
padding-bottom: 10px;
|
| 490 |
+
display: none;
|
| 491 |
+
word-wrap: break-word;
|
| 492 |
+
}
|
| 493 |
+
|
| 494 |
+
.show-more-word-cloud{
|
| 495 |
+
padding-bottom: 23px;
|
| 496 |
+
text-align: center;
|
| 497 |
+
}
|
| 498 |
+
|
| 499 |
+
.three-dots{
|
| 500 |
+
font-size: 30px;
|
| 501 |
+
margin: 0;
|
| 502 |
+
line-height:0;
|
| 503 |
+
vertical-align: top;
|
| 504 |
+
padding: 0;
|
| 505 |
+
cursor: pointer;
|
| 506 |
+
}
|
| 507 |
+
|
| 508 |
+
.three-dots:hover{
|
| 509 |
+
font-size: 25px;
|
| 510 |
}
|
templates/index.html
CHANGED
|
@@ -1,205 +1,205 @@
|
|
| 1 |
-
<!DOCTYPE html>
|
| 2 |
-
<html lang="en">
|
| 3 |
-
<head>
|
| 4 |
-
|
| 5 |
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
-
<link rel="preload" href="../static/loader.gif" as="image">
|
| 7 |
-
<link rel="preload" href="../static/favicon_new.png" as="image">
|
| 8 |
-
<link rel="preload" href="../static/refresh_reload_icon.png" as="image">
|
| 9 |
-
<link rel="preload" href="../static/top-icon.png" as="image">
|
| 10 |
-
<link rel="icon" href="../static/favicon_new.png" type="image/png">
|
| 11 |
-
|
| 12 |
-
<meta charset="UTF-8">
|
| 13 |
-
<title>Latest Indian News</title>
|
| 14 |
-
<link rel="stylesheet" href="static/styles.css">
|
| 15 |
-
<a id="top-loc"></a>
|
| 16 |
-
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
|
| 17 |
-
<!--
|
| 18 |
-
<script>
|
| 19 |
-
$(window).load(function(){
|
| 20 |
-
$('.loader').fadeOut();
|
| 21 |
-
});
|
| 22 |
-
</script>
|
| 23 |
-
-->
|
| 24 |
-
|
| 25 |
-
<script>
|
| 26 |
-
function filterContent(match_case) {
|
| 27 |
-
var keyword = document.getElementById("keywordInput").value;
|
| 28 |
-
if (match_case == false)
|
| 29 |
-
{
|
| 30 |
-
/*var keyword = document.getElementById("keywordInput").value.toLowerCase(); */
|
| 31 |
-
/*var regex = new RegExp("\\b" + keyword + "\\b", "gi"); */
|
| 32 |
-
}
|
| 33 |
-
var clearbtn = document.getElementById("clearBtn");
|
| 34 |
-
|
| 35 |
-
if (keyword !== "")
|
| 36 |
-
{
|
| 37 |
-
clearbtn.style.opacity = 1;
|
| 38 |
-
var items = document.getElementsByClassName("news-item");
|
| 39 |
-
for (var i = 0; i < items.length; i++)
|
| 40 |
-
{
|
| 41 |
-
var headline = items[i].querySelector('.headline');
|
| 42 |
-
var description = items[i].querySelector('.description');
|
| 43 |
-
if (match_case == true)
|
| 44 |
-
{
|
| 45 |
-
var article_category = items[i].querySelector('.article-category');
|
| 46 |
-
var src_time = items[i].querySelector('.time');
|
| 47 |
-
var itemText = headline.textContent.concat(" ", description.textContent, " ", article_category.textContent, " ", src_time.textContent)
|
| 48 |
-
}
|
| 49 |
-
else
|
| 50 |
-
{
|
| 51 |
-
var itemText = headline.textContent.concat(" ", description.textContent, " ")
|
| 52 |
-
}
|
| 53 |
-
|
| 54 |
-
if (match_case == false)
|
| 55 |
-
{ var regex = new RegExp("\\b" + keyword + "\\b", "gi");
|
| 56 |
-
itemText = itemText.toLowerCase();
|
| 57 |
-
if (regex.test(itemText) == true)
|
| 58 |
-
{
|
| 59 |
-
items[i].style.display = "block";
|
| 60 |
-
highlightKeyword(headline, keyword, match_case);
|
| 61 |
-
highlightKeyword(description, keyword, match_case);
|
| 62 |
-
}
|
| 63 |
-
else
|
| 64 |
-
{
|
| 65 |
-
items[i].style.display = "none";
|
| 66 |
-
}
|
| 67 |
-
}
|
| 68 |
-
else
|
| 69 |
-
{
|
| 70 |
-
if (itemText.includes(keyword))
|
| 71 |
-
{
|
| 72 |
-
items[i].style.display = "block";
|
| 73 |
-
highlightKeyword(headline, keyword, match_case);
|
| 74 |
-
highlightKeyword(description, keyword, match_case);
|
| 75 |
-
highlightKeyword(article_category, keyword, match_case);
|
| 76 |
-
highlightKeyword(src_time, keyword, match_case);
|
| 77 |
-
|
| 78 |
-
}
|
| 79 |
-
else
|
| 80 |
-
{
|
| 81 |
-
items[i].style.display = "none";
|
| 82 |
-
}
|
| 83 |
-
}
|
| 84 |
-
}
|
| 85 |
-
}
|
| 86 |
-
else
|
| 87 |
-
{
|
| 88 |
-
clearFilter();
|
| 89 |
-
}
|
| 90 |
-
}
|
| 91 |
-
|
| 92 |
-
function clearFilter() {
|
| 93 |
-
var items = document.getElementsByClassName("news-item");
|
| 94 |
-
var clearbtn = document.getElementById("clearBtn");
|
| 95 |
-
clearbtn.style.opacity=0;
|
| 96 |
-
for (var i = 0; i < items.length; i++) {
|
| 97 |
-
var headline = items[i].querySelector('.headline');
|
| 98 |
-
var description = items[i].querySelector('.description');
|
| 99 |
-
var article_category = items[i].querySelector('.article-category');
|
| 100 |
-
var src_time = items[i].querySelector('.time');
|
| 101 |
-
items[i].style.display = "block";
|
| 102 |
-
headline.innerHTML = headline.textContent; // Remove highlighting
|
| 103 |
-
description.innerHTML = description.textContent; // Remove highlighting
|
| 104 |
-
article_category.innerHTML = article_category.textContent; // Remove highlighting
|
| 105 |
-
src_time.innerHTML = src_time.textContent; // Remove highlighting
|
| 106 |
-
}
|
| 107 |
-
|
| 108 |
-
document.getElementById("keywordInput").value = ""; // Clear input field
|
| 109 |
-
}
|
| 110 |
-
|
| 111 |
-
function highlightKeyword(element, keyword, match_case) {
|
| 112 |
-
var regex = new RegExp(keyword);
|
| 113 |
-
if (match_case == false)
|
| 114 |
-
{
|
| 115 |
-
var regex = new RegExp("\\b" + keyword + "\\b", 'gi');
|
| 116 |
-
}
|
| 117 |
-
element.innerHTML = element.textContent.replace(regex, function(match) {
|
| 118 |
-
return '<span class="highlight">' + match + '</span>';
|
| 119 |
-
});
|
| 120 |
-
}
|
| 121 |
-
|
| 122 |
-
</script>
|
| 123 |
-
|
| 124 |
-
<script>
|
| 125 |
-
document.addEventListener('DOMContentLoaded', function() {
|
| 126 |
-
const containers = document.querySelectorAll('.container');
|
| 127 |
-
|
| 128 |
-
containers.forEach(container => {
|
| 129 |
-
const content = container.querySelector('.content');
|
| 130 |
-
const showMoreBtn = container.querySelector('.show-more');
|
| 131 |
-
const showLessBtn = container.querySelector('.show-less');
|
| 132 |
-
|
| 133 |
-
showMoreBtn.addEventListener('click', function() {
|
| 134 |
-
/* var similar_news_items = document.getElementsByClassName("content");
|
| 135 |
-
var show_less_items = document.getElementsByClassName("show-less");
|
| 136 |
-
var show_more_items = document.getElementsByClassName("show-more");
|
| 137 |
-
for (var i = 0; i < similar_news_items.length; i++) {
|
| 138 |
-
similar_news_items[i].style.display = 'none';
|
| 139 |
-
show_more_items[i].style.display = 'block';
|
| 140 |
-
show_less_items[i].style.display = 'none';
|
| 141 |
-
}
|
| 142 |
-
*/
|
| 143 |
-
|
| 144 |
-
content.style.display = 'block';
|
| 145 |
-
content.style.opacity = 1;
|
| 146 |
-
showMoreBtn.style.display = 'none';
|
| 147 |
-
showLessBtn.style.display = 'block';
|
| 148 |
-
});
|
| 149 |
-
|
| 150 |
-
showLessBtn.addEventListener('click', function() {
|
| 151 |
-
document.documentElement.style.scrollBehavior = "auto";
|
| 152 |
-
var max_h = content.parentElement.parentElement.clientHeight;
|
| 153 |
-
content.style.display = 'none';
|
| 154 |
-
showMoreBtn.style.display = 'block';
|
| 155 |
-
showLessBtn.style.display = 'none';
|
| 156 |
-
var min_h = content.parentElement.parentElement.clientHeight;
|
| 157 |
-
$(window).scrollTop($(window).scrollTop() - (max_h - min_h) || 0);
|
| 158 |
-
document.documentElement.style.scrollBehavior = "smooth";
|
| 159 |
-
});
|
| 160 |
-
});
|
| 161 |
-
});
|
| 162 |
-
</script>
|
| 163 |
-
|
| 164 |
-
<script>
|
| 165 |
-
function wc_search(keyword)
|
| 166 |
-
{
|
| 167 |
-
clearFilter();
|
| 168 |
-
document.getElementById("keywordInput").value = keyword;
|
| 169 |
-
filterContent(false);
|
| 170 |
-
}
|
| 171 |
-
|
| 172 |
-
function word_cloud_display()
|
| 173 |
-
{
|
| 174 |
-
var word_cloud_section = document.getElementById("word-cloud-section-id");
|
| 175 |
-
if (word_cloud_section.style.display == 'block')
|
| 176 |
-
{
|
| 177 |
-
word_cloud_section.style.display = 'none';
|
| 178 |
-
}
|
| 179 |
-
else
|
| 180 |
-
{
|
| 181 |
-
word_cloud_section.style.display = 'block';
|
| 182 |
-
}
|
| 183 |
-
}
|
| 184 |
-
</script>
|
| 185 |
-
|
| 186 |
-
<script>
|
| 187 |
-
function showSearchInfo() {
|
| 188 |
-
alert("- Search is case-sensitive.\n- Search for news category (NATION, WORLD, SPORTS, ENTERTAINMENT, BUSINESS, TECHNOLOGY, HEALTH and SCIENCE) to filter news by category.\n- Search for news source (like zeebiz.com, ndtv.com, etc.) to filter news by source.")
|
| 189 |
-
}
|
| 190 |
-
</script>
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
</head>
|
| 194 |
-
<body>
|
| 195 |
-
<!--<div class="loader"></div>-->
|
| 196 |
-
{{body | safe}}
|
| 197 |
-
|
| 198 |
-
<a id="top_theme" class="top-float" onclick="window.scrollTo(0, 0);">
|
| 199 |
-
<img id="top-theme-icon" alt="_" src="../static/top-icon.png" style="border-radius:50%; border: 3px solid #b9bfc4; background-color: white;" width="25px" height="25px" border="0">
|
| 200 |
-
</a>
|
| 201 |
-
<a href="javascript:window.location.reload(true)" id="theme" class="float">
|
| 202 |
-
<img id="theme-icon" alt="_" src="../static/refresh_reload_icon.png" style="border-radius:50%; border: 3px solid #b9bfc4; background-color: white;" width=25px height=25px border="0" />
|
| 203 |
-
</a>
|
| 204 |
-
</body>
|
| 205 |
-
</html>
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<link rel="preload" href="../static/loader.gif" as="image">
|
| 7 |
+
<link rel="preload" href="../static/favicon_new.png" as="image">
|
| 8 |
+
<link rel="preload" href="../static/refresh_reload_icon.png" as="image">
|
| 9 |
+
<link rel="preload" href="../static/top-icon.png" as="image">
|
| 10 |
+
<link rel="icon" href="../static/favicon_new.png" type="image/png">
|
| 11 |
+
|
| 12 |
+
<meta charset="UTF-8">
|
| 13 |
+
<title>Latest Indian News</title>
|
| 14 |
+
<link rel="stylesheet" href="static/styles.css">
|
| 15 |
+
<a id="top-loc"></a>
|
| 16 |
+
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
|
| 17 |
+
<!--
|
| 18 |
+
<script>
|
| 19 |
+
$(window).load(function(){
|
| 20 |
+
$('.loader').fadeOut();
|
| 21 |
+
});
|
| 22 |
+
</script>
|
| 23 |
+
-->
|
| 24 |
+
|
| 25 |
+
<script>
|
| 26 |
+
function filterContent(match_case) {
|
| 27 |
+
var keyword = document.getElementById("keywordInput").value;
|
| 28 |
+
if (match_case == false)
|
| 29 |
+
{
|
| 30 |
+
/*var keyword = document.getElementById("keywordInput").value.toLowerCase(); */
|
| 31 |
+
/*var regex = new RegExp("\\b" + keyword + "\\b", "gi"); */
|
| 32 |
+
}
|
| 33 |
+
var clearbtn = document.getElementById("clearBtn");
|
| 34 |
+
|
| 35 |
+
if (keyword !== "")
|
| 36 |
+
{
|
| 37 |
+
clearbtn.style.opacity = 1;
|
| 38 |
+
var items = document.getElementsByClassName("news-item");
|
| 39 |
+
for (var i = 0; i < items.length; i++)
|
| 40 |
+
{
|
| 41 |
+
var headline = items[i].querySelector('.headline');
|
| 42 |
+
var description = items[i].querySelector('.description');
|
| 43 |
+
if (match_case == true)
|
| 44 |
+
{
|
| 45 |
+
var article_category = items[i].querySelector('.article-category');
|
| 46 |
+
var src_time = items[i].querySelector('.time');
|
| 47 |
+
var itemText = headline.textContent.concat(" ", description.textContent, " ", article_category.textContent, " ", src_time.textContent)
|
| 48 |
+
}
|
| 49 |
+
else
|
| 50 |
+
{
|
| 51 |
+
var itemText = headline.textContent.concat(" ", description.textContent, " ")
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
if (match_case == false)
|
| 55 |
+
{ var regex = new RegExp("\\b" + keyword + "\\b", "gi");
|
| 56 |
+
itemText = itemText.toLowerCase();
|
| 57 |
+
if (regex.test(itemText) == true)
|
| 58 |
+
{
|
| 59 |
+
items[i].style.display = "block";
|
| 60 |
+
highlightKeyword(headline, keyword, match_case);
|
| 61 |
+
highlightKeyword(description, keyword, match_case);
|
| 62 |
+
}
|
| 63 |
+
else
|
| 64 |
+
{
|
| 65 |
+
items[i].style.display = "none";
|
| 66 |
+
}
|
| 67 |
+
}
|
| 68 |
+
else
|
| 69 |
+
{
|
| 70 |
+
if (itemText.includes(keyword))
|
| 71 |
+
{
|
| 72 |
+
items[i].style.display = "block";
|
| 73 |
+
highlightKeyword(headline, keyword, match_case);
|
| 74 |
+
highlightKeyword(description, keyword, match_case);
|
| 75 |
+
highlightKeyword(article_category, keyword, match_case);
|
| 76 |
+
highlightKeyword(src_time, keyword, match_case);
|
| 77 |
+
|
| 78 |
+
}
|
| 79 |
+
else
|
| 80 |
+
{
|
| 81 |
+
items[i].style.display = "none";
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
}
|
| 85 |
+
}
|
| 86 |
+
else
|
| 87 |
+
{
|
| 88 |
+
clearFilter();
|
| 89 |
+
}
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
function clearFilter() {
|
| 93 |
+
var items = document.getElementsByClassName("news-item");
|
| 94 |
+
var clearbtn = document.getElementById("clearBtn");
|
| 95 |
+
clearbtn.style.opacity=0;
|
| 96 |
+
for (var i = 0; i < items.length; i++) {
|
| 97 |
+
var headline = items[i].querySelector('.headline');
|
| 98 |
+
var description = items[i].querySelector('.description');
|
| 99 |
+
var article_category = items[i].querySelector('.article-category');
|
| 100 |
+
var src_time = items[i].querySelector('.time');
|
| 101 |
+
items[i].style.display = "block";
|
| 102 |
+
headline.innerHTML = headline.textContent; // Remove highlighting
|
| 103 |
+
description.innerHTML = description.textContent; // Remove highlighting
|
| 104 |
+
article_category.innerHTML = article_category.textContent; // Remove highlighting
|
| 105 |
+
src_time.innerHTML = src_time.textContent; // Remove highlighting
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
document.getElementById("keywordInput").value = ""; // Clear input field
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
function highlightKeyword(element, keyword, match_case) {
|
| 112 |
+
var regex = new RegExp(keyword);
|
| 113 |
+
if (match_case == false)
|
| 114 |
+
{
|
| 115 |
+
var regex = new RegExp("\\b" + keyword + "\\b", 'gi');
|
| 116 |
+
}
|
| 117 |
+
element.innerHTML = element.textContent.replace(regex, function(match) {
|
| 118 |
+
return '<span class="highlight">' + match + '</span>';
|
| 119 |
+
});
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
</script>
|
| 123 |
+
|
| 124 |
+
<script>
|
| 125 |
+
document.addEventListener('DOMContentLoaded', function() {
|
| 126 |
+
const containers = document.querySelectorAll('.container');
|
| 127 |
+
|
| 128 |
+
containers.forEach(container => {
|
| 129 |
+
const content = container.querySelector('.content');
|
| 130 |
+
const showMoreBtn = container.querySelector('.show-more');
|
| 131 |
+
const showLessBtn = container.querySelector('.show-less');
|
| 132 |
+
|
| 133 |
+
showMoreBtn.addEventListener('click', function() {
|
| 134 |
+
/* var similar_news_items = document.getElementsByClassName("content");
|
| 135 |
+
var show_less_items = document.getElementsByClassName("show-less");
|
| 136 |
+
var show_more_items = document.getElementsByClassName("show-more");
|
| 137 |
+
for (var i = 0; i < similar_news_items.length; i++) {
|
| 138 |
+
similar_news_items[i].style.display = 'none';
|
| 139 |
+
show_more_items[i].style.display = 'block';
|
| 140 |
+
show_less_items[i].style.display = 'none';
|
| 141 |
+
}
|
| 142 |
+
*/
|
| 143 |
+
|
| 144 |
+
content.style.display = 'block';
|
| 145 |
+
content.style.opacity = 1;
|
| 146 |
+
showMoreBtn.style.display = 'none';
|
| 147 |
+
showLessBtn.style.display = 'block';
|
| 148 |
+
});
|
| 149 |
+
|
| 150 |
+
showLessBtn.addEventListener('click', function() {
|
| 151 |
+
document.documentElement.style.scrollBehavior = "auto";
|
| 152 |
+
var max_h = content.parentElement.parentElement.clientHeight;
|
| 153 |
+
content.style.display = 'none';
|
| 154 |
+
showMoreBtn.style.display = 'block';
|
| 155 |
+
showLessBtn.style.display = 'none';
|
| 156 |
+
var min_h = content.parentElement.parentElement.clientHeight;
|
| 157 |
+
$(window).scrollTop($(window).scrollTop() - (max_h - min_h) || 0);
|
| 158 |
+
document.documentElement.style.scrollBehavior = "smooth";
|
| 159 |
+
});
|
| 160 |
+
});
|
| 161 |
+
});
|
| 162 |
+
</script>
|
| 163 |
+
|
| 164 |
+
<script>
|
| 165 |
+
function wc_search(keyword)
|
| 166 |
+
{
|
| 167 |
+
clearFilter();
|
| 168 |
+
document.getElementById("keywordInput").value = keyword;
|
| 169 |
+
filterContent(false);
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
function word_cloud_display()
|
| 173 |
+
{
|
| 174 |
+
var word_cloud_section = document.getElementById("word-cloud-section-id");
|
| 175 |
+
if (word_cloud_section.style.display == 'block')
|
| 176 |
+
{
|
| 177 |
+
word_cloud_section.style.display = 'none';
|
| 178 |
+
}
|
| 179 |
+
else
|
| 180 |
+
{
|
| 181 |
+
word_cloud_section.style.display = 'block';
|
| 182 |
+
}
|
| 183 |
+
}
|
| 184 |
+
</script>
|
| 185 |
+
|
| 186 |
+
<script>
|
| 187 |
+
function showSearchInfo() {
|
| 188 |
+
alert("- Search is case-sensitive.\n- Search for news category (NATION, WORLD, SPORTS, ENTERTAINMENT, BUSINESS, TECHNOLOGY, HEALTH and SCIENCE) to filter news by category.\n- Search for news source (like zeebiz.com, ndtv.com, etc.) to filter news by source.")
|
| 189 |
+
}
|
| 190 |
+
</script>
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
</head>
|
| 194 |
+
<body>
|
| 195 |
+
<!--<div class="loader"></div>-->
|
| 196 |
+
{{body | safe}}
|
| 197 |
+
|
| 198 |
+
<a id="top_theme" class="top-float" onclick="window.scrollTo(0, 0);">
|
| 199 |
+
<img id="top-theme-icon" alt="_" src="../static/top-icon.png" style="border-radius:50%; border: 3px solid #b9bfc4; background-color: white;" width="25px" height="25px" border="0">
|
| 200 |
+
</a>
|
| 201 |
+
<a href="javascript:window.location.reload(true)" id="theme" class="float">
|
| 202 |
+
<img id="theme-icon" alt="_" src="../static/refresh_reload_icon.png" style="border-radius:50%; border: 3px solid #b9bfc4; background-color: white;" width=25px height=25px border="0" />
|
| 203 |
+
</a>
|
| 204 |
+
</body>
|
| 205 |
+
</html>
|
word_cloud.py
CHANGED
|
@@ -1,653 +1,653 @@
|
|
| 1 |
-
import numpy as np
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import string
|
| 4 |
-
from unidecode import unidecode
|
| 5 |
-
from collections import Counter
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
class TextPreprocessor:
    """
    Configurable text-cleaning pipeline for a pandas Series of strings.

    ``preprocess`` lower-cases the text, transliterates it with ``unidecode``,
    strips URLs, expands English contractions and then, depending on the
    flags given to the constructor, removes punctuation, digits, stop words,
    short words and the most/least frequent words.
    """

    def __init__(self, remove_punct: bool = True, remove_digits: bool = True,
                 remove_stop_words: bool = True,
                 remove_short_words: bool = False, minlen: int = 1, maxlen: int = 1, top_p: float = None,
                 bottom_p: float = None):
        """
        :param remove_punct: bool, replace punctuation with spaces.
        :param remove_digits: bool, replace digits with spaces.
        :param remove_stop_words: bool, drop tokens listed in ``self.stop_words``.
        :param remove_short_words: bool, drop tokens with minlen <= len <= maxlen.
        :param minlen: int, minimum length of token to be removed.
        :param maxlen: int, maximum length of token to be removed.
        :param top_p: float, fraction of most frequent words to remove (None disables).
        :param bottom_p: float, fraction of rarest words to remove (None disables).
        """
        self.remove_punct = remove_punct
        self.remove_digits = remove_digits
        self.remove_stop_words = remove_stop_words
        self.remove_short_words = remove_short_words
        self.minlen = minlen
        self.maxlen = maxlen
        self.top_p = top_p
        self.bottom_p = bottom_p
        # Filled by __remove_top_bottom_words when called with dataset == 'train'.
        self.words_to_remove = []
        self.stop_words = [
            "'d", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
            "s", "t", "u", "v", "w", "x", "y", "z",
            'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost',
            'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amount',
            'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are',
            'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming',
            'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between',
            'beyond', 'both', 'bottom', 'but', 'by', 'ca', 'call', 'can', 'cannot', 'could', 'couldn',
            "couldn't", 'd', 'did', 'do', 'does', 'doing', 'done', 'down', 'due', 'during', 'each',
            'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every',
            'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'first', 'five',
            'for', 'former', 'formerly', 'forty', 'four', 'from', 'front', 'full', 'further', 'get',
            'give', 'go', 'had', 'has', 'have', 'having', 'he', 'hence', 'her', 'here', 'hereafter',
            'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however',
            'hundred', 'i', 'if', 'in', 'indeed', 'into', 'is', 'it', "it's", 'its', 'itself', 'just',
            'keep', 'last', 'latter', 'latterly', 'least', 'less', 'll', 'm', 'ma', 'made', 'make',
            'many', 'say', 'said', 'says', 'told', 'tell', 'may', 'me', 'meanwhile', 'might', 'mine',
            'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name',
            'namely', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none',
            'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'often', 'on', 'once',
            'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves',
            'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'quite', 'rather', 're',
            'rs', 'really', 'regarding', 's', 'same', 'say', 'see', 'seem', 'seemed', 'seeming', 'seems',
            'serious', 'several', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn',
            "shouldn't", 'show', 'side', 'since', 'six', 'sixty', 'so', 'some', 'somehow', 'someone',
            'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 't', 'take', 'ten',
            'than', 'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence',
            'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they',
            'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to',
            'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'under', 'unless',
            'until', 'up', 'upon', 'us', 'used', 'using', 'various', 've', 'very', 'via', 'was', 'wasn',
            "wasn't", 'we', 'well', 'were', 'weren', "weren't", 'what', 'whatever', 'when', 'whence',
            'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever',
            'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why',
            'will', 'with', 'within', 'without', 'won', "won't", 'would', 'wouldn', "wouldn't", 'y',
            'yet', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself',
            'yourselves', '‘d', '‘ll', '‘m', '‘re', '‘s', '‘ve', '’d', '’ll', '’m', '’re', 'new', 'old',
            '’s', '’ve',
        ]

        self.contraction_to_expansion = {
            "ain't": "am not",
            "aren't": "are not",
            "can't": "cannot",
            "can't've": "cannot have",
            "'cause": "because",
            "could've": "could have",
            "couldn't": "could not",
            "couldn't've": "could not have",
            "didn't": "did not",
            "doesn't": "does not",
            "don't": "do not",
            "hadn't": "had not",
            "hadn't've": "had not have",
            "hasn't": "has not",
            "haven't": "have not",
            "he'd": "he would",
            "he'd've": "he would have",
            "he'll": "he will",
            "he'll've": "he will have",
            "he's": "he is",
            "how'd": "how did",
            "how'd'y": "how do you",
            "how'll": "how will",
            "how's": "how is",
            "i'd": "i would",
            "i'd've": "i would have",
            "i'll": "i will",
            "i'll've": "i will have",
            "i'm": "i am",
            "i've": "i have",
            "isn't": "is not",
            "it'd": "it had",
            "it'd've": "it would have",
            "it'll": "it will",
            "it'll've": "it will have",
            "it's": "it is",
            "let's": "let us",
            "ma'am": "madam",
            "mayn't": "may not",
            "might've": "might have",
            "mightn't": "might not",
            "mightn't've": "might not have",
            "must've": "must have",
            "mustn't": "must not",
            "mustn't've": "must not have",
            "needn't": "need not",
            "needn't've": "need not have",
            "o'clock": "of the clock",
            "oughtn't": "ought not",
            "oughtn't've": "ought not have",
            "shan't": "shall not",
            "sha'n't": "shall not",
            "shan't've": "shall not have",
            "she'd": "she would",
            "she'd've": "she would have",
            "she'll": "she will",
            "she'll've": "she will have",
            "she's": "she is",
            "should've": "should have",
            "shouldn't": "should not",
            "shouldn't've": "should not have",
            "so've": "so have",
            "so's": "so is",
            "that'd": "that would",
            "that'd've": "that would have",
            "that's": "that is",
            "there'd": "there had",
            "there'd've": "there would have",
            "there's": "there is",
            "they'd": "they would",
            "they'd've": "they would have",
            "they'll": "they will",
            "they'll've": "they will have",
            "they're": "they are",
            "they've": "they have",
            "to've": "to have",
            "wasn't": "was not",
            "we'd": "we had",
            "we'd've": "we would have",
            "we'll": "we will",
            "we'll've": "we will have",
            "we're": "we are",
            "we've": "we have",
            "weren't": "were not",
            "what'll": "what will",
            "what'll've": "what will have",
            "what're": "what are",
            "what's": "what is",
            "what've": "what have",
            "when's": "when is",
            "when've": "when have",
            "where'd": "where did",
            "where's": "where is",
            "where've": "where have",
            "who'll": "who will",
            "who'll've": "who will have",
            "who's": "who is",
            "who've": "who have",
            "why's": "why is",
            "why've": "why have",
            "will've": "will have",
            "won't": "will not",
            "won't've": "will not have",
            "would've": "would have",
            "wouldn't": "would not",
            "wouldn't've": "would not have",
            "y'all": "you all",
            "y'alls": "you alls",
            "y'all'd": "you all would",
            "y'all'd've": "you all would have",
            "y'all're": "you all are",
            "y'all've": "you all have",
            "you'd": "you had",
            "you'd've": "you would have",
            # Fixed: these two previously expanded to "you you will ...".
            "you'll": "you will",
            "you'll've": "you will have",
            "you're": "you are",
            "you've": "you have",
        }

    @staticmethod
    def __remove_double_whitespaces(string: str):
        """Collapse every run of whitespace into one space and trim both ends."""
        return " ".join(string.split())

    async def __remove_url(self, string_series: pd.Series):
        """
        Removes URLs from text.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series.str.replace(
            pat=r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})",
            repl=" ", regex=True)
        return clean_string_series.map(self.__remove_double_whitespaces)

    async def __expand(self, string_series: pd.Series):
        """
        Replaces contractions with expansions, e.g. "don't" with "do not".
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series
        # Longest contractions first: otherwise "can't" fires inside
        # "can't've" and leaves "cannot've" behind. sorted() is stable, so
        # equal-length keys keep their dictionary order.
        for contraction in sorted(self.contraction_to_expansion, key=len, reverse=True):
            clean_string_series = clean_string_series.str.replace(
                pat=contraction, repl=self.contraction_to_expansion[contraction], regex=False)
        return clean_string_series.map(self.__remove_double_whitespaces)

    async def __remove_punct(self, string_series: pd.Series):
        """
        Removes punctuation from the input string.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series
        # These are literal two-character sequences (backslash + letter), not
        # control characters. They go first so that "\n" in escaped text
        # disappears as a unit instead of leaving a stray "n" once the
        # backslash itself is stripped as punctuation.
        for escaped in (r'\n', r'\r', r'\t'):
            clean_string_series = clean_string_series.str.replace(pat=escaped, repl=" ", regex=False)
        # One translate pass instead of one full replace pass per character.
        punct_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
        clean_string_series = clean_string_series.str.translate(punct_to_space)
        return clean_string_series.map(self.__remove_double_whitespaces)

    async def __remove_digits(self, string_series: pd.Series):
        """
        Removes digits from the input string.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series.str.replace(pat=r'\d', repl=" ", regex=True)
        return clean_string_series.map(self.__remove_double_whitespaces)

    @staticmethod
    async def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
        """
        Removes words/tokens where minlen <= len <= maxlen.
        :param string_series: pd.Series, input string series
        :param minlen: int, minimum length of token to be removed.
        :param maxlen: int, maximum length of token to be removed.
        :return: pd.Series, cleaned string series
        """
        def keep_long_enough(text: str):
            return " ".join(word for word in text.split() if not (minlen <= len(word) <= maxlen))

        return string_series.map(keep_long_enough)

    async def __remove_stop_words(self, string_series: pd.Series):
        """
        Removes stop words from the input string.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        # Build the lookup once; testing against the list is linear per token.
        stops = set(self.stop_words)

        def str_remove_stop_words(text: str):
            return " ".join(token for token in text.split() if token not in stops)

        return string_series.map(str_remove_stop_words)

    async def __remove_top_bottom_words(self, string_series: pd.Series, top_p: float = None,
                                        bottom_p: float = None, dataset: str = 'train'):
        """
        Removes top_p fraction of (frequent) words and bottom_p fraction of (rare) words.

        With dataset == "train" the words to drop are computed from
        ``string_series`` and appended to ``self.words_to_remove``; with any
        other value the previously collected words are reused unchanged.
        :param string_series: pd.Series, input string series
        :param top_p: float, fraction of frequent words to remove.
        :param bottom_p: float, fraction of rare words to remove.
        :param dataset: str, "train" for training set, "test" for val/dev/test set.
        :return: pd.Series, cleaned string series
        """
        if dataset == 'train':
            if top_p is None:
                top_p = 0
            if bottom_p is None:
                bottom_p = 0

            if top_p > 0 or bottom_p > 0:
                # value_counts() sorts by descending frequency.
                word_freq = pd.Series(" ".join(string_series).split()).value_counts()
                n_words = len(word_freq)

                if top_p > 0:
                    n_top = int(np.ceil(top_p * n_words))
                    self.words_to_remove.extend(word_freq.index[:n_top])

                if bottom_p > 0:
                    n_bottom = int(np.ceil(bottom_p * n_words))
                    # n_bottom is 0 only for an empty vocabulary; index[-0:]
                    # would select everything, so skip that case explicitly.
                    if n_bottom > 0:
                        self.words_to_remove.extend(word_freq.index[-n_bottom:])

        if not self.words_to_remove:
            return string_series

        removable = set(self.words_to_remove)
        return string_series.map(
            lambda text: " ".join(word for word in text.split() if word not in removable))

    async def preprocess(self, string_series: pd.Series, dataset: str = "train"):
        """
        Entry point.
        :param string_series: pd.Series, input string series
        :param dataset: str, "train" for training set, "test" for val/dev/test set.
        :return: pd.Series, cleaned string series
        """
        string_series = string_series.str.lower()
        string_series = string_series.map(unidecode)
        string_series = await self.__remove_url(string_series=string_series)
        string_series = await self.__expand(string_series=string_series)

        if self.remove_punct:
            string_series = await self.__remove_punct(string_series=string_series)
        if self.remove_digits:
            string_series = await self.__remove_digits(string_series=string_series)
        if self.remove_stop_words:
            string_series = await self.__remove_stop_words(string_series=string_series)
        if self.remove_short_words:
            string_series = await self.__remove_short_words(string_series=string_series,
                                                            minlen=self.minlen,
                                                            maxlen=self.maxlen)
        string_series = await self.__remove_top_bottom_words(string_series=string_series,
                                                             top_p=self.top_p,
                                                             bottom_p=self.bottom_p, dataset=dataset)

        string_series = string_series.str.strip()
        # Rows that end up empty are replaced with a fixed placeholder sentence.
        return string_series.replace(to_replace="", value="this is an empty message")
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
async def get_frequent_words_html(df):
    """
    Build the word-cloud HTML snippet from the 25 most frequent tokens.

    Titles and descriptions are joined per row, cleaned with a default
    ``TextPreprocessor`` and counted. The first five tokens sit directly in
    the container; the rest go into the ``word-cloud-section-id`` div.
    :param df: DataFrame with 'title' and 'description' string columns
    :return: str, HTML markup for the word cloud
    """
    text_preprocess = TextPreprocessor()
    # A missing title or description would otherwise turn the whole row into
    # NaN and break the string-only preprocessing steps.
    combined_text = df['title'].fillna('') + ' ' + df['description'].fillna('')
    preprocessed_txt = await text_preprocess.preprocess(combined_text)
    counter = Counter(' '.join(preprocessed_txt).split())

    section_open = '<div class="word-cloud-section" id="word-cloud-section-id">'
    parts = ['<div class="word-cloud-container">']
    section_opened = False
    for rank, (token, _count) in enumerate(counter.most_common(25), start=1):
        # Random-width gap between tokens, as in the original markup.
        spacer = " " * np.random.randint(3, 7, 1)[0]
        parts.append(f'<a class="wc-tokens" onclick=wc_search("{token}")>{token}</a>{spacer}')
        if rank == 5:
            parts.append(section_open)
            section_opened = True

    # With fewer than five tokens the section was never opened; open it here
    # so the two closing tags below stay balanced and the element exists.
    if not section_opened:
        parts.append(section_open)
    parts.append('</div></div>')
    return ''.join(parts)
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import string
|
| 4 |
+
from unidecode import unidecode
|
| 5 |
+
from collections import Counter
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class TextPreprocessor:
|
| 9 |
+
def __init__(self, remove_punct: bool = True, remove_digits: bool = True,
|
| 10 |
+
remove_stop_words: bool = True,
|
| 11 |
+
remove_short_words: bool = False, minlen: int = 1, maxlen: int = 1, top_p: float = None,
|
| 12 |
+
bottom_p: float = None):
|
| 13 |
+
self.remove_punct = remove_punct
|
| 14 |
+
self.remove_digits = remove_digits
|
| 15 |
+
self.remove_stop_words = remove_stop_words
|
| 16 |
+
self.remove_short_words = remove_short_words
|
| 17 |
+
self.minlen = minlen
|
| 18 |
+
self.maxlen = maxlen
|
| 19 |
+
self.top_p = top_p
|
| 20 |
+
self.bottom_p = bottom_p
|
| 21 |
+
self.words_to_remove = []
|
| 22 |
+
self.stop_words = ["'d", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
|
| 23 |
+
'about',
|
| 24 |
+
'above',
|
| 25 |
+
'across',
|
| 26 |
+
'after',
|
| 27 |
+
'afterwards',
|
| 28 |
+
'again',
|
| 29 |
+
'against',
|
| 30 |
+
'ain',
|
| 31 |
+
'all',
|
| 32 |
+
'almost',
|
| 33 |
+
'alone',
|
| 34 |
+
'along',
|
| 35 |
+
'already',
|
| 36 |
+
'also',
|
| 37 |
+
'although',
|
| 38 |
+
'always',
|
| 39 |
+
'am',
|
| 40 |
+
'among',
|
| 41 |
+
'amongst',
|
| 42 |
+
'amount',
|
| 43 |
+
'an',
|
| 44 |
+
'and',
|
| 45 |
+
'another',
|
| 46 |
+
'any',
|
| 47 |
+
'anyhow',
|
| 48 |
+
'anyone',
|
| 49 |
+
'anything',
|
| 50 |
+
'anyway',
|
| 51 |
+
'anywhere',
|
| 52 |
+
'are',
|
| 53 |
+
'around',
|
| 54 |
+
'as',
|
| 55 |
+
'at',
|
| 56 |
+
'back',
|
| 57 |
+
'be',
|
| 58 |
+
'became',
|
| 59 |
+
'because',
|
| 60 |
+
'become',
|
| 61 |
+
'becomes',
|
| 62 |
+
'becoming',
|
| 63 |
+
'been',
|
| 64 |
+
'before',
|
| 65 |
+
'beforehand',
|
| 66 |
+
'behind',
|
| 67 |
+
'being',
|
| 68 |
+
'below',
|
| 69 |
+
'beside',
|
| 70 |
+
'besides',
|
| 71 |
+
'between',
|
| 72 |
+
'beyond',
|
| 73 |
+
'both',
|
| 74 |
+
'bottom',
|
| 75 |
+
'but',
|
| 76 |
+
'by',
|
| 77 |
+
'ca',
|
| 78 |
+
'call',
|
| 79 |
+
'can',
|
| 80 |
+
'cannot',
|
| 81 |
+
'could',
|
| 82 |
+
'couldn',
|
| 83 |
+
"couldn't",
|
| 84 |
+
'd',
|
| 85 |
+
'did',
|
| 86 |
+
'do',
|
| 87 |
+
'does',
|
| 88 |
+
'doing',
|
| 89 |
+
'done',
|
| 90 |
+
'down',
|
| 91 |
+
'due',
|
| 92 |
+
'during',
|
| 93 |
+
'each',
|
| 94 |
+
'eight',
|
| 95 |
+
'either',
|
| 96 |
+
'eleven',
|
| 97 |
+
'else',
|
| 98 |
+
'elsewhere',
|
| 99 |
+
'empty',
|
| 100 |
+
'enough',
|
| 101 |
+
'even',
|
| 102 |
+
'ever',
|
| 103 |
+
'every',
|
| 104 |
+
'everyone',
|
| 105 |
+
'everything',
|
| 106 |
+
'everywhere',
|
| 107 |
+
'except',
|
| 108 |
+
'few',
|
| 109 |
+
'fifteen',
|
| 110 |
+
'fifty',
|
| 111 |
+
'first',
|
| 112 |
+
'five',
|
| 113 |
+
'for',
|
| 114 |
+
'former',
|
| 115 |
+
'formerly',
|
| 116 |
+
'forty',
|
| 117 |
+
'four',
|
| 118 |
+
'from',
|
| 119 |
+
'front',
|
| 120 |
+
'full',
|
| 121 |
+
'further',
|
| 122 |
+
'get',
|
| 123 |
+
'give',
|
| 124 |
+
'go',
|
| 125 |
+
'had',
|
| 126 |
+
'has',
|
| 127 |
+
'have',
|
| 128 |
+
'having',
|
| 129 |
+
'he',
|
| 130 |
+
'hence',
|
| 131 |
+
'her',
|
| 132 |
+
'here',
|
| 133 |
+
'hereafter',
|
| 134 |
+
'hereby',
|
| 135 |
+
'herein',
|
| 136 |
+
'hereupon',
|
| 137 |
+
'hers',
|
| 138 |
+
'herself',
|
| 139 |
+
'him',
|
| 140 |
+
'himself',
|
| 141 |
+
'his',
|
| 142 |
+
'how',
|
| 143 |
+
'however',
|
| 144 |
+
'hundred',
|
| 145 |
+
'i',
|
| 146 |
+
'if',
|
| 147 |
+
'in',
|
| 148 |
+
'indeed',
|
| 149 |
+
'into',
|
| 150 |
+
'is',
|
| 151 |
+
'it',
|
| 152 |
+
"it's",
|
| 153 |
+
'its',
|
| 154 |
+
'itself',
|
| 155 |
+
'just',
|
| 156 |
+
'keep',
|
| 157 |
+
'last',
|
| 158 |
+
'latter',
|
| 159 |
+
'latterly',
|
| 160 |
+
'least',
|
| 161 |
+
'less',
|
| 162 |
+
'll',
|
| 163 |
+
'm',
|
| 164 |
+
'ma',
|
| 165 |
+
'made',
|
| 166 |
+
'make',
|
| 167 |
+
'many',
|
| 168 |
+
'say',
|
| 169 |
+
'said',
|
| 170 |
+
'says',
|
| 171 |
+
'told',
|
| 172 |
+
'tell',
|
| 173 |
+
'may',
|
| 174 |
+
'me',
|
| 175 |
+
'meanwhile',
|
| 176 |
+
'might',
|
| 177 |
+
'mine',
|
| 178 |
+
'more',
|
| 179 |
+
'moreover',
|
| 180 |
+
'most',
|
| 181 |
+
'mostly',
|
| 182 |
+
'move',
|
| 183 |
+
'much',
|
| 184 |
+
'must',
|
| 185 |
+
'my',
|
| 186 |
+
'myself',
|
| 187 |
+
'name',
|
| 188 |
+
'namely',
|
| 189 |
+
'neither',
|
| 190 |
+
'never',
|
| 191 |
+
'nevertheless',
|
| 192 |
+
'next',
|
| 193 |
+
'nine',
|
| 194 |
+
'no',
|
| 195 |
+
'nobody',
|
| 196 |
+
'none',
|
| 197 |
+
'noone',
|
| 198 |
+
'nor',
|
| 199 |
+
'not',
|
| 200 |
+
'nothing',
|
| 201 |
+
'now',
|
| 202 |
+
'nowhere',
|
| 203 |
+
'o',
|
| 204 |
+
'of',
|
| 205 |
+
'off',
|
| 206 |
+
'often',
|
| 207 |
+
'on',
|
| 208 |
+
'once',
|
| 209 |
+
'one',
|
| 210 |
+
'only',
|
| 211 |
+
'onto',
|
| 212 |
+
'or',
|
| 213 |
+
'other',
|
| 214 |
+
'others',
|
| 215 |
+
'otherwise',
|
| 216 |
+
'our',
|
| 217 |
+
'ours',
|
| 218 |
+
'ourselves',
|
| 219 |
+
'out',
|
| 220 |
+
'over',
|
| 221 |
+
'own',
|
| 222 |
+
'part',
|
| 223 |
+
'per',
|
| 224 |
+
'perhaps',
|
| 225 |
+
'please',
|
| 226 |
+
'put',
|
| 227 |
+
'quite',
|
| 228 |
+
'rather',
|
| 229 |
+
're',
|
| 230 |
+
'rs',
|
| 231 |
+
'really',
|
| 232 |
+
'regarding',
|
| 233 |
+
's',
|
| 234 |
+
'same',
|
| 235 |
+
'say',
|
| 236 |
+
'see',
|
| 237 |
+
'seem',
|
| 238 |
+
'seemed',
|
| 239 |
+
'seeming',
|
| 240 |
+
'seems',
|
| 241 |
+
'serious',
|
| 242 |
+
'several',
|
| 243 |
+
'shan',
|
| 244 |
+
"shan't",
|
| 245 |
+
'she',
|
| 246 |
+
"she's",
|
| 247 |
+
'should',
|
| 248 |
+
"should've",
|
| 249 |
+
'shouldn',
|
| 250 |
+
"shouldn't",
|
| 251 |
+
'show',
|
| 252 |
+
'side',
|
| 253 |
+
'since',
|
| 254 |
+
'six',
|
| 255 |
+
'sixty',
|
| 256 |
+
'so',
|
| 257 |
+
'some',
|
| 258 |
+
'somehow',
|
| 259 |
+
'someone',
|
| 260 |
+
'something',
|
| 261 |
+
'sometime',
|
| 262 |
+
'sometimes',
|
| 263 |
+
'somewhere',
|
| 264 |
+
'still',
|
| 265 |
+
'such',
|
| 266 |
+
't',
|
| 267 |
+
'take',
|
| 268 |
+
'ten',
|
| 269 |
+
'than',
|
| 270 |
+
'that',
|
| 271 |
+
"that'll",
|
| 272 |
+
'the',
|
| 273 |
+
'their',
|
| 274 |
+
'theirs',
|
| 275 |
+
'them',
|
| 276 |
+
'themselves',
|
| 277 |
+
'then',
|
| 278 |
+
'thence',
|
| 279 |
+
'there',
|
| 280 |
+
'thereafter',
|
| 281 |
+
'thereby',
|
| 282 |
+
'therefore',
|
| 283 |
+
'therein',
|
| 284 |
+
'thereupon',
|
| 285 |
+
'these',
|
| 286 |
+
'they',
|
| 287 |
+
'third',
|
| 288 |
+
'this',
|
| 289 |
+
'those',
|
| 290 |
+
'though',
|
| 291 |
+
'three',
|
| 292 |
+
'through',
|
| 293 |
+
'throughout',
|
| 294 |
+
'thru',
|
| 295 |
+
'thus',
|
| 296 |
+
'to',
|
| 297 |
+
'together',
|
| 298 |
+
'too',
|
| 299 |
+
'top',
|
| 300 |
+
'toward',
|
| 301 |
+
'towards',
|
| 302 |
+
'twelve',
|
| 303 |
+
'twenty',
|
| 304 |
+
'two',
|
| 305 |
+
'under',
|
| 306 |
+
'unless',
|
| 307 |
+
'until',
|
| 308 |
+
'up',
|
| 309 |
+
'upon',
|
| 310 |
+
'us',
|
| 311 |
+
'used',
|
| 312 |
+
'using',
|
| 313 |
+
'various',
|
| 314 |
+
've',
|
| 315 |
+
'very',
|
| 316 |
+
'via',
|
| 317 |
+
'was',
|
| 318 |
+
'wasn',
|
| 319 |
+
"wasn't",
|
| 320 |
+
'we',
|
| 321 |
+
'well',
|
| 322 |
+
'were',
|
| 323 |
+
'weren',
|
| 324 |
+
"weren't",
|
| 325 |
+
'what',
|
| 326 |
+
'whatever',
|
| 327 |
+
'when',
|
| 328 |
+
'whence',
|
| 329 |
+
'whenever',
|
| 330 |
+
'where',
|
| 331 |
+
'whereafter',
|
| 332 |
+
'whereas',
|
| 333 |
+
'whereby',
|
| 334 |
+
'wherein',
|
| 335 |
+
'whereupon',
|
| 336 |
+
'wherever',
|
| 337 |
+
'whether',
|
| 338 |
+
'which',
|
| 339 |
+
'while',
|
| 340 |
+
'whither',
|
| 341 |
+
'who',
|
| 342 |
+
'whoever',
|
| 343 |
+
'whole',
|
| 344 |
+
'whom',
|
| 345 |
+
'whose',
|
| 346 |
+
'why',
|
| 347 |
+
'will',
|
| 348 |
+
'with',
|
| 349 |
+
'within',
|
| 350 |
+
'without',
|
| 351 |
+
'won',
|
| 352 |
+
"won't",
|
| 353 |
+
'would',
|
| 354 |
+
'wouldn',
|
| 355 |
+
"wouldn't",
|
| 356 |
+
'y',
|
| 357 |
+
'yet',
|
| 358 |
+
'you',
|
| 359 |
+
"you'd",
|
| 360 |
+
"you'll",
|
| 361 |
+
"you're",
|
| 362 |
+
"you've",
|
| 363 |
+
'your',
|
| 364 |
+
'yours',
|
| 365 |
+
'yourself',
|
| 366 |
+
'yourselves',
|
| 367 |
+
'‘d',
|
| 368 |
+
'‘ll',
|
| 369 |
+
'‘m',
|
| 370 |
+
'‘re',
|
| 371 |
+
'‘s',
|
| 372 |
+
'‘ve',
|
| 373 |
+
'’d',
|
| 374 |
+
'’ll',
|
| 375 |
+
'’m',
|
| 376 |
+
'’re',
|
| 377 |
+
'new',
|
| 378 |
+
'old',
|
| 379 |
+
'’s',
|
| 380 |
+
'’ve']
|
| 381 |
+
|
| 382 |
+
self.contraction_to_expansion = {"ain't": "am not",
|
| 383 |
+
"aren't": "are not",
|
| 384 |
+
"can't": "cannot",
|
| 385 |
+
"can't've": "cannot have",
|
| 386 |
+
"'cause": "because",
|
| 387 |
+
"could've": "could have",
|
| 388 |
+
"couldn't": "could not",
|
| 389 |
+
"couldn't've": "could not have",
|
| 390 |
+
"didn't": "did not",
|
| 391 |
+
"doesn't": "does not",
|
| 392 |
+
"don't": "do not",
|
| 393 |
+
"hadn't": "had not",
|
| 394 |
+
"hadn't've": "had not have",
|
| 395 |
+
"hasn't": "has not",
|
| 396 |
+
"haven't": "have not",
|
| 397 |
+
"he'd": "he would",
|
| 398 |
+
"he'd've": "he would have",
|
| 399 |
+
"he'll": "he will",
|
| 400 |
+
"he'll've": "he will have",
|
| 401 |
+
"he's": "he is",
|
| 402 |
+
"how'd": "how did",
|
| 403 |
+
"how'd'y": "how do you",
|
| 404 |
+
"how'll": "how will",
|
| 405 |
+
"how's": "how is",
|
| 406 |
+
"i'd": "i would",
|
| 407 |
+
"i'd've": "i would have",
|
| 408 |
+
"i'll": "i will",
|
| 409 |
+
"i'll've": "i will have",
|
| 410 |
+
"i'm": "i am",
|
| 411 |
+
"i've": "i have",
|
| 412 |
+
"isn't": "is not",
|
| 413 |
+
"it'd": "it had",
|
| 414 |
+
"it'd've": "it would have",
|
| 415 |
+
"it'll": "it will",
|
| 416 |
+
"it'll've": "it will have",
|
| 417 |
+
"it's": "it is",
|
| 418 |
+
"let's": "let us",
|
| 419 |
+
"ma'am": "madam",
|
| 420 |
+
"mayn't": "may not",
|
| 421 |
+
"might've": "might have",
|
| 422 |
+
"mightn't": "might not",
|
| 423 |
+
"mightn't've": "might not have",
|
| 424 |
+
"must've": "must have",
|
| 425 |
+
"mustn't": "must not",
|
| 426 |
+
"mustn't've": "must not have",
|
| 427 |
+
"needn't": "need not",
|
| 428 |
+
"needn't've": "need not have",
|
| 429 |
+
"o'clock": "of the clock",
|
| 430 |
+
"oughtn't": "ought not",
|
| 431 |
+
"oughtn't've": "ought not have",
|
| 432 |
+
"shan't": "shall not",
|
| 433 |
+
"sha'n't": "shall not",
|
| 434 |
+
"shan't've": "shall not have",
|
| 435 |
+
"she'd": "she would",
|
| 436 |
+
"she'd've": "she would have",
|
| 437 |
+
"she'll": "she will",
|
| 438 |
+
"she'll've": "she will have",
|
| 439 |
+
"she's": "she is",
|
| 440 |
+
"should've": "should have",
|
| 441 |
+
"shouldn't": "should not",
|
| 442 |
+
"shouldn't've": "should not have",
|
| 443 |
+
"so've": "so have",
|
| 444 |
+
"so's": "so is",
|
| 445 |
+
"that'd": "that would",
|
| 446 |
+
"that'd've": "that would have",
|
| 447 |
+
"that's": "that is",
|
| 448 |
+
"there'd": "there had",
|
| 449 |
+
"there'd've": "there would have",
|
| 450 |
+
"there's": "there is",
|
| 451 |
+
"they'd": "they would",
|
| 452 |
+
"they'd've": "they would have",
|
| 453 |
+
"they'll": "they will",
|
| 454 |
+
"they'll've": "they will have",
|
| 455 |
+
"they're": "they are",
|
| 456 |
+
"they've": "they have",
|
| 457 |
+
"to've": "to have",
|
| 458 |
+
"wasn't": "was not",
|
| 459 |
+
"we'd": "we had",
|
| 460 |
+
"we'd've": "we would have",
|
| 461 |
+
"we'll": "we will",
|
| 462 |
+
"we'll've": "we will have",
|
| 463 |
+
"we're": "we are",
|
| 464 |
+
"we've": "we have",
|
| 465 |
+
"weren't": "were not",
|
| 466 |
+
"what'll": "what will",
|
| 467 |
+
"what'll've": "what will have",
|
| 468 |
+
"what're": "what are",
|
| 469 |
+
"what's": "what is",
|
| 470 |
+
"what've": "what have",
|
| 471 |
+
"when's": "when is",
|
| 472 |
+
"when've": "when have",
|
| 473 |
+
"where'd": "where did",
|
| 474 |
+
"where's": "where is",
|
| 475 |
+
"where've": "where have",
|
| 476 |
+
"who'll": "who will",
|
| 477 |
+
"who'll've": "who will have",
|
| 478 |
+
"who's": "who is",
|
| 479 |
+
"who've": "who have",
|
| 480 |
+
"why's": "why is",
|
| 481 |
+
"why've": "why have",
|
| 482 |
+
"will've": "will have",
|
| 483 |
+
"won't": "will not",
|
| 484 |
+
"won't've": "will not have",
|
| 485 |
+
"would've": "would have",
|
| 486 |
+
"wouldn't": "would not",
|
| 487 |
+
"wouldn't've": "would not have",
|
| 488 |
+
"y'all": "you all",
|
| 489 |
+
"y'alls": "you alls",
|
| 490 |
+
"y'all'd": "you all would",
|
| 491 |
+
"y'all'd've": "you all would have",
|
| 492 |
+
"y'all're": "you all are",
|
| 493 |
+
"y'all've": "you all have",
|
| 494 |
+
"you'd": "you had",
|
| 495 |
+
"you'd've": "you would have",
|
| 496 |
+
"you'll": "you you will",
|
| 497 |
+
"you'll've": "you you will have",
|
| 498 |
+
"you're": "you are",
|
| 499 |
+
"you've": "you have"
|
| 500 |
+
}
|
| 501 |
+
|
| 502 |
+
@staticmethod
def __remove_double_whitespaces(string: str):
    """
    Collapses every run of whitespace into one space and trims both ends.
    :param string: str, input text
    :return: str, the whitespace-separated tokens of the input joined by single spaces
    """
    tokens = string.split()
    return " ".join(tokens)
|
| 505 |
+
|
| 506 |
+
async def __remove_url(self, string_series: pd.Series):
    """
    Removes URLs from text.
    Every match of the URL pattern (http/https links and bare "www." hosts)
    is replaced by a single space, then whitespace is normalised.
    :param string_series: pd.Series, input string series
    :return: pd.Series, cleaned string series
    """
    url_pattern = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
    without_urls = string_series.str.replace(pat=url_pattern, repl=" ", regex=True)
    # The replacement leaves stray spaces behind; collapse them per string.
    return without_urls.map(self.__remove_double_whitespaces)
|
| 516 |
+
|
| 517 |
+
async def __expand(self, string_series: pd.Series):
    """
    Replaces contractions with expansions, e.g. "don't" with "do not".

    All contractions are matched in a single pass with the longest
    alternatives tried first. Replacing them one at a time in dictionary
    order breaks compound forms: "there'd" would be rewritten before
    "there'd've" is tried, leaving "there had've".
    :param string_series: pd.Series, input string series
    :return: pd.Series, cleaned string series
    """
    import re  # local import keeps this block self-contained

    expansions = self.contraction_to_expansion
    if not expansions:
        # An empty alternation would match the empty string everywhere.
        return string_series.map(self.__remove_double_whitespaces)

    # Regex alternation takes the first branch that matches at a position,
    # so ordering by length makes the longest contraction win.
    longest_first = sorted(expansions, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(c) for c in longest_first))

    clean_string_series = string_series.str.replace(
        pat=pattern,
        repl=lambda match: expansions[match.group(0)],
        regex=True)
    return clean_string_series.map(self.__remove_double_whitespaces)
|
| 527 |
+
|
| 528 |
+
async def __remove_punct(self, string_series: pd.Series):
    """
    Removes punctuations from the input string.

    Two kinds of text are replaced by a space:
    * the literal two-character sequences backslash-n, backslash-r and
      backslash-t (raw strings matched with regex=False, i.e. escaped
      text, not real control characters);
    * every character in ``string.punctuation``.
    Whitespace is normalised afterwards.
    :param string_series: pd.Series, input string series
    :return: pd.Series, cleaned string series
    """
    clean_string_series = string_series
    # Multi-character sequences must go first: the backslash itself is part
    # of string.punctuation and would otherwise be consumed on its own.
    for escaped in (r'\n', r'\r', r'\t'):
        clean_string_series = clean_string_series.str.replace(pat=escaped, repl=" ", regex=False)

    # One translate pass replaces all punctuation characters at once instead
    # of scanning the whole series once per character.
    punct_to_space = str.maketrans(dict.fromkeys(string.punctuation, " "))
    clean_string_series = clean_string_series.str.translate(punct_to_space)
    return clean_string_series.map(self.__remove_double_whitespaces)
|
| 540 |
+
|
| 541 |
+
async def __remove_digits(self, string_series: pd.Series):
    """
    Removes digits from the input string.
    Each digit matched by the regex ``\\d`` becomes a space, after which
    whitespace is normalised.
    :param string_series: pd.Series, input string series
    :return: pd.Series, cleaned string series
    """
    digit_free = string_series.str.replace(pat=r'\d', repl=" ", regex=True)
    return digit_free.map(self.__remove_double_whitespaces)
|
| 549 |
+
|
| 550 |
+
@staticmethod
async def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
    """
    Removes words/tokens where minlen <= len <= maxlen.
    :param string_series: pd.Series, input string series
    :param minlen: int, minimum length of token to be removed.
    :param maxlen: int, maximum length of token to be removed.
    :return: pd.Series, cleaned string series
    """
    def drop_short(text):
        kept = []
        for token in text.split():
            # Tokens whose length falls inside [minlen, maxlen] are discarded.
            if minlen <= len(token) <= maxlen:
                continue
            kept.append(token)
        return " ".join(kept)

    return string_series.map(drop_short)
|
| 562 |
+
|
| 563 |
+
async def __remove_stop_words(self, string_series: pd.Series):
    """
    Removes stop words from the input string.
    A token is dropped when it is contained in ``self.stop_words``; the
    remaining tokens are re-joined with single spaces.
    :param string_series: pd.Series, input string series
    :return: pd.Series, cleaned string series
    """
    def strip_stops(text: str):
        banned = self.stop_words
        survivors = filter(lambda token: token not in banned, text.split())
        return " ".join(survivors)

    return string_series.map(strip_stops)
|
| 574 |
+
|
| 575 |
+
async def __remove_top_bottom_words(self, string_series: pd.Series, top_p: float = None,
                                    bottom_p: float = None, dataset: str = 'train'):
    """
    Removes the most frequent and the rarest words.

    With ``dataset == 'train'`` the vocabulary of ``string_series`` is ranked
    by frequency, and the first ``ceil(top_p * n_words)`` and the last
    ``ceil(bottom_p * n_words)`` distinct words are appended to
    ``self.words_to_remove``. For any other ``dataset`` value nothing is
    learned and the words already stored in ``self.words_to_remove`` are used.
    :param string_series: pd.Series, input string series
    :param top_p: float, fraction of distinct words (most frequent first) to remove; None means 0.
    :param bottom_p: float, fraction of distinct words (rarest) to remove; None means 0.
    :param dataset: str, "train" for training set, any other value (e.g. "test") for val/dev/test set.
    :return: pd.Series, cleaned string series
    """
    if dataset == 'train':
        if top_p is None:
            top_p = 0
        if bottom_p is None:
            bottom_p = 0

        if top_p > 0 or bottom_p > 0:
            word_freq = pd.Series(" ".join(string_series).split()).value_counts()
            n_words = len(word_freq)

            if top_p > 0:
                n_top = int(np.ceil(top_p * n_words))
                self.words_to_remove.extend(word_freq.index[:n_top])

            if bottom_p > 0:
                n_bottom = int(np.ceil(bottom_p * n_words))
                # An explicit start index avoids the index[-0:] pitfall
                # (which selects everything) when n_bottom is 0.
                self.words_to_remove.extend(word_freq.index[max(n_words - n_bottom, 0):])

    if len(self.words_to_remove) == 0:
        return string_series

    # Build the lookup once: testing every token against the list would be
    # linear in the number of stored words for each token.
    banned = frozenset(self.words_to_remove)
    return string_series.map(
        lambda text: " ".join([word for word in text.split() if word not in banned]))
|
| 607 |
+
|
| 608 |
+
async def preprocess(self, string_series: pd.Series, dataset: str = "train"):
    """
    Entry point.

    Runs the cleaning pipeline in a fixed order: lower-casing, ``unidecode``
    transliteration, URL removal, contraction expansion, then the optional
    steps enabled on the instance (punctuation, digits, stop words, short
    words), and finally frequent/rare word removal. Strings that end up
    empty are replaced by the placeholder "this is an empty message".
    :param string_series: pd.Series, input string series; missing values are treated as empty strings
    :param dataset: str, "train" for training set, any other value (e.g. "test") for val/dev/test set.
    :return: pd.Series, cleaned string series
    """
    # Missing entries (None/NaN) are not strings: the str.split-based helpers
    # below would raise on them, so treat them as empty text up front.
    string_series = string_series.fillna("").str.lower()
    string_series = string_series.map(unidecode)
    string_series = await self.__remove_url(string_series=string_series)
    string_series = await self.__expand(string_series=string_series)

    if self.remove_punct:
        string_series = await self.__remove_punct(string_series=string_series)
    if self.remove_digits:
        string_series = await self.__remove_digits(string_series=string_series)
    if self.remove_stop_words:
        string_series = await self.__remove_stop_words(string_series=string_series)
    if self.remove_short_words:
        string_series = await self.__remove_short_words(string_series=string_series,
                                                        minlen=self.minlen,
                                                        maxlen=self.maxlen)
    string_series = await self.__remove_top_bottom_words(string_series=string_series,
                                                         top_p=self.top_p,
                                                         bottom_p=self.bottom_p, dataset=dataset)

    string_series = string_series.str.strip()
    # Assign the result instead of mutating in place.
    string_series = string_series.replace(to_replace="", value="this is an empty message")

    return string_series
|
| 638 |
+
|
| 639 |
+
|
| 640 |
+
async def get_frequent_words_html(df):
    """
    Builds the word-cloud HTML fragment for the 25 most frequent tokens.

    Title and description of every row are concatenated, preprocessed with a
    fresh ``TextPreprocessor`` and counted. Each token becomes an
    ``<a class="wc-tokens">`` element calling ``wc_search``. After the fifth
    token a nested ``word-cloud-section`` div is opened that holds the
    remaining tokens.
    :param df: object indexable by 'title' and 'description' (presumably a pd.DataFrame - confirm against callers)
    :return: str, HTML fragment with balanced div tags
    """
    text_preprocess = TextPreprocessor()
    # A missing title or description would otherwise make the whole
    # concatenated row NaN and discard the field that is present.
    combined_text = df['title'].fillna('') + ' ' + df['description'].fillna('')
    preprocessed_txt = await text_preprocess.preprocess(combined_text)
    counter = Counter(' '.join(preprocessed_txt).split())

    # NOTE(review): tokens are interpolated into the markup unescaped; this is
    # only safe if preprocessing strips quotes and angle brackets - confirm.
    # NOTE(review): the gap is made of plain spaces, which HTML rendering
    # collapses - confirm this is the intended separator.
    html_parts = ['<div class="word-cloud-container">']
    section_opened = False
    for position, (token, _count) in enumerate(counter.most_common(25), start=1):
        gap = " " * np.random.randint(3, 7, 1)[0]
        html_parts.append(f'<a class="wc-tokens" onclick=wc_search("{token}")>{token}</a>{gap}')
        if position == 5:
            html_parts.append('<div class="word-cloud-section" id="word-cloud-section-id">')
            section_opened = True

    # Close the nested section only when it was actually opened; with fewer
    # than five tokens an unconditional second </div> would be unbalanced.
    if section_opened:
        html_parts.append('</div>')
    html_parts.append('</div>')
    return ''.join(html_parts)
|