	| """ | |
| # MANIFESTO ANALYSIS | |
| """ | |
| ##IMPORTING LIBRARIES | |
| import random | |
| import matplotlib.pyplot as plt | |
| import nltk | |
| from nltk.tokenize import word_tokenize,sent_tokenize | |
| from nltk.corpus import stopwords | |
| from nltk.stem.porter import PorterStemmer | |
| from nltk.stem import WordNetLemmatizer | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from nltk.probability import FreqDist | |
| from cleantext import clean | |
| import textract | |
| import urllib.request | |
| import nltk.corpus | |
| from nltk.text import Text | |
| import io | |
| from io import StringIO,BytesIO | |
| import sys | |
| import pandas as pd | |
| import cv2 | |
| import re | |
| from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator | |
| from textblob import TextBlob | |
| from PIL import Image | |
| import os | |
| import gradio as gr | |
| from zipfile import ZipFile | |
| import contractions | |
| import unidecode | |
| nltk.download('stopwords') | |
| nltk.download('punkt') | |
| nltk.download('wordnet') | |
| nltk.download('words') | |
| """## PARSING FILES""" | |
| #def Parsing(parsed_text): | |
| #parsed_text=parsed_text.name | |
| #raw_party =parser.from_file(parsed_text) | |
| # raw_party = raw_party['content'],cache_examples=True | |
| # return clean(raw_party) | |
| def Parsing(parsed_text): | |
| parsed_text=parsed_text.name | |
| raw_party =textract.process(parsed_text, encoding='ascii',method='pdfminer') | |
| return clean(raw_party) | |
| #Added more stopwords to avoid irrelevant terms | |
| stop_words = set(stopwords.words('english')) | |
| stop_words.update('ask','much','thank','etc.', 'e', 'We', 'In', 'ed','pa', 'This','also', 'A', 'fu','To','5','ing', 'er', '2') | |
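# Illustrative usage sketch (the example PDF is one of the files shipped with this Space;
# any object with a .name attribute pointing at a PDF works):
#   raw = Parsing(open('Example/Bjp_Manifesto_2019.pdf', 'rb'))
#   print(raw[:300])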
| """## PREPROCESSING""" | |
| def clean_text(text): | |
| ''' | |
| The function which returns clean text | |
| ''' | |
| text = text.encode("ascii", errors="ignore").decode("ascii") # remove non-asciicharacters | |
| text=unidecode.unidecode(text)# diacritics remove | |
| text=contractions.fix(text) # contraction fix | |
| text = re.sub(r"\n", " ", text) | |
| text = re.sub(r"\n\n", " ", text) | |
| text = re.sub(r"\t", " ", text) | |
| text = re.sub(r"/ ", " ", text) | |
| text = text.strip(" ") | |
| text = re.sub(" +", " ", text).strip() # get rid of multiple spaces and replace with a single | |
| text = [word for word in text.split() if word not in stop_words] | |
| text = ' '.join(text) | |
| return text | |
| # text_Party=clean_text(raw_party) | |
| def Preprocess(textParty): | |
| ''' | |
| Removing special characters extra spaces | |
| ''' | |
| text1Party = re.sub('[^A-Za-z0-9]+', ' ', textParty) | |
| #Removing all stop words | |
| pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*') | |
| text2Party = pattern.sub('', text1Party) | |
| # fdist_cong = FreqDist(word_tokens_cong) | |
| return text2Party | |
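# Illustrative sketch of the two cleanup steps (outputs are approximate and
# depend on the exact stop-word lists in use):
#   clean_text("We're committed\nto clean   energy")  -> 'committed clean energy'
#   Preprocess('clean energy for all!')               -> 'clean energy '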
'''
Using concordance, you can see each occurrence of a word along with its
immediate context. It gives a peek into how a word is used at the
sentence level and which words appear around it.
'''

def conc(text_Party, strng):
    '''
    Captures the output of nltk's Text.concordance(), which prints to stdout.
    '''
    word_tokens_party = word_tokenize(text_Party)
    moby = Text(word_tokens_party)
    save_stdout = sys.stdout
    result = StringIO()
    sys.stdout = result            # redirect stdout to capture the printed concordance
    moby.concordance(strng, lines=4, width=82)
    sys.stdout = save_stdout
    return result.getvalue()
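# Illustrative usage (the search word is hypothetical):
#   conc(text_Party, 'education')  # returns up to 4 concordance lines for 'education'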
def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10, numLins=4):
    """
    Returns the phrases in tar_passage that contain target_word, as a workaround
    for nltk's concordance function printing to stdout.
    str target_word, str tar_passage, int left_margin, int right_margin --> str
    left_margin and right_margin set how many tokens (words/punctuation) are kept
    before and after the target word; the left margin is clamped at the start of the text.
    """
    ## Tokenize the passage and wrap it in an nltk Text
    tokens = nltk.word_tokenize(tar_passage)
    text = nltk.Text(tokens)
    ## Collect every offset position of the target word (case-insensitive)
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    ## For each occurrence, slice out the surrounding tokens, clamping the start index at 0
    concordance_txt = [text.tokens[max(offset - left_margin, 0):offset + right_margin]
                       for offset in c.offsets(target_word)]
    ## Join the tokens of each phrase back into a string and keep only a few lines
    result = [' '.join(con_sub) for con_sub in concordance_txt]
    result = result[:numLins + 1]
    return '\n\n'.join(result)
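# Illustrative usage (the search word is hypothetical):
#   get_all_phases_containing_tar_wrd('employment', text_Party)
#   # -> a few newline-separated phrases, each showing 'employment' in context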
def normalize(d, target=1.0):
    '''
    Rescales the values of dict d so that they sum to target.
    '''
    raw = sum(d.values())
    factor = target / raw
    return {key: value * factor for key, value in d.items()}

def fDistance(text2Party):
    '''
    Returns the 10 most frequent words with normalized weights.
    '''
    word_tokens_party = word_tokenize(text2Party)  # tokenizing
    fdistance = FreqDist(word_tokens_party).most_common(10)
    mem = {word: count for word, count in fdistance}
    return normalize(mem)
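# Example (values follow directly from normalize's definition):
#   normalize({'jobs': 3, 'water': 1})  -> {'jobs': 0.75, 'water': 0.25}
# fDistance therefore returns word weights summing to 1, the dict format that
# gr.Label displays as label/confidence pairs.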
def fDistancePlot(text2Party, plotN=15):
    '''
    Most frequent words visualization.
    '''
    word_tokens_party = word_tokenize(text2Party)  # tokenizing
    fdistance = FreqDist(word_tokens_party)
    plt.figure(figsize=(4, 3))
    plt.title('Frequency Distribution')
    fdistance.plot(plotN)
    plt.tight_layout()
    buf = BytesIO()
    plt.savefig(buf)
    buf.seek(0)
    img1 = Image.open(buf)
    plt.clf()
    return img1
def DispersionPlot(textParty):
    '''
    Dispersion plot of the five most frequent words across the document.
    '''
    word_tokens_party = word_tokenize(textParty)  # tokenizing
    moby = Text(word_tokens_party)
    fdistance = FreqDist(word_tokens_party)
    word_Lst = [word for word, _ in fdistance.most_common(5)]
    plt.figure(figsize=(4, 3))
    plt.title('Dispersion Plot')
    moby.dispersion_plot(word_Lst)
    plt.tight_layout()
    buf = BytesIO()
    plt.savefig(buf)
    buf.seek(0)
    img = Image.open(buf)
    plt.clf()
    return img
def getSubjectivity(text):
    '''
    Returns the subjectivity score of the text using TextBlob.
    '''
    return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
    '''
    Returns the polarity score of the text using TextBlob.
    '''
    return TextBlob(text).sentiment.polarity

def getAnalysis(score):
    '''
    Buckets a score into Negative / Neutral / Positive.
    '''
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'
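# Illustrative examples of the TextBlob scores and the bucketing above
# (polarity values are approximate):
#   getPolarity('This is a great initiative')  # ~0.8 -> 'Positive'
#   getPolarity('The committee met in March')  #  0.0 -> 'Neutral'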
def Original_Image(path):
    '''
    Reads the mask image and converts it from BGR to RGB.
    '''
    img = cv2.imread(path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img

def Image_Processed(path):
    '''
    Reads the mask image and thresholds it to black and white.
    '''
    img = cv2.imread(path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # Thresholding
    ret, bw_img = cv2.threshold(img, 124, 255, cv2.THRESH_BINARY)
    return bw_img
def word_cloud(orgIm, mask_img, text_Party_pr, maxWord=2000, colorGener=True,
               contCol='white', bckColor='white'):
    '''
    Generates a word cloud shaped by mask_img and optionally recolored from orgIm.
    '''
    mask = mask_img
    # Create and generate a word cloud image
    wordcloud = WordCloud(max_words=maxWord, background_color=bckColor,
                          mask=mask,
                          colormap='nipy_spectral_r',
                          contour_color=contCol,
                          width=800, height=800,
                          margin=2,
                          contour_width=3).generate(text_Party_pr)
    plt.axis("off")
    if colorGener:
        # Recolor the cloud using the colors of the original (unthresholded) image
        image_colors = ImageColorGenerator(orgIm)
        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
    else:
        plt.imshow(wordcloud)
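# Illustrative usage (the mask file is one of the images shipped with this Space):
#   org = Original_Image('bjpImg2.jpeg')
#   bw = Image_Processed('bjpImg2.jpeg')
#   word_cloud(org, bw, text_Party, maxWord=3000, colorGener=True)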
def word_cloud_generator(parsed_text_name, text_Party):
    '''
    Chooses a party-specific mask image based on the uploaded file name and
    returns the rendered word cloud as a PIL image.
    '''
    parsed = parsed_text_name.lower()
    if 'bjp' in parsed:
        orgImg = Original_Image('bjpImg2.jpeg')
        bwImg = Image_Processed('bjpImg2.jpeg')
        plt.figure(figsize=(6, 5))
        word_cloud(orgImg, bwImg, text_Party, maxWord=3000, colorGener=True,
                   contCol='white', bckColor='black')
        plt.tight_layout()
        buf = BytesIO()
        plt.savefig(buf)
        buf.seek(0)
        img1 = Image.open(buf)
        plt.clf()
        return img1
    elif 'congress' in parsed:
        # congrsMain.jpg
        orgImg = Original_Image('congress3.jpeg')
        bwImg = Image_Processed('congress3.jpeg')
        plt.figure(figsize=(5, 4))
        word_cloud(orgImg, bwImg, text_Party, maxWord=3000, colorGener=True)
        plt.tight_layout()
        buf = BytesIO()
        plt.savefig(buf)
        buf.seek(0)
        img2 = Image.open(buf)
        plt.clf()
        return img2
    elif 'aap' in parsed:
        orgImg = Original_Image('aapMain2.jpg')
        bwImg = Image_Processed('aapMain2.jpg')
        plt.figure(figsize=(5, 4))
        word_cloud(orgImg, bwImg, text_Party, maxWord=3000, colorGener=False, contCol='black')
        plt.tight_layout()
        buf = BytesIO()
        plt.savefig(buf)
        buf.seek(0)
        img3 = Image.open(buf)
        plt.clf()
        return img3
    else:
        # No known party in the file name: plain word cloud without a mask
        wordcloud = WordCloud(max_words=2000, background_color="white", mode="RGB").generate(text_Party)
        plt.figure(figsize=(5, 5))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout()
        buf = BytesIO()
        plt.savefig(buf)
        buf.seek(0)
        img4 = Image.open(buf)
        plt.clf()
        return img4
'''
# One-off downloads of the manifesto PDFs used as examples:
url = "http://library.bjp.org/jspui/bitstream/123456789/2988/1/BJP-Election-english-2019.pdf"
path_input = "./Bjp_Manifesto_2019.pdf"
urllib.request.urlretrieve(url, filename=path_input)

url = "https://drive.google.com/uc?id=1BLCiy_BWilfVdrUH8kbO-44DJevwO5CG&export=download"
path_input = "./Aap_Manifesto_2019.pdf"
urllib.request.urlretrieve(url, filename=path_input)

url = "https://drive.google.com/uc?id=1HVZvTtYntl0YKLnE0cwu0CvAIRhXOv60&export=download"
path_input = "./Congress_Manifesto_2019.pdf"
urllib.request.urlretrieve(url, filename=path_input)
'''
def analysis(Manifesto, Search):
    '''
    Main pipeline: parse the uploaded PDF, clean it, and build all outputs
    for the Gradio interface.
    '''
    raw_party = Parsing(Manifesto)
    text_Party = clean_text(raw_party)
    text_Party = Preprocess(text_Party)

    # Line-by-line sentiment on the raw text
    df = pd.DataFrame(raw_party.split('\n'), columns=['Content'])
    df['Subjectivity'] = df['Content'].apply(getSubjectivity)
    df['Polarity'] = df['Content'].apply(getPolarity)
    df['Analysis on Polarity'] = df['Polarity'].apply(getAnalysis)
    df['Analysis on Subjectivity'] = df['Subjectivity'].apply(getAnalysis)

    # Polarity bar chart
    plt.figure(figsize=(4, 3))
    plt.title('Sentiment Analysis')
    plt.xlabel('Sentiment')
    plt.ylabel('Counts')
    df['Analysis on Polarity'].value_counts().plot(kind='bar', color="#FF9F45")
    plt.tight_layout()
    buf = BytesIO()
    plt.savefig(buf)
    buf.seek(0)
    img1 = Image.open(buf)
    plt.clf()

    # Subjectivity bar chart
    plt.figure(figsize=(4, 3))
    df['Analysis on Subjectivity'].value_counts().plot(kind='bar', color="#B667F1")
    plt.tight_layout()
    buf = BytesIO()
    plt.savefig(buf)
    buf.seek(0)
    img2 = Image.open(buf)
    plt.clf()

    img3 = word_cloud_generator(Manifesto.name, text_Party)
    fdist_Party = fDistance(text_Party)
    img4 = fDistancePlot(text_Party)
    img5 = DispersionPlot(text_Party)

    # Context-based search: underline the search word in the returned phrases
    searChRes = get_all_phases_containing_tar_wrd(Search, text_Party)
    searChRes = searChRes.replace(Search, "\u0332".join(Search))

    plt.close('all')
    return searChRes, fdist_Party, img1, img2, img3, img4, img5
| Search_txt= "text" | |
| filePdf = "file" | |
| text = gr.Textbox(label='Context Based Search') | |
| mfw=gr.Label(label="Most Relevant Topics") | |
| plot1=gr.Image(label='Sentiment Analysis') | |
| plot2=gr.Image(label='Subjectivity Analysis') | |
| plot3=gr.Image(label='Word Cloud') | |
| plot4=gr.Image(label='Frequency Distribution') | |
| plot5=gr.Image(label='Dispersion Plot') | |
| io=gr.Interface(fn=analysis, inputs=[filePdf,Search_txt], outputs=[text,mfw,plot1,plot2,plot3,plot4,plot5], title='Manifesto Analysis',examples=[['Example/AAP_Manifesto_2019.pdf','government'],['Example/Bjp_Manifesto_2019.pdf','environment'],['Example/Congress_Manifesto_2019.pdf','safety']],theme='peach') | |
| io.launch(debug=True,share=False) | |
| #allow_screenshot=False,allow_flagging="never", | |
| #examples=[['manifestos/Bjp_Manifesto_2019.pdf','modi'],['AAP_Manifesto_2019.pdf','delhi'],['manifestos/Congress_Manifesto_2019.pdf','safety']]) | |
