In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup



In [2]:
# load csv file and process the data
urls_df = pd.read_csv('url_only_data.csv')
urls_df.head()


Unnamed: 0,url
0,https://www.foxnews.com/lifestyle/jack-carrs-e...
1,https://www.foxnews.com/entertainment/bruce-wi...
2,https://www.foxnews.com/politics/blinken-meets...
3,https://www.foxnews.com/entertainment/emily-bl...
4,https://www.foxnews.com/media/the-view-co-host...


In [3]:
# define the function to fetch the title of the news article
def fetch_title(url):
    try:
        response = requests.get(url)
        if response.status_code != 200:
            return f"Error: {response.status_code}"
        soup = BeautifulSoup(response.text, "html.parser")
        # Try to find the headline based on a common class used on Fox News pages
        title = soup.find("h1", class_="headline speakable")
        return title.text.strip() if title else "Title not found"
    except Exception as e:
        return f"Error: {e}"

def fetch_title_altered(url):
    try:
        response = requests.get(url)
        if response.status_code != 200:
            return f"Error: {response.status_code}"
        soup = BeautifulSoup(response.text, "html.parser")
        # Try to find the headline based on a common class used on Fox News pages
        title = soup.find("h1")
        return title.text.strip() if title else "Title not found"
    except Exception as e:
        return f"Error: {e}"

In [4]:
# remove the '.print' from the urls
urls_df['url'] = urls_df['url'].str.replace('.print', '', regex=False)

In [5]:
# fetch the title of the news article
urls_df['title'] = urls_df['url'].apply(fetch_title)

KeyboardInterrupt: 

In [None]:
# fetch the title of the news article that was not found
not_found = urls_df[urls_df['title'] == 'Title not found']
not_found['title'] = not_found['url'].apply(fetch_title_altered)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_found['title'] = not_found['url'].apply(fetch_title_altered)


In [72]:
urls_df.update(not_found)

In [75]:
# remove duplicates titles
urls_df.drop_duplicates(subset='title', keep='first', inplace=True)

In [84]:
# convert title to string
urls_df['title'] = urls_df['title'].astype(str)

In [None]:
# remove the " "" " from the titles
urls_df['title'] = urls_df['title'].str.strip('"')

In [93]:
# save the data to a new csv file
urls_df.to_csv('fetched_headlines.csv', index=False)

In [104]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


In [91]:
# Convert the labels to binary values (0 for ’FoxNews’, 1 for ’NBC’)
urls_df['label'] = urls_df['url'].apply(lambda x: 0 if 'foxnews.com' in x else 1 if 'nbcnews.com' in x else None)

In [97]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(urls_df['title'], urls_df['label'], test_size=0.2, random_state=42)


In [98]:
# Convert the text data to TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [99]:
# Train a Logistic Regression model
model = LogisticRegression(max_iter=100)
model.fit(X_train_tfidf, y_train)

In [100]:
y_pred = model.predict(X_test_tfidf)

In [105]:
# 7. Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred)
)

Accuracy: 0.7084
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.80      0.76       427
           1       0.70      0.59      0.64       331

    accuracy                           0.71       758
   macro avg       0.71      0.70      0.70       758
weighted avg       0.71      0.71      0.70       758



In [7]:
df = pd.read_csv('fetched_headlines.csv')
df.head

<bound method NDFrame.head of                                                     url  \
0     https://www.foxnews.com/lifestyle/jack-carrs-e...   
1     https://www.foxnews.com/entertainment/bruce-wi...   
2     https://www.foxnews.com/politics/blinken-meets...   
3     https://www.foxnews.com/entertainment/emily-bl...   
4     https://www.foxnews.com/media/the-view-co-host...   
...                                                 ...   
3784  https://www.nbcnews.com/politics/2024-election...   
3785  https://www.nbcnews.com/select/shopping/best-a...   
3786  https://www.nbcnews.com/select/shopping/best-v...   
3787  https://www.nbcnews.com/politics/2024-election...   
3788  https://www.nbcnews.com/select/shopping/white-...   

                                                  title  label  
0     Jack Carr recalls Gen. Eisenhower's D-Day memo...      0  
1     Bruce Willis, Demi Moore avoided doing one thi...      0  
2     Blinken meets Qatar PM, says Israeli actions a...      0  
3

In [None]:
df['outlet'] = df['url'].apply(lambda x: 'FoxNews' if 'foxnews.com' in x else 'NBC')


Unnamed: 0,url,title,label,outlet
0,https://www.foxnews.com/lifestyle/jack-carrs-e...,Jack Carr recalls Gen. Eisenhower's D-Day memo...,0,FoxNews
1,https://www.foxnews.com/entertainment/bruce-wi...,"Bruce Willis, Demi Moore avoided doing one thi...",0,FoxNews
2,https://www.foxnews.com/politics/blinken-meets...,"Blinken meets Qatar PM, says Israeli actions a...",0,FoxNews
3,https://www.foxnews.com/entertainment/emily-bl...,Emily Blunt says her ‘toes curl’ when people t...,0,FoxNews
4,https://www.foxnews.com/media/the-view-co-host...,"'The View' co-host, CNN commentator Ana Navarr...",0,FoxNews


In [10]:
# Swap label and outlet position and update label values
df['label'] = df['outlet'].apply(lambda x: 1 if x == 'FoxNews' else 0)
df = df[[ 'title', 'outlet', 'label']]
df.head()

Unnamed: 0,title,outlet,label
0,Jack Carr recalls Gen. Eisenhower's D-Day memo...,FoxNews,1
1,"Bruce Willis, Demi Moore avoided doing one thi...",FoxNews,1
2,"Blinken meets Qatar PM, says Israeli actions a...",FoxNews,1
3,Emily Blunt says her ‘toes curl’ when people t...,FoxNews,1
4,"'The View' co-host, CNN commentator Ana Navarr...",FoxNews,1


In [11]:
df.to_csv('train_data.csv', index=False)

In [12]:
df['title'].apply(type).unique()

array([<class 'str'>], dtype=object)