Instructions to use hzonuz/imdb-recommender with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Scikit-learn
How to use hzonuz/imdb-recommender with Scikit-learn:
```python
from huggingface_hub import hf_hub_download
import joblib

model = joblib.load(
    hf_hub_download("hzonuz/imdb-recommender", "sklearn_model.joblib")
)
# only load pickle files from sources you trust
# read more about it here https://skops.readthedocs.io/en/stable/persistence.html
```
- Notebooks
- Google Colab
- Kaggle
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| from ast import literal_eval | |
| import gc | |
| from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
| from sklearn.preprocessing import MinMaxScaler | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| # import seaborn as sns | |
| from collections import Counter | |
| import mlflow | |
def init_mlflow(tracking_uri="http://0.0.0.0:8889", experiment_name="Default"):
    """Configure MLflow tracking, start a run and enable sklearn autologging.

    Args:
        tracking_uri: Address of the MLflow tracking server. Defaults to the
            address this script has always used.
        experiment_name: Experiment under which the run is recorded.

    The run started here stays active until ``mlflow.end_run()`` is called.
    """
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)
    mlflow.start_run()
    mlflow.sklearn.autolog()
def load_data():
    """Read the five dataset CSV files from ``./datasets``.

    Returns:
        A 5-tuple of DataFrames, in this order: credits, keywords,
        links (small), movies metadata, ratings (small).
    """
    file_stems = (
        "credits",
        "keywords",
        "links_small",
        "movies_metadata",
        "ratings_small",
    )
    return tuple(pd.read_csv(f"./datasets/{stem}.csv") for stem in file_stems)
def draw_adult_movies_pie_chart(movies_df):
    """Draw two labelled bubbles with the adult / non-adult movie counts.

    Rows are counted by comparing the ``adult`` column with the strings
    ``'True'`` and ``'False'``. The figure is written to ``adult.png`` and
    logged to MLflow as an artifact.
    """
    bubble_centres = [0.5, 1.5]
    count_style = dict(va='center', ha='center', fontsize=18, weight=600, color='white')
    caption_style = dict(va='center', ha='center', fontsize=17, weight=500, color='#1c2541')

    plt.figure(figsize=(8, 4))
    plt.scatter(x=bubble_centres, y=[1, 1], s=15000, color=['#06837f', '#fdc100'])
    plt.xlim(0, 2)
    plt.ylim(0.9, 1.2)
    plt.title('Distribution of Adult and Non Adult Movies', fontsize=18, weight=600, color='#333d29')

    # Counts go inside the bubbles ...
    for x_pos, flag in zip(bubble_centres, ('True', 'False')):
        matching_rows = movies_df[movies_df['adult'] == flag]
        plt.text(x_pos, 1, '{}\nMovies'.format(str(len(matching_rows))), **count_style)

    # ... and the captions sit just above them.
    for x_pos, caption in zip(bubble_centres, ('Adult', 'Non Adult')):
        plt.text(x_pos, 1.11, caption, **caption_style)

    plt.axis('off')
    plt.savefig('adult.png')
    mlflow.log_artifact('adult.png')
def draw_genres_pie_chart(df):
    """Draw a pie chart of the five most common genres plus an "Others" slice.

    Args:
        df: DataFrame whose ``genres`` column holds stringified lists such as
            ``"['Drama', 'Comedy']"``.

    The figure is written to ``genres.png`` and logged to MLflow as an
    artifact.
    """
    genre_counts = Counter()
    for raw in df['genres']:
        # Drop the surrounding brackets, split the items, and strip the quote
        # characters left over from the list repr. Empty items (from "[]")
        # are not genres and are skipped.
        names = (item.strip("'\"") for item in raw[1:-1].split(', '))
        genre_counts.update(name for name in names if name)

    top_genres = genre_counts.most_common(5)
    df_plot = pd.DataFrame(top_genres, columns=['genre', 'total'])
    # "Others" is everything that is not in the top five. Computing it from
    # the totals avoids skipping the sixth genre.
    others_total = sum(genre_counts.values()) - sum(total for _, total in top_genres)
    df_plot.loc[len(df_plot)] = {'genre': 'Others', 'total': others_total}

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
    # NOTE: axes[0] stays empty. It was meant for a seaborn bar plot of the
    # top five genres, but the seaborn import is disabled in this file.

    # Size the per-slice settings from the data so fewer than five genres
    # still works; the last slice ("Others") is pulled out of the pie.
    top_palette = ['#06837f', '#02cecb', '#b4ffff', '#f8e16c', '#fed811']
    slice_count = len(df_plot)
    explode = [0] * (slice_count - 1) + [0.1]
    colors = top_palette[:slice_count - 1] + ['#fdc100']

    axes[1].set_title('Percentage Ratio of Movie Genres', fontsize=18, weight=600, color='#333d29')
    wedges, texts, autotexts = axes[1].pie(x=df_plot['total'], labels=df_plot['genre'], autopct='%.2f%%',
                                           textprops=dict(fontsize=14), explode=explode,
                                           colors=colors)
    for autotext in autotexts:
        autotext.set_color('#1c2541')
        autotext.set_weight('bold')
    axes[1].axis('off')
    plt.savefig('genres.png')
    mlflow.log_artifact('genres.png')
def director(x):
    """Return the name of the first crew member whose job is "Director".

    Args:
        x: Iterable of crew dicts, each with ``"job"`` and ``"name"`` keys.

    Returns:
        The director's name, or ``""`` when no entry has that job.
    """
    return next((member["name"] for member in x if member["job"] == "Director"), "")
def writer_screenplay(x):
    """Collect the names of crew members credited with a writing job.

    Args:
        x: Iterable of crew dicts, each with ``"job"`` and ``"name"`` keys.

    Returns:
        Names, in input order, of every entry whose job is "Writer",
        "Screenplay" or "Author".
    """
    writing_jobs = ("Writer", "Screenplay", "Author")
    return [member["name"] for member in x if member["job"] in writing_jobs]
def calculate_cosine_similarity(train_df):
    """Return the pairwise cosine similarity between the rows of *train_df*."""
    return cosine_similarity(train_df)
def _names_from_json(column):
    """Parse stringified lists of dicts into lists of their ``name`` values."""
    parsed = column.fillna('[]').apply(literal_eval)
    return parsed.apply(
        lambda items: [item['name'] for item in items] if isinstance(items, list) else "")


def _token_counts(text, index, **vectorizer_options):
    """Return a case-sensitive token-count frame (int8) for *text*, indexed by *index*."""
    vectorizer = CountVectorizer(lowercase=False, **vectorizer_options)
    counts = vectorizer.fit_transform(text)
    frame = pd.DataFrame(counts.toarray(), columns=vectorizer.get_feature_names_out(), index=index)
    return frame.astype(np.int8)


def clean_data(credits_df, keywords_df, movies_df):
    """Merge the raw tables and build the feature matrix used for similarity.

    Args:
        credits_df: Credits table with ``id``, ``cast`` and ``crew`` columns.
        keywords_df: Keywords table with ``id`` and ``keywords`` columns.
        movies_df: Movie metadata table.

    Returns:
        ``(train_df, df)``. ``df`` is the cleaned, merged table indexed by
        title. ``train_df`` shares that index and concatenates one-hot
        language, genre, cast, writer and director columns (int8) with the
        TF-IDF weights of the movie text (float32).

    The input frames are not modified.
    """
    # draw_adult_movies_pie_chart(movies_df)
    # Cast the id columns to int. Movie rows whose id is not numeric can never
    # match a keywords/credits row, so they are dropped before the merge.
    movie_ids = pd.to_numeric(movies_df["id"], errors="coerce")
    has_numeric_id = movie_ids.notnull()
    movies_df = movies_df.loc[has_numeric_id].assign(id=movie_ids[has_numeric_id].astype(int))
    keywords_df = keywords_df.assign(id=keywords_df["id"].astype(int))
    credits_df = credits_df.assign(id=credits_df["id"].astype(int))

    # Merge movies, keywords and credits on the id column
    df = movies_df.merge(keywords_df, on="id").merge(credits_df, on="id")

    # Remove duplicated (title, id) rows and movies without a title
    df = df.drop_duplicates(subset=["title", "id"])
    df = df[df["title"].notnull()]
    # Keep only movies with more than 30 votes
    df = df[df["vote_count"] > 30]

    # Replace the release date by its year
    df = df.assign(release_year=pd.to_datetime(df["release_date"]).dt.year)
    df = df.drop(columns="release_date")
    # Remove rows without a release year or a runtime. The copy makes the
    # result an independent frame, so the column assignments below are safe.
    df = df[df["release_year"].notnull() & df["runtime"].notnull()].copy()

    # Bucket vote_average (equal-width) and release_year (deciles) into bins
    # 1..10 and scale them to [0, 1].
    # NOTE(review): these two columns are kept on df but are not part of
    # train_df - confirm whether they were meant to be features.
    vote_bins = pd.cut(df["vote_average"].astype(float), 10, labels=range(1, 11)).astype(int)
    df["vote_average_bins"] = MinMaxScaler().fit_transform(
        vote_bins.to_numpy().reshape(-1, 1)).ravel()
    year_bins = pd.qcut(df["release_year"].astype(float), q=10, labels=range(1, 11)).astype(int)
    df["release_year_bins"] = MinMaxScaler().fit_transform(
        year_bins.to_numpy().reshape(-1, 1)).ravel()

    # Use the title as index
    df = df.set_index("title")

    # One-hot encode the original language
    languages = pd.get_dummies(df["original_language"]).astype(np.int8)

    # Extract the genre names from their JSON-like string and count them
    df["genres"] = _names_from_json(df["genres"]).astype(str)
    # draw_genres_pie_chart(df)
    genres_df = _token_counts(df["genres"], df.index)

    # TF-IDF over overview + tagline + keywords
    df["keywords"] = _names_from_json(df["keywords"]).astype(str)
    # Assign the filled columns back: a chained in-place fillna on df[col]
    # is not guaranteed to write through to df.
    df["tagline"] = df["tagline"].fillna("")
    df["overview"] = df["overview"].fillna("")
    # Join with spaces so the last word of one field does not fuse with the
    # first word of the next.
    df["text"] = df["overview"] + " " + df["tagline"] + " " + df["keywords"]
    tfidf = TfidfVectorizer(max_features=5000, dtype=np.float32)
    tfidf_matrix = tfidf.fit_transform(df["text"])
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out(),
                            index=df.index)

    # Cast: first 15 names, spaces removed so each name is a single token
    df["cast"] = _names_from_json(df["cast"]).apply(
        lambda names: [name.replace(" ", "") for name in names][:15])
    df["CC"] = df["cast"].astype(str)
    cast_df = _token_counts(df["CC"], df.index, min_df=4)

    # Crew: parse once, then derive the director and up to three writers
    crew = df["crew"].apply(literal_eval)
    df["dir"] = crew.apply(director)
    directors = pd.get_dummies(df["dir"]).astype(np.int8)
    df["writer_screenplay"] = crew.apply(
        lambda members: str([name.replace(" ", "") for name in writer_screenplay(members)][:3]))
    writing_df = _token_counts(df["writer_screenplay"], df.index, min_df=2)

    gc.collect()
    # The count/one-hot blocks are already int8. The TF-IDF block stays
    # float32: casting it to int8 would truncate its fractional weights to 0
    # and discard the text features.
    train_df = pd.concat([languages, genres_df, cast_df, writing_df, directors, tfidf_df], axis=1)
    gc.collect()
    return train_df, df
class RecommenderSystem(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc model that recommends movies by cosine similarity."""

    def load_context(self, context):
        """Load the datasets and precompute the movie-to-movie similarity matrix."""
        credits_df, keywords_df, _links_df, movies_df, _ratings_df = load_data()
        self.train_df, self.df = clean_data(credits_df, keywords_df, movies_df)
        self.cosine_sim = calculate_cosine_similarity(self.train_df)

    def predict(self, context, model_input):
        """Recommend movies for the title stored at ``model_input[0]``."""
        return self.recommend(model_input[0], self.cosine_sim)

    def recommend(self, title, cosine_sim):
        """Return the movies most similar to *title*.

        Args:
            title: Movie title, looked up in the index of ``self.train_df``.
            cosine_sim: Square similarity matrix whose rows and columns follow
                the row order of ``self.train_df``.

        Returns:
            DataFrame with columns ``Movies``, ``Id`` (the ``imdb_id``) and
            ``Similarity``, holding up to 10 rows sorted by decreasing
            similarity. The queried movie itself is never included.

        Raises:
            KeyError: If *title* is not in the index.

        Summary metrics of the scores are logged to MLflow as a side effect.
        """
        number = 10
        titles = self.train_df.index
        positions = pd.Series(range(len(titles)), index=titles)
        # Several movies can share a title. Keep the first occurrence so the
        # lookup yields one row position instead of a Series.
        positions = positions[~positions.index.duplicated(keep="first")]
        # Get the row position of the movie that matches the title
        idx = positions[title]

        # Pairwise similarity of every movie with that movie
        scores = np.asarray(cosine_sim[idx], dtype=float)
        mlflow.log_metric("cosine-total-avg", float(scores.mean()))

        # Rank by decreasing similarity (stable, so ties keep row order) and
        # drop the queried movie explicitly: on ties it is not necessarily
        # ranked first.
        ranked = np.argsort(-scores, kind="stable")
        ranked = ranked[ranked != idx][:number]
        top_scores = scores[ranked]

        # Fewer than `number` (possibly zero) results can be available
        if top_scores.size:
            mlflow.log_metric("cosine-result-avg", float(top_scores.mean()))
            mlflow.log_metric("cosine-result-max", float(top_scores[0]))
            mlflow.log_metric("cosine-result-min", float(top_scores[-1]))
        mlflow.log_param("number-of-results", number)

        selected = self.df.iloc[ranked]
        recommendations = pd.DataFrame({"Movies": selected.index.tolist(),
                                        "Id": selected.imdb_id.tolist(),
                                        "Similarity": top_scores.tolist()})
        return recommendations
if __name__ == '__main__':
    # Save a local copy of the model, then log and register it in a tracked
    # run and smoke-test the saved copy with one prediction.
    mlflow.pyfunc.save_model(path="imdb-recommendation-v2", python_model=RecommenderSystem())
    init_mlflow()
    try:
        mlflow.pyfunc.log_model("imdb-recommendation-v2", python_model=RecommenderSystem(), registered_model_name="recommendation-model-v2")
        loaded_model = mlflow.pyfunc.load_model("imdb-recommendation-v2")
        print(loaded_model.predict(["The Dark Knight Rises"]))
    except BaseException:
        # Record the failure on the run instead of leaving it open
        mlflow.end_run(status="FAILED")
        raise
    else:
        mlflow.end_run()