in src/graph_notebook/notebooks/03-Neptune-ML/03-Sample-Applications/04-Telco-Networks/neptune_ml_utils.py [0:0]
def __process_movies_genres(self):
    """Build the movie and genre vertex files and the movie->genre edge file.

    Reads ``ml-100k/u.item`` (pipe-delimited, ISO-8859-1; presumably the
    MovieLens 100K item file) from ``self.raw_directory`` and writes three
    CSV files into ``self.formatted_directory``:

    * ``genre_vertex.csv`` - one row per genre (``~id``, ``~label``, ``name``).
    * ``movie_vertex.csv`` - one row per movie with ``~id`` (``movie_<n>``),
      ``title``, ``imdb_url``, ``~label``, ``genre:String[]`` (the movie's
      genres joined with ``;``) and ``release_date:Date`` (ISO 8601 string,
      left empty when the source value is missing).
    * ``genre_edges.csv`` - one ``included_in`` edge per (movie, genre) pair
      whose flag column equals 1.

    Returns:
        None. All results are written to disk.
    """
    print('Processing Movies', end='\r')

    # Single source of truth for the genre flag columns; the order here is
    # the column order in u.item and the row order in genre_vertex.csv.
    genres = ['unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
              'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
              'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

    movies_df = pd.read_csv(
        os.path.join(self.raw_directory, 'ml-100k/u.item'),
        sep='|', encoding='ISO-8859-1',
        names=['~id', 'title', 'release_date', 'video_release_date', 'imdb_url'] + genres)

    # Parse dates such as "01-Jan-1995" and convert them to ISO format;
    # missing values are passed through untouched.
    movies_df['release_date'] = movies_df['release_date'].apply(
        lambda x: x if pd.isna(x) else datetime.strptime(x, '%d-%b-%Y').isoformat())
    movies_df['~label'] = 'movie'
    movies_df['~id'] = movies_df['~id'].apply(lambda x: f'movie_{x}')

    # Genre vertices: the genre name doubles as the vertex id.
    genre_df = pd.DataFrame(genres, columns=['~id'])
    genre_df['~label'] = 'genre'
    genre_df['name'] = genre_df['~id']
    genre_df.to_csv(os.path.join(self.formatted_directory,
                                 'genre_vertex.csv'), index=False)

    # Walk every movie once, collecting its genres and the matching edges in
    # plain lists. DataFrame.append was removed in pandas 2.0 and copied the
    # whole frame on every call, so the edge frame is built once afterwards.
    edge_records = []
    genre_strings = []
    for movie_id, flags in zip(movies_df['~id'], movies_df[genres].to_numpy()):
        movie_genres = [g for g, flag in zip(genres, flags) if flag == 1]
        for g in movie_genres:
            edge_records.append(
                {'~id': f"{movie_id}-included_in-{g}", '~label': 'included_in',
                 '~from': movie_id, '~to': g})
        genre_strings.append(';'.join(movie_genres))

    # Explicit column list keeps the header stable even when there are no edges.
    genres_edges_df = pd.DataFrame(
        edge_records, columns=['~id', '~from', '~to', '~label'])
    movies_df['genre:String[]'] = genre_strings

    # Re-add the release date under a column name that carries the data type.
    movies_df['release_date:Date'] = movies_df['release_date']

    # Drop the genre flag columns as well as the unneeded release date columns.
    movies_df = movies_df.drop(
        columns=genres + ['video_release_date', 'release_date'])

    movies_df.to_csv(os.path.join(self.formatted_directory,
                                  'movie_vertex.csv'), index=False)
    genres_edges_df.to_csv(os.path.join(self.formatted_directory,
                                        'genre_edges.csv'), index=False)