def __process_movies_genres()

in src/graph_notebook/notebooks/03-Neptune-ML/neptune_ml_utils.py [0:0]


    def __process_movies_genres(self):
        """Write the movie vertex, genre vertex and movie->genre edge CSV files.

        Reads ``ml-100k/u.item`` (pipe-delimited, ISO-8859-1 encoded) from
        ``self.raw_directory`` and writes three files into
        ``self.formatted_directory``:

        * ``genre_vertex.csv`` -- one row per genre with the columns
          ``~id``, ``~label`` (always ``genre``) and ``name``.
        * ``movie_vertex.csv`` -- one row per movie with the columns ``~id``
          (``movie_<n>``), ``title``, ``imdb_url``, ``~label`` (always
          ``movie``), ``genre:String[]`` (genre names joined with ``;``) and
          ``release_date:Date`` (ISO-8601 text, empty when missing).
        * ``genre_edges.csv`` -- one ``included_in`` edge per (movie, genre)
          pair whose flag in the input equals 1, with the columns ``~id``,
          ``~from``, ``~to`` and ``~label``.

        Returns:
            None. The results are the files written to disk.
        """
        # Single source of truth for the genre flag columns; their order
        # matches the order of the flag columns in u.item.
        genres = ['unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
                  'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
                  'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
        edge_columns = ['~id', '~from', '~to', '~label']

        print('Processing Movies', end='\r')
        movies_df = pd.read_csv(
            os.path.join(self.raw_directory, 'ml-100k/u.item'),
            sep='|', encoding='ISO-8859-1',
            names=['~id', 'title', 'release_date', 'video_release_date',
                   'imdb_url', *genres])

        # Parse dates such as '01-Jan-1995' and convert them to ISO format;
        # missing values are passed through untouched.
        movies_df['release_date'] = movies_df['release_date'].apply(
            lambda x: datetime.strptime(x, '%d-%b-%Y').isoformat()
            if not pd.isna(x) else x)
        movies_df['~label'] = 'movie'
        movies_df['~id'] = movies_df['~id'].apply(lambda x: f'movie_{x}')

        # Genre vertices: the genre name doubles as the vertex id.
        genre_df = pd.DataFrame(genres, columns=['~id'])
        genre_df['~label'] = 'genre'
        genre_df['name'] = genre_df['~id']
        genre_df.to_csv(os.path.join(self.formatted_directory,
                                     'genre_vertex.csv'), index=False)

        # Collect plain dicts/strings while walking the movies and build the
        # frames once at the end. This avoids creating a one-row DataFrame
        # per edge, per-row .loc writes, and concatenating with an empty
        # frame (which recent pandas versions warn about).
        edge_rows = []
        genre_strings = []
        flag_matrix = movies_df[genres].to_numpy()
        for movie_id, flags in zip(movies_df['~id'], flag_matrix):
            movie_genres = [g for g, flag in zip(genres, flags) if flag == 1]
            edge_rows.extend(
                {'~id': f"{movie_id}-included_in-{g}",
                 '~from': movie_id,
                 '~to': g,
                 '~label': 'included_in'}
                for g in movie_genres)
            genre_strings.append(';'.join(movie_genres))

        genres_edges_df = pd.DataFrame(edge_rows, columns=edge_columns)
        movies_df['genre:String[]'] = genre_strings

        # Expose the release date under a header that carries its data type.
        movies_df['release_date:Date'] = movies_df['release_date']
        # Drop the genre flag columns and the now-redundant date columns.
        movies_df = movies_df.drop(
            columns=[*genres, 'video_release_date', 'release_date'])

        movies_df.to_csv(os.path.join(self.formatted_directory,
                                      'movie_vertex.csv'), index=False)
        genres_edges_df.to_csv(os.path.join(self.formatted_directory,
                                            'genre_edges.csv'), index=False)