src/graph_notebook/notebooks/03-Neptune-ML/03-Sample-Applications/04-Telco-Networks/neptune_ml_utils.py [376:483]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        movies_df['release_date:Date'] = movies_df['release_date']
        # Drop the genre columns as well as the unneeded release date columns
        genres.append('video_release_date')
        genres.append('release_date')
        movies_df = movies_df.drop(columns=genres)

        movies_df.to_csv(os.path.join(self.formatted_directory,
                                      'movie_vertex.csv'), index=False)
        genres_edges_df.to_csv(os.path.join(self.formatted_directory,
                                            'genre_edges.csv'), index=False)

    def __process_ratings_users(self):
        # Create rating vertices, plus the wrote/about edges linking them to users and movies
        print('Processing Ratings', end='\r')
        ratings_vertices = pd.read_csv(os.path.join(
            self.raw_directory, 'ml-100k/u.data'), sep='\t', encoding='ISO-8859-1',
            names=['~from', '~to', 'score:Int', 'timestamp'])
        ratings_vertices['~from'] = ratings_vertices['~from'].apply(
            lambda x: f'user_{x}')
        ratings_vertices['~to'] = ratings_vertices['~to'].apply(
            lambda x: f'movie_{x}')

        ratings_vertices['~id'] = ratings_vertices['~from'].str.cat(
            ratings_vertices['~to'], sep=":")
        ratings_vertices['~label'] = "rating"
        # Map the 1-5 numeric score to a human-readable scale value
        score_scale = {1: 'Hate', 2: 'Dislike', 3: 'Neutral', 4: 'Like', 5: 'Love'}
        vertex_edges = {}
        edges = {}
        for index, row in ratings_vertices.iterrows():
            id_from = row['~from']
            id_to = row['~to']
            id_id = row['~id']
            # user -wrote-> rating and rating -about-> movie edges
            vertex_edges[index * 2] = {'~id': f"{id_from}-wrote-{id_id}", '~label': 'wrote',
                                       '~from': id_from, '~to': id_id}
            vertex_edges[index * 2 + 1] = {'~id': f"{id_id}-about-{id_to}", '~label': 'about',
                                           '~from': id_id, '~to': id_to}
            score = row['score:Int']
            scale = score_scale.get(score, '')
            # Direct user -rated-> movie edge carrying the score and scale
            edges[index] = {'~id': f"{id_from}-rated-{id_to}", '~label': 'rated',
                            '~from': id_from, '~to': id_to, 'score:Int': score, 'scale': scale}
        rating_edges_df = pd.DataFrame.from_dict(vertex_edges, orient='index')

        # Remove the from and to columns and write this out as a vertex now
        ratings_vertices = ratings_vertices.drop(columns=['~from', '~to'])
        ratings_vertices.to_csv(os.path.join(self.formatted_directory,
                                             'ratings_vertices.csv'), index=False)
        # Write out the rating vertex edges for wrote and about
        rating_edges_df.to_csv(os.path.join(self.formatted_directory,
                                            'ratings_vertex_edges.csv'), index=False)
        # Write out the rated edges
        rated_edges_df = pd.DataFrame.from_dict(edges, orient='index')
        rated_edges_df.to_csv(os.path.join(self.formatted_directory,
                                           'rated_edges.csv'), index=False)

    def __process_users(self):
        print("Processing Users", end='\r')
        # User vertices - load, assign typed column names, and save

        user_df = pd.read_csv(os.path.join(
            self.raw_directory, 'ml-100k/u.user'), sep='|', encoding='ISO-8859-1',
            names=['~id', 'age:Int', 'gender', 'occupation', 'zip_code'])
        user_df['~id'] = user_df['~id'].apply(
            lambda x: f'user_{x}')
        user_df['~label'] = 'user'
        user_df.to_csv(os.path.join(self.formatted_directory,
                                    'user_vertex.csv'), index=False)

    def __upload_to_s3(self, bucketname: str):
        path = urlparse(bucketname, allow_fragments=False)
        bucket = path.netloc
        file_path = path.path.strip('/')

        s3_client = boto3.client('s3')
        # Walk the formatted directory and upload every file under the bucket prefix
        for root, dirs, files in os.walk(self.formatted_directory):
            for file in files:
                s3_client.upload_file(os.path.join(root, file),
                                      bucket, f'{file_path}/{file}')

    def prepare_movielens_data(self, s3_bucket: str):
        bucket_name = f'{s3_bucket}/neptune-formatted/movielens-100k'
        self.__download_and_unzip()
        self.__process_movies_genres()
        self.__process_users()
        self.__process_ratings_users()
        self.__upload_to_s3(bucket_name)
        print('Completed processing; the data is ready for loading using the S3 URL below:')
        print(bucket_name)
        return bucket_name


class PretrainedModels:
    SCRIPT_PARAM_NAME = "sagemaker_program"
    DIR_PARAM_NAME = "sagemaker_submit_directory"
    CONTAINER_LOG_LEVEL_PARAM_NAME = "sagemaker_container_log_level"
    ENABLE_CLOUDWATCH_METRICS_PARAM = "sagemaker_enable_cloudwatch_metrics"
    MODEL_SERVER_TIMEOUT_PARAM_NAME = "sagemaker_model_server_timeout"
    MODEL_SERVER_WORKERS_PARAM_NAME = "sagemaker_model_server_workers"
    SAGEMAKER_REGION_PARAM_NAME = "sagemaker_region"
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
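
For context, a minimal usage sketch of the data-preparation flow above. The enclosing class is not visible in this excerpt, so MovieLensProcessor below is an assumed placeholder name, and the bucket URI is illustrative; prepare_movielens_data expects an s3:// location it can write to.

# Hypothetical usage sketch; MovieLensProcessor is a placeholder for the
# enclosing class, which this excerpt does not show.
processor = MovieLensProcessor()
s3_uri = processor.prepare_movielens_data('s3://example-neptune-staging-bucket')
# s3_uri points at <bucket>/neptune-formatted/movielens-100k, where the
# formatted vertex and edge CSV files are staged for the Neptune bulk loader.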



src/graph_notebook/notebooks/03-Neptune-ML/neptune_ml_utils.py [381:488]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        movies_df['release_date:Date'] = movies_df['release_date']
        # Drop the genre columns as well as the unneeded release date columns
        genres.append('video_release_date')
        genres.append('release_date')
        movies_df = movies_df.drop(columns=genres)

        movies_df.to_csv(os.path.join(self.formatted_directory,
                                      'movie_vertex.csv'), index=False)
        genres_edges_df.to_csv(os.path.join(self.formatted_directory,
                                            'genre_edges.csv'), index=False)

    def __process_ratings_users(self):
        # Create rating vertices, plus the wrote/about edges linking them to users and movies
        print('Processing Ratings', end='\r')
        ratings_vertices = pd.read_csv(os.path.join(
            self.raw_directory, 'ml-100k/u.data'), sep='\t', encoding='ISO-8859-1',
            names=['~from', '~to', 'score:Int', 'timestamp'])
        ratings_vertices['~from'] = ratings_vertices['~from'].apply(
            lambda x: f'user_{x}')
        ratings_vertices['~to'] = ratings_vertices['~to'].apply(
            lambda x: f'movie_{x}')

        ratings_vertices['~id'] = ratings_vertices['~from'].str.cat(
            ratings_vertices['~to'], sep=":")
        ratings_vertices['~label'] = "rating"
        # Map the 1-5 numeric score to a human-readable scale value
        score_scale = {1: 'Hate', 2: 'Dislike', 3: 'Neutral', 4: 'Like', 5: 'Love'}
        vertex_edges = {}
        edges = {}
        for index, row in ratings_vertices.iterrows():
            id_from = row['~from']
            id_to = row['~to']
            id_id = row['~id']
            # user -wrote-> rating and rating -about-> movie edges
            vertex_edges[index * 2] = {'~id': f"{id_from}-wrote-{id_id}", '~label': 'wrote',
                                       '~from': id_from, '~to': id_id}
            vertex_edges[index * 2 + 1] = {'~id': f"{id_id}-about-{id_to}", '~label': 'about',
                                           '~from': id_id, '~to': id_to}
            score = row['score:Int']
            scale = score_scale.get(score, '')
            # Direct user -rated-> movie edge carrying the score and scale
            edges[index] = {'~id': f"{id_from}-rated-{id_to}", '~label': 'rated',
                            '~from': id_from, '~to': id_to, 'score:Int': score, 'scale': scale}
        rating_edges_df = pd.DataFrame.from_dict(vertex_edges, orient='index')

        # Remove the from and to columns and write this out as a vertex now
        ratings_vertices = ratings_vertices.drop(columns=['~from', '~to'])
        ratings_vertices.to_csv(os.path.join(self.formatted_directory,
                                             'ratings_vertices.csv'), index=False)
        # Write out the rating vertex edges for wrote and about
        rating_edges_df.to_csv(os.path.join(self.formatted_directory,
                                            'ratings_vertex_edges.csv'), index=False)
        # Write out the rated edges
        rated_edges_df = pd.DataFrame.from_dict(edges, orient='index')
        rated_edges_df.to_csv(os.path.join(self.formatted_directory,
                                           'rated_edges.csv'), index=False)

    def __process_users(self):
        print("Processing Users", end='\r')
        # User vertices - load, assign typed column names, and save

        user_df = pd.read_csv(os.path.join(
            self.raw_directory, 'ml-100k/u.user'), sep='|', encoding='ISO-8859-1',
            names=['~id', 'age:Int', 'gender', 'occupation', 'zip_code'])
        user_df['~id'] = user_df['~id'].apply(
            lambda x: f'user_{x}')
        user_df['~label'] = 'user'
        user_df.to_csv(os.path.join(self.formatted_directory,
                                    'user_vertex.csv'), index=False)

    def __upload_to_s3(self, bucketname: str):
        path = urlparse(bucketname, allow_fragments=False)
        bucket = path.netloc
        file_path = path.path.strip('/')

        s3_client = boto3.client('s3')
        # Walk the formatted directory and upload every file under the bucket prefix
        for root, dirs, files in os.walk(self.formatted_directory):
            for file in files:
                s3_client.upload_file(os.path.join(root, file),
                                      bucket, f'{file_path}/{file}')

    def prepare_movielens_data(self, s3_bucket: str):
        bucket_name = f'{s3_bucket}/neptune-formatted/movielens-100k'
        self.__download_and_unzip()
        self.__process_movies_genres()
        self.__process_users()
        self.__process_ratings_users()
        self.__upload_to_s3(bucket_name)
        print('Completed processing; the data is ready for loading using the S3 URL below:')
        print(bucket_name)
        return bucket_name


class PretrainedModels:
    SCRIPT_PARAM_NAME = "sagemaker_program"
    DIR_PARAM_NAME = "sagemaker_submit_directory"
    CONTAINER_LOG_LEVEL_PARAM_NAME = "sagemaker_container_log_level"
    ENABLE_CLOUDWATCH_METRICS_PARAM = "sagemaker_enable_cloudwatch_metrics"
    MODEL_SERVER_TIMEOUT_PARAM_NAME = "sagemaker_model_server_timeout"
    MODEL_SERVER_WORKERS_PARAM_NAME = "sagemaker_model_server_workers"
    SAGEMAKER_REGION_PARAM_NAME = "sagemaker_region"
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
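
As a quick check of the output shape before loading, a minimal sanity-check sketch over the generated CSVs. The formatted_directory path is an assumption, since the directory setup happens outside this excerpt; the expected column list follows from the dictionaries built in __process_ratings_users above.

# Hypothetical sanity check; adjust formatted_directory to wherever the
# class writes its output (the setup is not shown in this excerpt).
import os
import pandas as pd

formatted_directory = os.path.join(os.getcwd(), 'formatted')
rated = pd.read_csv(os.path.join(formatted_directory, 'rated_edges.csv'))
print(rated.columns.tolist())  # ['~id', '~label', '~from', '~to', 'score:Int', 'scale']
print(rated['scale'].value_counts())  # distribution of Hate/Dislike/Neutral/Like/Love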



