in src/graph_notebook/notebooks/03-Neptune-ML/02-SPARQL/neptune_ml_sparql_utils.py [0:0]
def __process_movies_genres_rdf(self):
    """Convert the MovieLens ``u.item`` file into two N-Quads files.

    Reads ``ml-100k/u.item`` from ``self.raw_directory`` and writes, into
    ``self.formatted_directory``:

    * ``movies.nq`` -- type, title, label, IMDb URL and release date of
      every movie;
    * ``movie_genres.nq`` -- one ``hasGenre`` and one ``hasOriginalGenre``
      quad for every genre flag that is set on a movie.

    Rows with a missing IMDb URL or release date simply get no quad for
    that property, instead of an invalid ``"nan"`` typed literal.
    """
    print('Processing Movies to RDF')

    # Genre flag columns, in the order they appear in u.item.
    genres = ['unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
              'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
              'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    movies_df = pd.read_csv(
        os.path.join(self.raw_directory, 'ml-100k/u.item'),
        sep='|', encoding='ISO-8859-1',
        names=['~id', 'title', 'release_date', 'video_release_date', 'imdb_url'] + genres)

    # Parse date and convert to ISO format; missing dates are left as NaN
    # so they can be detected (and skipped) below.
    movies_df['release_date'] = movies_df['release_date'].apply(
        lambda x: x if pd.isna(x) else datetime.strptime(x, '%d-%b-%Y').isoformat())
    movies_df['~id'] = movies_df['~id'].apply(lambda x: f'movie_{x}')

    movie_rdf_filename = os.path.join(self.formatted_directory, 'movies.nq')
    movie_genre_rdf_filename = os.path.join(self.formatted_directory, 'movie_genres.nq')
    movie_graph = ConjunctiveGraph()
    movie_genre_graph = ConjunctiveGraph()

    # Fourth element (context) of the quads written below.
    movies_context = self.ns_ontology.Movies
    # NOTE(review): genre quads use `Movie` (singular) as their context,
    # unlike the movie quads above -- kept as-is; confirm it is intended.
    genres_context = self.ns_ontology.Movie

    for _, row in movies_df.iterrows():
        movie = self.ns_resource[row['~id']]
        title = Literal(row['title'], datatype=XSD.string)

        movie_graph.add((movie, RDF.type, self.ns_ontology.Movie, movies_context))
        movie_graph.add((movie, self.ns_ontology.title, title, movies_context))
        movie_graph.add((movie, RDFS.label, title, movies_context))

        # Optional properties: only emit a quad when a value is present.
        imdb_url = row['imdb_url']
        if not pd.isna(imdb_url):
            movie_graph.add((
                movie, self.ns_ontology.imdbURL,
                Literal(imdb_url, datatype=XSD.anyURI), movies_context
            ))
        release_date = row['release_date']
        if not pd.isna(release_date):
            movie_graph.add((
                movie, self.ns_ontology.releaseDate,
                Literal(release_date, datatype=XSD.dateTime), movies_context
            ))

        # add genre labels: every set flag yields both genre predicates
        for genre_value in genres:
            if row[genre_value]:
                genre = Literal(genre_value, datatype=XSD.string)
                movie_genre_graph.add((
                    movie, self.ns_ontology.hasGenre, genre, genres_context
                ))
                movie_genre_graph.add((
                    movie, self.ns_ontology.hasOriginalGenre, genre, genres_context
                ))

    movie_graph.serialize(format='nquads', destination=movie_rdf_filename)
    movie_genre_graph.serialize(format='nquads', destination=movie_genre_rdf_filename)