in src/graph_notebook/notebooks/03-Neptune-ML/02-SPARQL/neptune_ml_sparql_utils.py [0:0]
def __process_ratings_users_rdf(self):
    """Convert the raw ratings file into two N-Quads files.

    Reads the tab-separated ``ml-100k/u.data`` file under
    ``self.raw_directory`` (columns: user id, movie id, score, timestamp)
    and writes two files into ``self.formatted_directory``:

    * ``user_movie_ratings.nq`` -- one ``Rating`` resource per input row
      carrying its score and timestamp plus ``forMovie`` / ``byUser``
      links. Rows with a score above 3 additionally link the user and the
      movie directly via ``recommended`` / ``wasRecommendedBy``.
    * ``critic_movie_scores.nq`` -- one ``criticScore`` per movie: the mean
      of that movie's scores, passed through ``round()`` and stored as an
      integer.

    Every quad is added to the named graph ``self.ns_ontology.Rating``.
    Returns ``None``; the only results are the two files.
    """
    print('Processing Ratings to RDF')
    ratings = pd.read_csv(
        os.path.join(self.raw_directory, 'ml-100k/u.data'),
        sep='\t',
        encoding='ISO-8859-1',
        names=['~from', '~to', 'score:Int', 'timestamp']
    )
    # Turn the raw numeric ids into resource-local names; the rating id is
    # the user and movie names joined with an underscore.
    ratings['~from'] = 'user_' + ratings['~from'].astype(str)
    ratings['~to'] = 'movie_' + ratings['~to'].astype(str)
    ratings['~id'] = ratings['~from'].str.cat(ratings['~to'], sep="_")

    # Look the namespaces and the shared named-graph term up once instead of
    # on every quad.
    resource = self.ns_resource
    ontology = self.ns_ontology
    graph_id = ontology.Rating

    ratings_graph = ConjunctiveGraph()
    averages_graph = ConjunctiveGraph()

    # Per-movie mean score. Only the score column is aggregated; the mean of
    # the timestamps was never used.
    mean_scores = ratings.groupby('~to')['score:Int'].mean()
    for movie_id, mean_score in mean_scores.items():
        averages_graph.add((
            resource[movie_id], ontology.criticScore,
            Literal(int(round(mean_score)), datatype=XSD.integer),
            graph_id
        ))

    # Walk the needed columns in lockstep rather than with iterrows(), which
    # builds a throwaway Series for each row. Column iteration still yields
    # plain Python scalars, so the resulting literals are the same.
    rows = zip(
        ratings['~id'],
        ratings['~from'],
        ratings['~to'],
        ratings['score:Int'],
        ratings['timestamp'],
    )
    for rating_id, user_id, movie_id, score, timestamp in rows:
        rating = resource[urllib.parse.quote_plus(rating_id)]
        user = resource[user_id]
        movie = resource[movie_id]

        ratings_graph.add((rating, RDF.type, ontology.Rating, graph_id))
        ratings_graph.add((
            rating, ontology.score, Literal(score, datatype=XSD.integer), graph_id
        ))
        ratings_graph.add((rating, ontology.timestamp, Literal(timestamp), graph_id))
        ratings_graph.add((rating, ontology.forMovie, movie, graph_id))
        ratings_graph.add((rating, ontology.byUser, user, graph_id))

        # A score above 3 is treated as a recommendation and gets direct
        # user <-> movie edges in both directions.
        if score > 3:
            ratings_graph.add((user, ontology.recommended, movie, graph_id))
            ratings_graph.add((movie, ontology.wasRecommendedBy, user, graph_id))

    ratings_rdf_file = os.path.join(self.formatted_directory, 'user_movie_ratings.nq')
    averages_graph_file = os.path.join(self.formatted_directory, 'critic_movie_scores.nq')
    ratings_graph.serialize(format='nquads', destination=ratings_rdf_file)
    averages_graph.serialize(format='nquads', destination=averages_graph_file)