in assets/lambda_helper_neptune/python/rdflib/tools/csv2rdf.py [0:0]
def convert(self, csvreader):
    """Convert the rows yielded by *csvreader* into RDF triples.

    Configuration is read from attributes on ``self`` (``OUT``, ``IDENT``,
    ``BASE``, ``PROPBASE``, ``SKIP``, ``PROPS``, ``DEFINECLASS``, ``CLASS``,
    ``LABEL``, ``COLUMNS``) and every triple is emitted through
    ``self.triple``.  Progress and a final summary are written to
    ``sys.stderr``; the summary reports ``self.triples`` as the triple count.

    Side effects: ``self.IDENT`` is normalised to a tuple (unless it is
    ``"auto"``), ``self.BASE`` / ``self.PROPBASE`` are filled with example.org
    defaults when unset, and ``self.OUT`` is closed at the end.

    :param csvreader: iterator yielding one sequence of string cells per
        CSV line; consumed with ``next()`` for the skipped and header lines
        and then iterated for the data rows.
    :raises StopIteration: if *csvreader* is exhausted before the skipped
        lines and the header line have been read.
    :raises Exception: any error raised while processing a data row is
        re-raised after the row number has been written to ``sys.stderr``.
        Errors converting a single cell value are only reported as warnings.
    """
    start = time.time()

    if self.OUT:
        sys.stderr.write("Output to %s\n" % self.OUT.name)

    # A single identifier column is wrapped so index() always gets a tuple.
    if self.IDENT != "auto" and not isinstance(self.IDENT, tuple):
        self.IDENT = (self.IDENT,)

    if not self.BASE:
        warnings.warn("No base given, using http://example.org/instances/")
        self.BASE = rdflib.Namespace("http://example.org/instances/")

    if not self.PROPBASE:
        # The message names the namespace that is actually used below.
        warnings.warn(
            "No property base given, using http://example.org/props/")
        self.PROPBASE = rdflib.Namespace("http://example.org/props/")

    # skip lines at the start
    for _ in range(self.SKIP):
        next(csvreader)

    # read header line
    header_labels = list(next(csvreader))
    headers = dict(
        enumerate([self.PROPBASE[toProperty(x)] for x in header_labels]))

    # override header properties if some are given
    for k, v in self.PROPS.items():
        headers[k] = v
        header_labels[k] = split_uri(v)[1]

    if self.DEFINECLASS:
        # output class/property definitions
        self.triple(self.CLASS, RDF.type, RDFS.Class)
        for i, label in enumerate(header_labels):
            prop = headers[i]
            if prop == "" or label == "":
                continue
            if self.COLUMNS.get(i) == _config_ignore:
                continue
            self.triple(prop, RDF.type, RDF.Property)
            self.triple(
                prop, RDFS.label, rdflib.Literal(toPropertyLabel(label)))
            self.triple(prop, RDFS.domain, self.CLASS)
            self.triple(
                prop, RDFS.range,
                self.COLUMNS.get(i, default_node_make).range())

    rows = 0
    for row in csvreader:
        try:
            if self.IDENT == "auto":
                # Row number doubles as the local name of the subject URI.
                uri = self.BASE["%d" % rows]
            else:
                # Replace spaces on the str *before* quoting: calling
                # bytes.replace() with str arguments raises TypeError on
                # Python 3.  quote() encodes str input as UTF-8 itself.
                uri = self.BASE["_".join(
                    urllib.parse.quote(x.replace(" ", "_"), safe="")
                    for x in index(row, self.IDENT))]

            if self.LABEL:
                self.triple(uri, RDFS.label, rdflib.Literal(
                    " ".join(index(row, self.LABEL))))

            if self.CLASS:
                # type triple
                self.triple(uri, RDF.type, self.CLASS)

            for i, cell in enumerate(row):
                cell = cell.strip()
                if cell == "":
                    continue
                if self.COLUMNS.get(i) == _config_ignore:
                    continue
                try:
                    o = self.COLUMNS.get(i, rdflib.Literal)(cell)
                    if isinstance(o, list):
                        for _o in o:
                            self.triple(uri, headers[i], _o)
                    else:
                        self.triple(uri, headers[i], o)
                except Exception as e:
                    # Format the exception itself: Python 3 exceptions have
                    # no ``.message`` attribute, so reading it here would
                    # turn an ignorable cell error into a crash.
                    warnings.warn(
                        "Could not process value for column " +
                        "%d:%s in row %d, ignoring: %s " % (
                            i, headers[i], rows, e))

            rows += 1
            if rows % 100000 == 0:
                sys.stderr.write(
                    "%d rows, %d triples, elapsed %.2fs.\n" % (
                        rows, self.triples, time.time() - start))
        except BaseException:
            # Report which row failed, then let the error propagate.
            sys.stderr.write("Error processing line: %d\n" % rows)
            raise

    # output types/labels for generated URIs
    # NOTE(review): ``uris`` is a module-level mapping of label -> (uri, class);
    # presumably filled by the column converters while rows are processed --
    # confirm against the rest of the module.
    classes = set()
    for label, (u, c) in uris.items():
        self.triple(u, RDFS.label, rdflib.Literal(label))
        if c:
            c = rdflib.URIRef(c)
            classes.add(c)
            self.triple(u, RDF.type, c)

    for c in classes:
        self.triple(c, RDF.type, RDFS.Class)

    self.OUT.close()
    sys.stderr.write(
        "Converted %d rows into %d triples.\n" % (rows, self.triples))
    sys.stderr.write("Took %.2f seconds.\n" % (time.time() - start))