aios/tools/hape/hape_libs/utils/havenask_dataset.py (72 lines of code) (raw):

#!/bin/env python
# *-* coding:utf-8 *-*
"""Read a Havenask document file and convert its records to SQL inserts."""

import os
import json
HERE = os.path.dirname(os.path.realpath(__file__))
import sys
# Put this file's directory first on the module search path before the
# project import below.
sys.path = [HERE] + sys.path

from hape_libs.utils.havenask_schema import HavenaskSchema


class HavenaskRecord:
    """One document parsed from a data file.

    Attributes are plain values handed in by the caller:
      type         -- value of the document's ``CMD`` line (None if absent)
      fields       -- dict mapping field name to its SQL-ready value text
      raw_doc      -- the document re-serialised as ``key=value`` lines
      raw_doc_dict -- dict mapping every key seen to its unquoted value
    """

    def __init__(self, type, fields, raw_doc, raw_doc_dict):
        self.type = type
        self.fields = fields
        self.raw_doc = raw_doc
        self.raw_doc_dict = raw_doc_dict

    def get(self, key):
        """Return the stored value of field ``key``; raises KeyError if absent."""
        return self.fields[key]

    def to_sql(self, table):
        """Build an ``insert into`` statement for this record.

        Field values are interpolated verbatim (no escaping is applied here),
        and the fixed ``&&kvpair=databaseName:database`` suffix is appended.
        """
        keys, values = [], []
        for key, value in self.fields.items():
            keys.append(key)
            # Only non-native text (Python 2 ``unicode``) needs encoding.
            # Encoding a Python 3 ``str`` would yield ``bytes``, which
            # ``",".join`` rejects with TypeError.
            if not isinstance(value, str):
                try:
                    value = value.encode('utf-8')
                except (AttributeError, UnicodeError):
                    # Best effort: keep the value unchanged when it cannot
                    # be encoded.
                    pass
            values.append(value)
        return "insert into {} ({}) values ({}) &&kvpair=databaseName:database".format(
            table, ",".join(keys), ",".join(values))


class HavenaskDataSet:
    """Parser for a document file made of ``key=value`` lines.

    Each field line ends with ``field_sep``; a line containing no ``=``
    terminates the current document.
    """

    doc_sep = "\x1e"
    field_sep = "\x1f"

    def __init__(self, file_path, schema_path):
        """Remember the data file path and load the schema JSON.

        ``schema_path`` is opened and parsed immediately; ``file_path`` is
        only read when ``parse()`` is called.
        """
        self.kv_sep = "="
        self.records = []
        self.file_path = os.path.realpath(file_path)
        with open(schema_path) as f:
            self.schema = HavenaskSchema(json.load(f))
            self.schema.parse()

    def to_sqls(self, table):
        """Return one insert statement per parsed record, in file order."""
        return [record.to_sql(table) for record in self.records]

    def parse(self):
        """Read ``self.file_path`` and append its documents to ``self.records``.

        Keys missing from the schema are reported on stdout and left out of
        ``fields`` and ``raw_doc`` (they still appear in ``raw_doc_dict``).
        A trailing document that is not followed by a separator line is not
        emitted.
        """
        with open(self.file_path) as f:
            fields = {}
            doc_type = None
            raw_doc = ""
            raw_doc_dict = {}
            for line in f:
                index = line.find(self.kv_sep)
                if index != -1:
                    key, value = line[:index], line[index + 1:]
                    # str.find returns -1 when the separator is absent, in
                    # which case this slice drops only the last character.
                    value = value[:value.find(HavenaskDataSet.field_sep)]
                    raw_doc_dict[key] = value
                    if key == "CMD":
                        doc_type = value
                    else:
                        if key not in self.schema.field_to_column_type:
                            print("{} is not in schema".format(key))
                            continue
                        # Kept separate from doc_type so that a field's
                        # column type never overwrites the document's CMD.
                        column_type = self.schema.field_to_column_type[key]
                        if column_type in ("TEXT", "STRING", "RAW"):
                            # Textual columns are stored as quoted literals.
                            value = "'" + value + "'"
                        fields[key] = value
                    raw_doc += key + "=" + value + HavenaskDataSet.field_sep + "\n"
                else:
                    # A line without "=" closes the current document.
                    record = HavenaskRecord(
                        doc_type, fields,
                        raw_doc=raw_doc + HavenaskDataSet.doc_sep + "\n",
                        raw_doc_dict=raw_doc_dict)
                    self.records.append(record)
                    fields = {}
                    doc_type = None
                    raw_doc = ""
                    raw_doc_dict = {}