in rally-custom/custom_tracks/elasticsearch/so_vector/_tools/parse_embed.py [0:0]
def startElement(self, name, attrs):
if name == "row":
postType = int(attrs["PostTypeId"])
# only keep questions, dropping the answer posts
if postType == 1:
record = {}
# In some questions e.g. 10030718 the ownerID is missing and we have OwnerDisplayName instead
ownerDisplayName = ""
ownerId = ""
user = ""
if "OwnerUserId" in attrs:
ownerId = attrs["OwnerUserId"]
record["user"] = ownerId
elif "OwnerDisplayName" in attrs:
ownerDisplayName = attrs["OwnerDisplayName"]
record["user"] = ownerDisplayName
tags = []
if "Tags" in attrs:
tags = re.split("[<>]+", attrs["Tags"])
record["tags"] = [x for x in tags if len(x) > 0]
record["type"] = "question"
record["questionId"] = attrs["Id"]
if "CreationDate" in attrs:
record["creationDate"] = attrs["CreationDate"]
if "Title" in attrs:
record["title"] = attrs["Title"].replace("\n", " ").replace("\r", " ")
record["titleVector"] = embedding_model.encode(record["title"], normalize_embeddings=True).tolist()
if "AcceptedAnswerId" in attrs:
record["acceptedAnswerId"] = attrs["AcceptedAnswerId"]
if "Body" in attrs:
soup = BeautifulSoup(attrs["Body"], "html.parser")
body = soup.get_text().replace("\n", " ").replace("\r", "")
body = re.sub("\s+", " ", body)
record["body"] = body
myjsonfile.write(json.dumps(record, separators=(",", ":")))
myjsonfile.write("\n")