in services/skills.py [0:0]
def extract_skills(self, text: str):
    """Extract skills from unstructured text.

    Runs ``self.nlp`` over ``text`` and collects every entity whose label
    has the form ``"SKILL|<skill_id>"``. Entities with any other label, or
    with an empty skill id, are ignored.

    Args:
        text: The unstructured text to search for skills.

    Returns:
        A ``defaultdict`` keyed by skill id. Each value is itself a
        ``defaultdict(list)`` holding:

        * ``"matches"``: one dict per matched entity with ``"start"``,
          ``"end"`` (character offsets), ``"label"`` and ``"text"``.
        * ``"displayName"``, ``"shortDescription"``, ``"longDescription"``:
          copied from the source chosen as the main one for the skill.
        * ``"sources"``: a list of ``{"name", "url"}`` dicts, one per
          source recorded for the skill in ``self.skills``.
    """
    # Some sources have better skill descriptions than others, so a source
    # with one of these names is preferred over whatever is listed first.
    # NOTE(review): the first *listed* source carrying any of these names
    # wins; the names are not ranked against each other. Confirm whether a
    # strict Github > Microsoft > Stackshare priority was intended.
    preferred_sources = (
        "Github Topics",
        "Microsoft Academic Topics",
        "Stackshare Skills",
    )
    metadata_keys = ("displayName", "shortDescription", "longDescription")

    doc = self.nlp(text)
    found_skills = defaultdict(lambda: defaultdict(list))

    for ent in doc.ents:
        # Split on the first "|" only, so a skill id that itself contains
        # a "|" no longer breaks the two-name unpacking.
        ent_label, separator, skill_id = ent.label_.partition("|")
        if not separator or ent_label != "SKILL" or not skill_id:
            continue

        skill = found_skills[skill_id]
        skill["matches"].append(
            {
                "start": ent.start_char,
                "end": ent.end_char,
                "label": ent_label,
                "text": ent.text,
            }
        )

        try:
            sources = self.skills[skill_id]["sources"]
            main_source = next(
                (s for s in sources if s["sourceName"] in preferred_sources),
                # No preferred source: fall back to the first listed one,
                # or to None when the skill has no sources at all.
                sources[0] if sources else None,
            )
        except KeyError:
            # This happens when a pattern defined in
            # data/extra_skill_patterns.jsonl is matched. The skill is not
            # added to data/skills.json so there's no extra metadata about
            # the skill from an established source.
            sources = []
            main_source = None

        if main_source is None:
            # No usable metadata (unknown skill or empty source list):
            # describe the skill with the matched text only.
            main_source = {
                "displayName": ent.text,
                "shortDescription": "",
                "longDescription": "",
            }

        for key in metadata_keys:
            skill[key] = main_source[key]
        skill["sources"] = [
            {"name": s["sourceName"], "url": s["url"]} for s in sources
        ]

    return found_skills