def extract_skills()

in services/skills.py [0:0]


    def extract_skills(self, text: str):
        """Extract the skills mentioned in a piece of unstructured text.

        The text is run through ``self.nlp`` and every resulting entity whose
        label has the form ``"SKILL|<skill_id>"`` is recorded. Descriptive
        metadata for each skill is looked up in ``self.skills``; skills that
        are missing there get a placeholder built from the matched text.

        Args:
            text: Free-form text to scan for skills.

        Returns:
            A ``defaultdict`` keyed by skill id. Each value holds:

            * ``"matches"``: one dict per matched entity with its ``start``
              and ``end`` character offsets, its ``label`` and its ``text``.
            * ``"displayName"``, ``"shortDescription"``,
              ``"longDescription"``: copied from the preferred source (or
              from the placeholder when no source is available).
            * ``"sources"``: a list of ``{"name": ..., "url": ...}`` dicts,
              one per source known for the skill.

            Because the result is a ``defaultdict``, indexing it with an
            unknown skill id creates an empty entry; use ``in`` or ``.get()``
            to probe without mutating it.
        """
        doc = self.nlp(text)
        found_skills = defaultdict(lambda: defaultdict(list))

        # Some sources have better skill descriptions than others. They are
        # listed here from most to least preferred; the first one available
        # for a skill supplies its display name and descriptions.
        preferred_sources = (
            "Github Topics",
            "Microsoft Academic Topics",
            "Stackshare Skills",
        )

        for ent in doc.ents:
            # partition() never raises, even when the id itself contains "|";
            # everything after the first separator is treated as the skill id.
            ent_label, separator, skill_id = ent.label_.partition("|")
            if not separator or ent_label != "SKILL" or not skill_id:
                continue

            skill_entry = found_skills[skill_id]
            skill_entry["matches"].append(
                {
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "label": ent_label,
                    "text": ent.text,
                }
            )

            try:
                sources = self.skills[skill_id]["sources"]
                # Keep the first source seen under each name so duplicates
                # resolve the same way list order would.
                first_by_name = {}
                for source in sources:
                    first_by_name.setdefault(source["sourceName"], source)
            except KeyError:
                # This happens when a pattern defined in
                # data/extra_skill_patterns.jsonl is matched. The skill is not
                # in data/skills.json, so there is no extra metadata about it
                # from an established source.
                sources = []
                first_by_name = {}

            if sources:
                # Walk the ranking; fall back to the first listed source when
                # none of the preferred ones is present.
                main_source = next(
                    (
                        first_by_name[name]
                        for name in preferred_sources
                        if name in first_by_name
                    ),
                    sources[0],
                )
            else:
                # No usable source (unknown skill or empty source list):
                # describe the skill with the text that was matched.
                main_source = {
                    "displayName": ent.text,
                    "shortDescription": "",
                    "longDescription": "",
                }

            for key in ("displayName", "shortDescription", "longDescription"):
                skill_entry[key] = main_source[key]
            skill_entry["sources"] = [
                {"name": s["sourceName"], "url": s["url"]} for s in sources
            ]

        return found_skills