def extract_analytics_categories()

in pca-server/src/pca/pca-aws-sf-process-turn-by-turn.py [0:0]


    def extract_analytics_categories(self, categories):
        """
        Build the header-level "CategoriesDetected" block and tag the speech segments with the categories.

        Every name listed in categories["MatchedCategories"] gets one header entry holding its name, its
        instance count and the begin/end offsets (converted from milliseconds to seconds) of each of its
        points of interest.

        As a side effect, markers are injected into self.speechSegmentList so that a UI knows on which
        transcript line a category should be highlighted:
          - each category instance is appended to "segmentCategoriesDetectedPre" of the first segment (in
            list order) whose "segmentStartTime" is at or after the instance's begin offset
          - categories with no instances at all are treated as occurring at 0.0 seconds
          - anything that no segment claimed is appended to "segmentCategoriesDetectedPost" of the final
            segment
        If there are no speech segments then nothing can be tagged, so only the header is built.

        @param categories: "Categories" block from the Call Analytics results; must contain the keys
                           "MatchedCategories" and "MatchedDetails"
        @return: JSON structure for header-level "CategoriesDetected" block
        """

        # Begin-offset (seconds) -> names of the categories that start at that time
        timed_categories = {}
        categories_detected = []

        for matched_cat in categories["MatchedCategories"]:
            points_of_interest = categories["MatchedDetails"][matched_cat]["PointsOfInterest"]

            # Map across all of the instance timestamps (if any)
            timestamp_array = []
            for instance in points_of_interest:
                begin_secs = float(instance["BeginOffsetMillis"] / 1000)
                timestamp_array.append({
                    "BeginOffsetSecs": begin_secs,
                    "EndOffsetSecs": float(instance["EndOffsetMillis"] / 1000),
                })
                timed_categories.setdefault(begin_secs, []).append(matched_cat)

            # "Missing" categories have no timestamps, so record them against a time of 0.0 seconds
            if not points_of_interest:
                timed_categories.setdefault(0.0, []).append(matched_cat)

            # The instance count will be 0 for a "not found" type of category
            categories_detected.append({
                "Name": matched_cat,
                "Instances": len(points_of_interest),
                "Timestamps": timestamp_array,
            })

        # Only tag segments if there is something to tag and somewhere to put it; without the segment
        # check an empty segment list would fail on the "final segment" lookup below
        if timed_categories and self.speechSegmentList:
            for segment in self.speechSegmentList:
                # Iterate over a snapshot of the keys, as claimed entries are removed while we scan
                for cat_time in list(timed_categories):
                    if cat_time <= segment.segmentStartTime:
                        segment.segmentCategoriesDetectedPre += timed_categories.pop(cat_time)

                # Nothing left to place, so the remaining segments cannot change
                if not timed_categories:
                    break

            # Any categories that start after the last segment began are tagged to the final segment
            for leftover_categories in timed_categories.values():
                self.speechSegmentList[-1].segmentCategoriesDetectedPost += leftover_categories

        # Return the header structure for detected categories
        return categories_detected