def document_handler(fname)

in code/lab8/getpagemetadata.py [0:0]


def document_handler(fname):
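    """Detect entities in the text file fname with Amazon Comprehend and print
    a Kendra metadata attributes JSON object to stdout. Relies on the
    module-level client, categories, compre_text_size, min_score and elimit
    (see the setup sketch after this function)."""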
    with open(fname, 'r') as f:
        doc_text = f.read()
    #Dict mapping each entity type to the list of entity text strings kept for it
    entity_data = dict()
    #Dict mapping each entity type to the upper-cased text strings already seen
    category_text = dict()
    #Dict mapping each entity type to the frequency of each text string
    text_frequency = dict()
    for et in categories:
        entity_data[ et ] = []
        category_text[ et ] = []
        text_frequency[ et ] = dict()
    #Make the detect_entities call in a loop to stay within the API text size limit
    #This splitting approach is likely a little too naive for a production application, as it can split an entity across chunk boundaries
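    #e.g. if compre_text_size were 20000 (an assumed value), a 45,000-character document
    #would be processed as the slices [0:20000], [20000:40000] and [40000:45000]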
    for i in range(0, len(doc_text), compre_text_size):
        try:
            entities = client.detect_entities(Text=doc_text[i:i+compre_text_size], LanguageCode='en')
        except Exception as e:
            print("Exiting - detect_entities terminated with exception", e, file=sys.stderr)
            sys.exit(1)
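        #detect_entities returns a dict like
        #{"Entities": [{"Score": 0.99, "Type": "PERSON", "Text": "...", "BeginOffset": 0, "EndOffset": 3}, ...]};
        #only Score, Type and Text are used below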
        for ent in entities["Entities"]:
            #For each recognized entity keep only those that have a confidence score higher than min_score,
            #are printable, don't contain quotes and are previously unseen
            if ((ent["Score"] > min_score) and ent["Text"].isprintable()
                    and ('"' not in ent["Text"])
                    and (ent["Text"].upper() not in category_text[ent["Type"]])):
                #Append the text to entity_data to be used for a Kendra custom attribute
                entity_data[ent["Type"]].append(ent["Text"])
                #Track upper-cased text so that the same text in different cases is treated as one entity
                category_text[ent["Type"]].append(ent["Text"].upper())
                #Track the frequency of the text so that the most frequent strings can be kept later
                text_frequency[ent["Type"]][ent["Text"].upper()] = 1
            elif ent["Text"].upper() in category_text[ent["Type"]]:
                #Count repeat occurrences of previously seen text
                text_frequency[ent["Type"]][ent["Text"].upper()] += 1
    #The Kendra attribute metadata JSON object to be populated
    attributes = dict()
    metadata = dict()
    for et in categories:
        metadata[et] = []
        #Take at most elimit recognized text strings, keeping those with the highest frequency of occurrence
        el = [pair[0] for pair in sorted(text_frequency[et].items(), key=lambda item: item[1], reverse=True)][0:elimit]
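        #e.g. with elimit = 2 (an assumed value) and counts {"SEATTLE": 3, "BOSTON": 2, "DENVER": 1},
        #el becomes ["SEATTLE", "BOSTON"]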
        for d in entity_data[et]:
            if (d.upper() in el):
                metadata[et].append(d)
    #Use the input filename to determine the Wikipedia page URL
    npfile = fname.split('.')[0]
    npname = npfile.split('/')[-1]
    npreplace = npname.replace("#", "/")
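    #e.g. a hypothetical input file data/AS#400.txt yields npname AS#400 and the URL
    #https://en.wikipedia.org/wiki/AS/400, with '#' in the filename standing in for '/' in the page name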
    metadata["_source_uri"] = "https://en.wikipedia.org/wiki/" + npreplace
    attributes["Attributes"] = metadata
    print(json.dumps(attributes, sort_keys=True, indent=4))
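
A minimal sketch of the module-level setup this function assumes. The names client, categories, compre_text_size, min_score and elimit come from the function above; the specific values, the entity-type list and the command-line invocation are assumptions, not the file's confirmed contents:

import sys
import json
import boto3

#Amazon Comprehend client used by document_handler
client = boto3.client('comprehend')
#Comprehend entity types to keep as Kendra custom attributes
categories = ["PERSON", "LOCATION", "ORGANIZATION", "COMMERCIAL_ITEM", "EVENT", "DATE", "QUANTITY", "TITLE", "OTHER"]
#Characters per detect_entities call; must stay within the API text size limit (assumed value)
compre_text_size = 20000
#Minimum confidence score for an entity to be kept (assumed value)
min_score = 0.97
#Maximum number of text strings to keep per entity type (assumed value)
elimit = 10

if __name__ == '__main__':
    document_handler(sys.argv[1])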