def parse_key_file()

in modeling/coval/conll/util.py [0:0]


def _parse_columns(bracketed):
    """Yield ``(pos_tag, parse_bit)`` for every leaf of a bracketed parse.

    *bracketed* is the string form of a constituency tree, e.g.
    ``(ROOT (NP (DT The) (NN cat)))``.  It is split on whitespace; every
    token that does not start with ``(`` is treated as a word followed by
    its closing brackets.

    * ``pos_tag`` is the token directly before the word with ``(`` removed.
    * ``parse_bit`` is the concatenation of the opening-bracket tokens seen
      since the previous word (excluding the POS token), a ``*`` standing in
      for the word, and the word's closing brackets minus the first-to-last
      one that closes the POS node.

    Raises ``ValueError`` if a word token contains no ``)``.
    """
    tokens = bracketed.split()
    # Index of the first token not yet consumed by a previous word.
    next_open = 0
    for idx, tok in enumerate(tokens):
        if tok.startswith("("):
            continue
        first_close = tok.index(")")
        pos_tag = tokens[idx - 1].replace("(", "")
        # Phrase-level opening brackets that precede this word's POS token.
        # range() (not a slice) so that idx == 0 yields nothing.
        opening = ''.join(tokens[i] for i in range(next_open, idx - 1))
        # Drop the last character: it is the bracket closing the POS node.
        closing = tok[first_close:len(tok) - 1]
        next_open = idx + 1
        yield pos_tag, opening + "*" + closing


def parse_key_file(key_file):
    """Write a copy of a CoNLL key file with POS and parse columns inserted.

    Reads *key_file* line by line, collects the 4th whitespace-separated
    column of every non-comment line as the sentence tokens, and runs the
    Stanford parser (via NLTK) on each sentence when a blank line or a
    ``#end`` line is reached.  For each token the output line consists of
    the first four original columns, the POS tag, the parse bit, and the
    remaining original columns, all tab-separated.  ``#begin`` and ``#end``
    lines are copied through; other ``#`` lines are dropped.

    The result is written to ``key_file + ".parsed"``.  Nothing is returned.

    Any exception is re-raised after printing a hint about the CLASSPATH
    environment variable required by the Stanford parser.
    """
    try:
        from nltk.parse.stanford import StanfordParser
        parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz", java_options='-Xmx8G')
        print("Starting to parse key_file!")
        print("This might take a while...")
        # Context managers guarantee both files are flushed and closed,
        # including when parsing fails part-way through.
        with open(key_file + ".parsed", "w") as new_file, open(key_file) as f:
            tmp_sentence = [[]]
            tmp_conll_lines = []
            for line in f:
                if line.startswith("#begin"):
                    new_file.write(line)
                    continue

                is_end = line.startswith("#end")
                if not line.strip() or (is_end and tmp_conll_lines):
                    # Sentence boundary.  Only call the parser when there are
                    # tokens to parse; an empty sentence (e.g. consecutive
                    # blank lines) just produces the separator newline.
                    if tmp_conll_lines:
                        for tree in parser.parse_sents(tmp_sentence):
                            for tree_line in tree:  # tree_line is a Tree
                                columns = _parse_columns(str(tree_line))
                                for token_index, (pos_tag, parse_bit) in enumerate(columns):
                                    cols = tmp_conll_lines[token_index].split()
                                    new_file.write(
                                        '\t'.join(cols[:4]) + "\t" + pos_tag + "\t" + parse_bit
                                        + '\t' + '\t'.join(cols[4:]) + '\n'
                                    )
                    tmp_sentence[0] = []
                    tmp_conll_lines = []
                    new_file.write("\n")
                elif not line.startswith("#"):
                    tmp_sentence[0].append(line.split()[3])
                    tmp_conll_lines.append(line)

                if is_end:
                    new_file.write(line)

    except Exception:
        print("You need to set the CLASSPATH environment variable to point to the Stanford parser!")
        print("Example: export CLASSPATH=/path/to/stanford-parser-full-YYYY-MM-DD/stanford-parser.jar:/path/to/stanford-parser-full-YYYY-MM-DD/stanford-parser-X.X.X-models.jar")
        print("")
        raise