in generate_documents.py [0:0]
def _extract_labelled_text(root, code_class, target_topics, sentence_delim):
    """Return ``(topic, text)`` for one parsed document.

    Args:
        root: Root element of the parsed XML document.
        code_class: Value of the ``class`` attribute selecting the
            ``<codes>`` group whose ``<code>`` children are inspected.
        target_topics: Collection of ``code`` attribute values accepted
            as class labels.
        sentence_delim: String used to join the paragraph texts.

    Returns:
        A tuple of the single matching topic code and the text of all
        ``<p>`` elements joined with ``sentence_delim``.

    Raises:
        ValueError: If the document does not carry exactly one code from
            ``target_topics``.
    """
    xpath = ".//codes[@class='{}']/code".format(code_class)
    topics = [
        code.get('code') for code in root.findall(xpath)
        if code.get('code') in target_topics
    ]
    # Explicit check instead of ``assert``: asserts vanish under ``-O`` and
    # the condition also rejects documents with no matching label at all.
    if len(topics) != 1:
        raise ValueError(
            'Expected exactly one class label, found {}.'.format(len(topics))
        )
    # ``p.text`` is None for an empty paragraph (or one that starts with a
    # child element); skip those so ``join`` does not raise.
    paragraphs = [p.text for p in root.findall('.//p') if p.text]
    return topics[0], sentence_delim.join(paragraphs)


def main():
    """Write one ``<topic>\\t<text>`` line per usable indexed document.

    Reads indices of the form ``<sub_corpus>-<file_name>`` from
    ``--indices-file``, parses ``<rcv_dir>/<sub_corpus>/<file_name>.xml``
    for each, and appends a line to ``--output-filename`` when the
    document carries exactly one of the ``CCAT``/``ECAT``/``GCAT``/``MCAT``
    codes in the ``bip:topics:1.0`` group. Documents that cannot be read,
    parsed or labelled are logged and skipped.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--rcv-dir',
        dest='rcv_dir',
        required=True,
        help='Directory of rcv1/rcv2 corpus with sub-directory structure '
             'indicated by indices, e.g. index FDCH5-39373 corresponds to '
             'document <rcv_dir>/FDCH5/39373.xml.',
    )
    parser.add_argument(
        '--output-filename',
        dest='output_filename',
        required=True,
        help='Path to store documents being indexed',
    )
    parser.add_argument(
        '--indices-file',
        dest='indices_file',
        required=True,
        help='Path to indices file.',
    )
    args = parser.parse_args()

    delim_str = '\t'
    sentence_delim = ' '
    code_class = 'bip:topics:1.0'
    labels = ['C', 'E', 'G', 'M']
    target_topics = frozenset('{}CAT'.format(label) for label in labels)

    # The output is opened as UTF-8 text so the document string is written
    # as-is; encoding it to bytes first would embed a ``b'...'`` repr.
    with open(args.indices_file, 'r', encoding='utf-8') as indices_f, \
            open(args.output_filename, 'w', encoding='utf-8') as output_f:
        for line in indices_f:
            index = line.strip()
            if not index:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            sub_corpus, dash, file_name = index.partition('-')
            if not (dash and sub_corpus and file_name):
                logging.error('Malformed index entry: %r.', index)
                continue
            doc_path = os.path.join(
                args.rcv_dir, sub_corpus, '{}.xml'.format(file_name)
            )
            try:
                # Parsing from the path lets the parser honour the encoding
                # declared in the XML prolog and closes the file for us.
                root = ET.parse(doc_path).getroot()
                topic, doc = _extract_labelled_text(
                    root, code_class, target_topics, sentence_delim
                )
            except Exception as err:
                # Per-document best-effort boundary: one bad document must
                # not abort the whole export, but the cause is reported.
                logging.error(
                    'Failed to parse xml file: %s. Reason: %s', doc_path, err
                )
                continue
            output_f.write('{}{}{}\n'.format(topic, delim_str, doc))