in data_parsing.py [0:0]
def _strip_quote_flag(param, flags):
    """Return *param* without a leading quote marker (matched ignoring case).

    Only the prefix is removed; any later occurrence of a marker inside the
    text (e.g. the ``text=`` in ``context=``) is left untouched.
    """
    for flag in flags:
        if param[:len(flag)].lower() == flag:
            return param[len(flag):]
    return param


def process_quotation(line):
    """Pull a quotation and its accompanying tags out of one markup line.

    The line is examined in this order, and the first matching form wins:

    1. ``<tags> || QUOTE=<quote>`` -- text before the first marker becomes the
       tags (a string); the remaining pieces are joined with ``'/ '``.  If the
       quote contains a ``{{...}}`` template, only its last ``|`` field is
       kept, with any ``passage=`` / ``text=`` removed.
    2. ``<ref>...</ref>`` -- the content of the first ref becomes the tags
       (a string); the line with all refs removed becomes the quote.
    3. ``{{...}}`` -- the line is split on ``|`` into stripped fields.  The
       first field starting with ``passage=`` / ``text=`` (any letter case)
       is the quote; otherwise the last field that contains no ``=`` is used.
       The tags are the remaining fields (a list).
    4. Anything else -- the first ``"..."`` span (after turning ``''`` pairs
       into ``"``) is the quote and the rest of the line the tags (a string);
       with no quoted span the whole line is the quote and the tags are ``[]``.

    Args:
        line: One line of raw markup text.

    Returns:
        ``(quote, tags)`` when the quote is longer than ``CHAR_THRESHOLD``
        characters (a constant expected to exist at module level) and contains
        a space; ``(-1, -1)`` otherwise, or when the line mentions
        ``seemorecites`` or no quote can be located in a template.
    """
    quote_flags = ('passage=', 'text=')

    # Drop list-style markers: any run of '#' followed by '*' and an optional
    # ':'.  Note this removes every such occurrence, not only a leading one.
    line = re.sub(r'#*?\*:?', '', line)

    if 'seemorecites' in line.lower():
        return -1, -1

    if '|| QUOTE=' in line:
        pieces = line.split('|| QUOTE=')
        q_tags = pieces[0]
        # With exactly one marker this is just pieces[1]; with several, the
        # later pieces are glued together with '/ '.
        q = '/ '.join(pieces[1:])
        if re.search(r'{{.*?}}', q):
            q = re.sub(r'{{|}}', '', q)
            q = q.split('|')[-1]
            # Left as-is on purpose: fields are not whitespace-stripped here,
            # so the markers are removed wherever they occur in the field.
            q = re.sub(r'passage=|text=', '', q)
    elif re.search(r'<ref>.*?</ref>', line):
        q_tags = re.search(r'<ref>(.*?)</ref>', line).group(1)
        q = re.sub(r'<ref>.*?</ref>', '', line)
    elif re.search(r'{{.*?}}', line):
        fields = [f.strip() for f in re.sub(r'{{|}}', '', line).strip().split('|')]
        flagged = [f for f in fields if f.lower().startswith(quote_flags)]
        if flagged:
            # Detection above ignores case, so stripping must too; and only
            # the prefix goes, so the passage text itself is never altered.
            q = _strip_quote_flag(flagged[0], quote_flags)
            q_tags = [f for f in fields if not f.lower().startswith(quote_flags)]
        else:
            # No explicit quote parameter: fall back to the last positional
            # (non key=value) field.  hopefully this is okay
            positional = [f for f in fields if not re.match(r'^.*?=', f)]
            if not positional:
                return -1, -1
            q = positional[-1]
            q_tags = [f for f in fields if f != q]
    else:
        # assuming the quote is here in quotes
        # cleaning double quotes to parse examples: a pair of apostrophes
        # (but not a longer run) becomes a double-quote character
        line = re.sub(r'(?<!\')\'{2}(?!\')', '"', line)
        quoted = re.search(r'".*?"', line)
        if quoted:
            q = re.sub(r'"', '', quoted.group(0))
            q_tags = re.sub(r'".*?"', '', line)
        else:
            q = line
            q_tags = []

    # Keep only quotes that are long enough and contain more than one word.
    if len(q) > CHAR_THRESHOLD and ' ' in q:
        return q, q_tags
    return -1, -1