in preprocess/extract_wiki_data.py [0:0]
def extract(self, uuid, page_id, title, page_content) -> Iterator[Dict]:
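    """Extract data tables and their preceding textual context from one wiki page.

    Yields one dict per table that survives the filters below, combining the parsed
    table fields with the cleaned paragraphs that appear immediately before it.
    """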
    page_context = html.unescape(page_content)
    wiki_page = wtp.parse(page_context)
    for table in wiki_page.tables:
        # print(f'[WikiTitle]{title}')
        # because calling the Java library is costly, use wtp first to check whether the
        # table is valid and worth being parsed by the Java library
        try:
            table_data = table.data()
        except Exception:
            continue
        if len(table_data) == 0:
            continue
        col_num = len(table_data[0])
        row_num = len(table_data)
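        # heuristic filter: only keep tables with at least 3 columns and 4 rows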
        if col_num < 3 or row_num < 4:
            continue
        # caption = table.caption
        # caption_from_first_row = False
        # if not caption:
        #     first_cell = table.cells(0, 0)
        #     if str(first_cell.get_attr('colspan')) == str(col_num):
        #         caption = wiki2text(str(first_cell))
        #         caption_from_first_row = True
        #
        # if caption:
        #     caption = wiki2text(caption)
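        # everything on the page before the table's span is treated as candidate context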
        tab_span = table.span
        context_end = tab_span[0]
        context = wiki_page.string[:context_end]
        cleaned_ctx = wiki2text(context)
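        # strip residual markup; tag_regex_list and html_tag_re are defined elsewhere in
        # this module (presumably compiled patterns for wiki/ref tags and generic HTML tags)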
        for regex in tag_regex_list:
            cleaned_ctx = regex.sub('', cleaned_ctx)
        cleaned_ctx = html_tag_re.sub('', cleaned_ctx)
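        # keep only the last three non-empty lines immediately preceding the table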
        cleaned_ctx = [
            x
            for x in cleaned_ctx.strip().split('\n')
            if x
        ][-3:]
        if __DEBUG__:
            print('*** Total Text ***')
            for text in cleaned_ctx:
                print(text)
        cleaned_text = []
        if __DEBUG__:
            print('*** Sentence Cleaning ***')
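        # normalize each candidate line and drop lines dominated by non-alphabetic tokens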
        for text in cleaned_ctx:
            if __DEBUG__:
                print('Text: ', text)
            text = text.strip('=')
            text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf8').replace('()', '')
            text = re.sub(r'\s+', ' ', text).strip()
            text_tokens = text.replace(', ', ' , ').split(' ')
            character_word_num = sum(1 for token in text_tokens if token.isalpha())
            non_character_word_num = len(text_tokens) - character_word_num
            if character_word_num < non_character_word_num:
                if __DEBUG__:
                    print('Removing Tokens', text_tokens)
                continue
            if text:
                cleaned_text.append(text)
                if __DEBUG__:
                    print('Cleaned: ', text)
        if __DEBUG__:
            print('**** Cleaned Text ****')
            print(cleaned_text)
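        # if any line still contains '|' or '{', wiki table/template markup survived the
        # cleaning above; skip such contexts entirely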
        if any('|' in x for x in cleaned_text) or any('{' in x for x in cleaned_text):
            continue
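        # split each surviving paragraph into sentences with the self.nlp pipeline (e.g. spaCy)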
        parsed_context = []
        for paragraph in cleaned_text:
            paragraph_sents = []
            parsed_paragraph = self.nlp(paragraph)
            # if log:
            #     print('Paragraph: ', parsed_paragraph)
            for sent in parsed_paragraph.sents:
                paragraph_sents.append(sent.text)
                # if log:
                #     print('Sent: ', sent)
            parsed_context.append(paragraph_sents)
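        # render the wikitext table to HTML with the (Java) MediaWiki converter, then
        # re-parse the HTML into a structured table object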
        table_html = self.mediaWikiToHtml.convert(str(table))
        table = self.extract_table_from_html(table_html)
        # skip tables that have neither preceding context nor a caption
        if table and not parsed_context and not table.caption:
            continue
        if table:
            table = table.to_dict()
            example = {
                'uuid': f'wiki-{page_id}-{"_".join(title.split())}-{uuid}',
                'context_before': parsed_context,
            }
            example.update(table)
            yield example
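

# Minimal usage sketch: how extract() might be driven over a dump of pages. The class
# name WikiTableExtractor, the file paths, and the jsonl field names ('id', 'title',
# 'text') are illustrative assumptions, not names taken from this repository.
if __name__ == '__main__':
    import json

    extractor = WikiTableExtractor()  # hypothetical class owning nlp, mediaWikiToHtml, etc.
    with open('wiki_pages.jsonl') as src, open('wiki_tables.jsonl', 'w') as dst:
        for idx, line in enumerate(src):
            page = json.loads(line)
            for example in extractor.extract(idx, page['id'], page['title'], page['text']):
                dst.write(json.dumps(example) + '\n')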