in table_bert/dataset.py [0:0]
from typing import Dict, Optional

# `Column`, `Example`, and `BertTokenizer` are imported elsewhere in this module.

@classmethod
def from_dict(cls, entry: Dict, tokenizer: Optional[BertTokenizer], suffix) -> 'Example':
    """Deserialize one preprocessed table record into an `Example`; `suffix` is unused here."""
    def _get_data_source():
        # Wiki tables carry 'wiki' in their uuid; everything else comes from Common Crawl.
        return 'wiki' if 'wiki' in entry['uuid'] else 'common_crawl'

    source = _get_data_source()
    # The two corpora nest the table schema differently.
    header_entry = entry['header'] if source == 'wiki' else entry['table']['header']
    header = []
    column_data = []
    # Build one `Column` per schema entry, tokenizing the column name when a tokenizer is given.
    for col in header_entry:
        sample_value = col['sample_value']['value']
        name_tokens = tokenizer.tokenize(col['name']) if tokenizer else None
        column = Column(col['name'],
                        col['type'],
                        sample_value,
                        name_tokens=name_tokens)
        header.append(column)
    # Transpose row-major cell data into per-column value lists.
    if source == 'wiki':
        # Wiki rows hold (tag, cell_value) pairs; row 0 repeats the header, so skip it.
        for row in entry['data'][1:]:
            for col_id, (tag, cell_val) in enumerate(row):
                if col_id >= len(column_data):
                    column_data.append([])
                column_data[col_id].append(cell_val)
    else:
        # Common Crawl rows hold plain cell values.
        for row in entry['table']['data']:
            for col_id, cell_val in enumerate(row):
                if col_id >= len(column_data):
                    column_data.append([])
                column_data[col_id].append(cell_val)
    # Collect surrounding text, tokenizing each sentence when a tokenizer is given.
    context_before = []
    context_after = []

    if source == 'wiki':
        # Wiki context is a list of paragraphs, each a list of sentences.
        for para in entry['context_before']:
            for sent in para:
                if tokenizer:
                    sent = tokenizer.tokenize(sent)
                context_before.append(sent)

        # Append the table caption, when present, to the preceding context.
        caption = entry['caption']
        if caption:
            if tokenizer:
                caption = tokenizer.tokenize(caption)
            context_before.append(caption)
    else:
        # Common Crawl context is a flat list of sentences on either side of the table.
        for sent in entry['context_before']:
            if tokenizer:
                sent = tokenizer.tokenize(sent)
            context_before.append(sent)

        for sent in entry['context_after']:
            if tokenizer:
                sent = tokenizer.tokenize(sent)
            context_after.append(sent)
    uuid = entry['uuid']

    return cls(uuid, header,
               [context_before, context_after],
               column_data=column_data,
               source=source)
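
For orientation, a minimal usage sketch follows. The layout of `wiki_entry` is inferred from the keys the method reads on the 'wiki' path (`uuid`, `header`, `data`, `context_before`, `caption`); the concrete values, and passing `suffix=None`, are illustrative assumptions, not taken from the corpus.

# Minimal usage sketch. The entry layout is inferred from the keys read above
# for the 'wiki' source; the values are made up for illustration.
wiki_entry = {
    'uuid': 'wiki_0001',  # containing 'wiki' routes the entry down the wiki branch
    'header': [
        {'name': 'Country', 'type': 'text', 'sample_value': {'value': 'Norway'}},
        {'name': 'Population', 'type': 'real', 'sample_value': {'value': '5367580'}},
    ],
    'data': [  # row 0 repeats the header and is skipped; cells are (tag, value) pairs
        [('th', 'Country'), ('th', 'Population')],
        [('td', 'Norway'), ('td', '5367580')],
        [('td', 'Sweden'), ('td', '10230000')],
    ],
    'context_before': [['The table lists countries by population.']],  # paragraphs of sentences
    'caption': 'Countries by population',
}

# Passing tokenizer=None keeps column names and sentences as raw strings;
# suffix=None is an assumption, since the argument is unused in this method.
example = Example.from_dict(wiki_entry, tokenizer=None, suffix=None)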