in E2E_TOD/clean_dataset.py [0:0]
def clean_text(text, mapping_pair_path):
    """Normalize a raw utterance into a lowercased, cleaned-up string.

    The steps, in order:

    1. Strip surrounding whitespace, lowercase, and normalize a few
       punctuation characters (curly quotes, ``;``, ``"``, ``/``).
    2. Pass the text through ``clean_time`` (defined elsewhere in this
       module; its exact behavior is not visible from here).
    3. Apply a table of regex substitutions that repair known bad spans
       (postcodes, phone numbers, typos) and expand some contractions.
    4. Put spaces around periods that sit between or after word characters.
    5. Apply the whole-token replacements listed in ``mapping_pair_path``.

    Args:
        text: The raw string to clean.
        mapping_pair_path: Path to a text file with one ``from<TAB>to`` pair
            per line. Blank lines are skipped.

    Returns:
        The cleaned string.

    Raises:
        ValueError: If a non-blank mapping line does not contain exactly one
            tab separator.
        OSError: If the mapping file cannot be opened.
    """
    text = text.strip()
    text = text.lower()
    text = text.replace(u"’", "'")
    text = text.replace(u"‘", "'")
    text = text.replace(';', ',')
    text = text.replace('"', ' ')
    text = text.replace('/', ' and ')
    text = text.replace("don't", "do n't")
    text = clean_time(text)

    # Keys are regular-expression patterns, values are their replacements.
    # They are applied one after another in the order written here, so
    # earlier rules can affect what later rules see.
    baddata = {
        r'c\.b (\d), (\d) ([a-z])\.([a-z])': r'cb\1\2\3\4',
        'c.b. 1 7 d.y': 'cb17dy',
        'c.b.1 7 d.y': 'cb17dy',
        'c.b 25, 9 a.q': 'cb259aq',
        'isc.b 25, 9 a.q': 'is cb259aq',
        'c.b2, 1 u.f': 'cb21uf',
        'c.b 1,2 q.a': 'cb12qa',
        '0-122-336-5664': '01223365664',
        'postcodecb21rs': 'postcode cb21rs',
        r'i\.d': 'id',
        # NOTE(review): this drops the surrounding spaces and therefore fuses
        # the neighbouring words ("my i d is" -> "myidis"); confirm intended.
        ' i d ': 'id',
        # The text is already lowercased at this point, so the pattern must
        # be lowercase as well or it can never match.
        'telephone:01223358966': 'telephone: 01223358966',
        'depature': 'departure',
        'depearting': 'departing',
        '-type': ' type',
        r"b[\s]?&[\s]?b": "bed and breakfast",
        "b and b": "bed and breakfast",
        r"guesthouse[s]?": "guest house",
        r"swimmingpool[s]?": "swimming pool",
        "wo n\'t": "will not",
        " \'d ": " would ",
        " \'m ": " am ",
        # These two previously had a stray trailing apostrophe inside the
        # pattern (" 're' ", " 'll' ") and so never matched a contraction.
        " \'re ": " are ",
        " \'ll ": " will ",
        " \'ve ": " have ",
        r'^\'': '',
        r'\'$': '',
    }
    for tmpl, good in baddata.items():
        text = re.sub(tmpl, good, text)

    text = re.sub(r'([a-zT]+)\.([a-z])', r'\1 . \2', text)  # 'abc.xyz' -> 'abc . xyz'
    text = re.sub(r'(\w+)\.\.? ', r'\1 . ', text)  # if 'abc. ' -> 'abc . '

    # Pad with one space on each side so that tokens at the very start or
    # end of the text can also match the space-delimited patterns below.
    # Every replacement begins and ends with a space, so the padding
    # survives the loop and is removed once at the end.
    text = ' ' + text + ' '
    with open(mapping_pair_path, 'r') as fin:
        for line in fin:
            line = line.replace('\n', '')
            if not line:
                # A blank line carries no mapping; skip it instead of
                # failing the tuple unpack below.
                continue
            fromx, tox = line.split('\t')
            text = text.replace(' ' + fromx + ' ', ' ' + tox + ' ')
    return text[1:-1]