in inference/etl.py [0:0]
def _clean_text(text, url_token):
    """Normalize one string: flatten newlines, mask URLs, keep letters only."""
    text = text.replace("\n", " ")
    # URL masking runs before lower-casing, so only lower-case schemes
    # ("http://", "https://") are matched -- same as the original behavior.
    text = re.sub(r"http://\S+|https://\S+", url_token, text).lower()
    # Drop everything except ASCII letters, apostrophes and spaces.
    return re.sub(r"[^A-Za-z' ]+", "", text)


def cleanup(ip, column=None):
    """Clean raw text for the model, at training or inference time.

    Each string has its newlines replaced by spaces, http(s) URLs replaced
    by a placeholder token, is lower-cased, and is stripped of every
    character other than ASCII letters, apostrophes and spaces.

    Args:
        ip: A list of strings (inference) or a DataFrame (training).
        column: Label of the text column to clean when ``ip`` is a
            DataFrame. Leave as ``None`` to treat ``ip`` as a list of
            strings.

    Returns:
        For a DataFrame, the same object with ``column`` overwritten in
        place; for a list, a new list of cleaned strings.
    """
    # `is not None` rather than truthiness so falsy labels such as 0 work.
    if column is not None:
        print("training ETL")
        # NOTE(review): training masks URLs as 'urls' while inference uses
        # 'url'. Kept as-is to avoid changing either path's output; confirm
        # which token the trained model expects and unify.
        ip[column] = ip[column].map(lambda text: _clean_text(text, "urls"))
    else:
        print("inference ETL")
        ip = [_clean_text(text, "url") for text in ip]
    return ip