in data_preparation/text_retrieval/bartleby.py [0:0]
def get_bartheleby_data(url):
    """Download the text found at ``url`` and return it as a single string.

    Two kinds of url are handled, chosen from whatever follows the last
    ``.`` in ``url``:

    * ``html``: the url is treated as a single page. The page is fed to
      ``BarthelebyParser`` and the result is the parsed title, two
      newlines, then the parser's cleaned text.
    * anything else: the url is treated as the root of a numbered series
      of pages. The title is read from ``url`` itself with
      ``BarthelebyTitleParser``; then ``{url}1.html``, ``{url}2.html``,
      ... are fetched in order until one of them yields no text.

    Args:
        url: address of a single ``.html`` page, or of a root whose last
            path component is an integer (a trailing ``/`` is optional).

    Returns:
        The title followed by the text of the page(s). In the multi-page
        case every page is preceded by two newlines.

    Raises:
        RuntimeError: ``"No title found"`` when the root page has no
            title, ``"Invalid url"`` when the last path component of a
            root url is not an integer, and ``"Couldn't find the page"``
            when a single page yields no text.
    """

    def fetchMarkup(pageUrl):
        # Use the public ``content`` property rather than the private
        # ``_content`` attribute of the response.
        # NOTE(review): str() on bytes gives the ``b'...'`` repr with
        # escaped characters; presumably the parsers' clean-up expects
        # exactly this form -- confirm before switching to ``.text``.
        return str(requests.get(pageUrl).content)

    def loadText(locUrl):
        # Returns None when the parser reports no text for this page; the
        # caller uses that to detect the end of a multi-page document.
        parser = BarthelebyParser()
        parser.feed(fetchMarkup(locUrl))
        # Pause between successive page requests.
        time.sleep(1)
        if not parser.textFound:
            return None
        return parser.title + '\n\n' + parser.getCleanText()

    isUniquePage = url.split('.')[-1] == 'html'
    if isUniquePage:
        text = loadText(url)
        if text is None:
            raise RuntimeError("Couldn't find the page")
        return text

    # Multi-page document: the title lives on the root page.
    titleParser = BarthelebyTitleParser()
    titleParser.feed(fetchMarkup(url))
    if not titleParser.titleFound:
        raise RuntimeError("No title found")

    if url[-1] != '/':
        url += '/'
    # With the trailing slash in place, the last real path component is
    # the second-to-last item of the split; it must be an integer.
    try:
        int(url.split('/')[-2])
    except ValueError as err:
        raise RuntimeError("Invalid url") from err

    # Collect the pieces and join once instead of growing a string with
    # ``+=`` on every page.
    pieces = [titleParser.title + '\n\n']
    index = 1
    while True:
        textData = loadText(f"{url}{index}.html")
        if textData is None:
            # First page number without text marks the end of the document.
            break
        pieces.append('\n\n' + textData)
        index += 1
    return ''.join(pieces)