def get_bartheleby_data()

in data_preparation/text_retrieval/bartleby.py [0:0]


def get_bartheleby_data(url):
    """Download a text from a Bartleby URL and return it as one string.

    A URL whose last dot-separated component is ``html`` is treated as a
    single page. Any other URL is treated as the index of a multi-page work:
    the title is read from ``url`` itself, then the pages ``1.html``,
    ``2.html``, ... located under it are fetched in order until one of them
    yields no text.

    Args:
        url: Address of a single ``.html`` page, or of a work's index whose
            last path component is an integer.

    Returns:
        For a single page, its title, a blank line, then its cleaned text.
        For a multi-page work, the work's title followed by every page
        (each formatted as a single page), separated by blank lines.

    Raises:
        RuntimeError: If a single page has no text ("Couldn't find the
            page"), if the index page has no title ("No title found"), or
            if the last path component of an index URL is not an integer
            ("Invalid url").
    """

    extension = url.split('.')[-1]
    isUniquePage = extension == 'html'

    def loadText(locUrl):
        # Fetch one page and return "title\n\ntext", or None when the parser
        # reports that no text was found on it.
        parser = BarthelebyParser()
        req = requests.get(locUrl)
        # str() on the raw bytes yields their repr ("b'...'" with escaped
        # sequences), not decoded text.
        # NOTE(review): presumably the parser's clean-up relies on that
        # form - confirm before switching to req.text.
        parser.feed(str(req.content))
        # Pause after every request (presumably to go easy on the server).
        time.sleep(1)
        if not parser.textFound:
            return None
        return parser.title + '\n' + '\n' + parser.getCleanText()

    if isUniquePage:
        text = loadText(url)
        if text is None:
            raise RuntimeError("Couldn't find the page")
        return text

    # Multi-page work: the title comes from the index page itself.
    titleParser = BarthelebyTitleParser()
    req = requests.get(url)
    titleParser.feed(str(req.content))

    if not titleParser.titleFound:
        raise RuntimeError("No title found")

    # Pieces are collected and joined once at the end instead of growing a
    # string with += on every page.
    pieces = [titleParser.title + '\n' + '\n']

    if url[-1] != '/':
        url += '/'
    data = url.split('/')

    # With the trailing slash in place, data[-1] is empty and data[-2] is
    # the last real path component, which must be an integer.
    try:
        int(data[-2])
    except ValueError as err:
        raise RuntimeError("Invalid url") from err

    # Pages are numbered from 1; the first page without text ends the work.
    index = 1
    while True:
        textData = loadText(f"{url}{index}.html")
        if textData is None:
            break
        pieces.append('\n\n' + textData)
        index += 1

    return ''.join(pieces)