in src/utils/str.py [0:0]
def split_to_sentence(s: str):
    """Split *s* into sentences with a regex heuristic.

    A boundary is a ``.``, ``?`` or ``!`` that is followed (after optional
    quotes, whitespace and other non-word characters) by an ASCII capital
    letter, unless the terminator looks like the end of an abbreviation.
    The characters between the terminator and the capital letter are
    consumed by the split and do not appear in the output.

    Known limitation (follows directly from the lookbehinds): a sentence
    that ends in a short capitalised word such as "He." or "Bob." is not
    treated as a boundary, because it is indistinguishable from "Mr." or
    "Mrs." for this pattern.

    Args:
        s: The text to split.

    Returns:
        A list of the non-empty results of ``white_space_fix`` applied to
        each sentence, in their original order.
    """
    # Sample of the tricky input this pattern targets:
    # "Mr. Smith bought cheapsite.com for 1.5 million dollars, i.e. he paid
    #  a lot for it. Did he mind? Adam Jones Jr. thinks he didn't. In any
    #  case, this isn't true... Well, with a probability of .9 it isn't."
    #
    # Raw strings keep the regex escapes (\w, \., \s, ...) from being read as
    # (invalid) string escapes; the pieces concatenate to a single pattern.
    boundary_pattern = (
        r'(?<!\w\.\w.)'            # not after "x.y." style text, e.g. "i.e."
        r'(?<![A-Z]\.)'            # not after a single capital + dot, e.g. "J."
        r'(?<![A-Z][a-z]\.)'       # not after Capital+lowercase+dot, e.g. "Mr."
        r'(?<! [a-z]\.)'           # not after space+lowercase+dot
        r'(?<![A-Z][a-z][a-z]\.)'  # not after Capital+2 lowercase+dot, e.g. "Mrs."
        r'(?<=\.|\?|\!)'           # must directly follow ".", "?" or "!"
        r'"*\s*\s*(?:\W*)'         # skip quotes / whitespace / non-word chars
        r'([A-Z])'                 # first letter of the next sentence (captured)
    )

    # Because the capital letter is a capturing group, re.split returns it as
    # its own element: [text, "X", rest, "Y", rest, ...].
    pieces = re.split(boundary_pattern, s)

    # Glue each lone captured capital back onto the text that follows it.
    merged: List[str] = []
    for piece in pieces:
        if merged and len(merged[-1]) == 1 and merged[-1].isupper():
            merged[-1] += piece
        else:
            merged.append(piece)

    # white_space_fix is defined elsewhere in this module (presumably it
    # normalises whitespace); drop anything that ends up empty.
    cleaned = [white_space_fix(sentence) for sentence in merged]
    return [sentence for sentence in cleaned if sentence]