in pipeline/data/hplt.py [0:0]
def _process_line(self, line_locale: str, line: str):
    """
    Filter one line and either hand it to the writer or merge it into the pending text.

    A line is dropped when its locale differs from ``self.hplt_locale`` or when it
    is longer than ``self.max_characters``. In both cases the matching filter
    counter on ``self.stats`` is incremented and
    ``self._maybe_write_accumulated_text()`` is called before returning.

    A line that passes both filters is counted in ``self.stats.visited_lines.kept``.
    With ``self.merge_lines`` disabled, it becomes the pending text on its own and
    the writer is called right away. With merging enabled, it is appended to
    ``self.accumulated_text`` (separated by a single space), after first calling
    the writer if the combined text would exceed ``self.max_characters``.

    NOTE(review): ``_maybe_write_accumulated_text`` is defined elsewhere; this
    method presumably relies on it to emit the pending text and to reset
    ``accumulated_text`` / ``cumulative_char_count`` - confirm against its
    definition.

    Args:
        line_locale: The locale reported for this line.
        line: The text of the line.
    """
    stats = self.stats

    # Wrong language: drop the line and hand whatever is pending to the writer.
    if line_locale != self.hplt_locale:
        stats.filtered_line_locale.value += 1
        self._maybe_write_accumulated_text()
        return

    # Overlong segment: dropped the same way, but tallied under its own counter.
    line_length = len(line)
    if line_length > self.max_characters:
        stats.filtered_too_long.value += 1
        self._maybe_write_accumulated_text()
        return

    # Every line that reaches this point is kept, whether or not merging is on.
    stats.visited_lines.kept += 1

    # Merging disabled: the line is the whole pending text, written immediately.
    if not self.merge_lines:
        self.accumulated_text = line
        self._maybe_write_accumulated_text()
        return

    # Merging enabled. The extra 1 accounts for the space that would join this
    # line to the pending text.
    joined_length = self.cumulative_char_count + line_length + 1
    if joined_length > self.max_characters:
        # Appending would overshoot the limit, so write out what is pending first.
        self._maybe_write_accumulated_text()

    # Read the pending text only after the possible write above.
    pending_text = self.accumulated_text
    if not pending_text:
        # Nothing pending: this line starts a new accumulated segment.
        self.accumulated_text = line
        self.cumulative_char_count += line_length
    else:
        # Append to the pending text; the count includes the joining space.
        self.accumulated_text = f"{pending_text} {line}"
        self.cumulative_char_count += line_length + 1