# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'spec_helper'

describe TwitterCldr::Segmentation::BreakIterator do
  describe "#each_sentence" do
    let(:iterator) { described_class.new(:en, use_uli_exceptions: true) }

    it "should return an enumerator if called without a block" do
      expect(iterator.each_sentence("foo bar")).to be_a(Enumerator)
    end

    it "splits a simple string into sentences" do
      str = "The. Quick. Brown. Fox."
      expect(iterator.each_sentence(str).map { |word, _, _| word }).to eq([
        "The. ", "Quick. ", "Brown. ", "Fox."
      ])
    end

    it "does not split on commas, for example" do
      str = "The. Quick, brown. Fox."
      expect(iterator.each_sentence(str).map { |word, _, _| word }).to eq([
        "The. ", "Quick, brown. ", "Fox."
      ])
    end

    it "does not split periods in the midst of other letters, eg. in a URL" do
      str = "Visit us. Go to https://translate.twitter.com."
      expect(iterator.each_sentence(str).map { |word, _, _| word }).to eq([
        "Visit us. ",
        "Go to https://translate.twitter.com."
      ])
    end

    it "splits on sentences that end with other kinds of punctuation" do
      str = "Help us translate! Speak another language? You really, really rock."
      expect(iterator.each_sentence(str).map { |word, _, _| word }).to eq([
        "Help us translate! ",
        "Speak another language? ",
        "You really, really rock."
      ])
    end

    context "with ULI exceptions" do
      it "does not split on certain abbreviations like Mr. and Mrs." do
        str = "I really like Mrs. Patterson. She's nice."
        expect(iterator.each_sentence(str).map { |word, _, _| word }).to eq([
          "I really like Mrs. Patterson. ",
          "She's nice."
        ])
      end

      it "splits correctly when a string ends with an exception directly followed by a single space" do
        str = "I like the Mrs. "
        expect(iterator.each_sentence(str).map { |word, _, _| word }).to eq([
          "I like the Mrs. "
        ])
      end
    end

    context "without ULI exceptions" do
      let(:iterator) { described_class.new(:en, use_uli_exceptions: false) }

      it "splits on certain abbreviations like Mr. and Mrs. (use ULI rules to avoid this behavior)" do
        str = "I really like Mrs. Patterson. She's nice."
        expect(iterator.each_sentence(str).map { |word, _, _| word }).to eq([
          "I really like Mrs. ",
          "Patterson. ",
          "She's nice."
        ])
      end
    end
  end

  describe "#each_word" do
    let(:iterator) { described_class.new(:en) }

    it "should return an enumerator if called without a block" do
      expect(iterator.each_word("foo bar")).to be_a(Enumerator)
    end

    it "splits a simple string into words" do
      str = "the quick brown fox"
      expect(iterator.each_word(str).map { |word, _, _| word }).to eq([
        "the", " ", "quick", " ", "brown", " ", "fox"
      ])
    end

    it "breaks around periods" do
      str = "The. Quick. Brown. Fox."
      expect(iterator.each_word(str).map { |word, _, _| word }).to eq([
        "The", ".", " ", "Quick", ".", " ", "Brown", ".", " ", "Fox", "."
      ])
    end

    it "does not break at apostrophes" do
      str = "I like cats. They're cute."
      expect(iterator.each_word(str).map { |word, _, _| word }).to eq([
        "I", " ", "like", " ", "cats", ".", " ", "They're", " ", "cute", "."
      ])
    end
  end
end
