# Copyright 2023 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utility library of instructions."""

import functools
import random
import re

import immutabledict
import nltk


def download_nltk_resources():
    """Download the Punkt tokenizer data if it is not already installed.

    Both the legacy pickled ``punkt`` package and the ``punkt_tab`` package
    are checked. Recent NLTK releases read the Punkt sentence tokenizer from
    ``punkt_tab``, so fetching only ``punkt`` can leave sentence tokenization
    failing with a ``LookupError`` on those versions.
    NOTE(review): confirm against the NLTK version this project pins.
    """
    for package in ("punkt", "punkt_tab"):
        try:
            # `find` raises LookupError when the resource is not on disk.
            nltk.data.find("tokenizers/" + package)
        except LookupError:
            nltk.download(package)


# Executed at import time so the tokenizer data is on disk before any of the
# helpers below try to load it.
download_nltk_resources()

WORD_LIST = [
    "western",
    "sentence",
    "signal",
    "dump",
    "spot",
    "opposite",
    "bottom",
    "potato",
    "administration",
    "working",
    "welcome",
    "morning",
    "good",
    "agency",
    "primary",
    "wish",
    "responsibility",
    "press",
    "problem",
    "president",
    "steal",
    "brush",
    "read",
    "type",
    "beat",
    "trainer",
    "growth",
    "lock",
    "bone",
    "case",
    "equal",
    "comfortable",
    "region",
    "replacement",
    "performance",
    "mate",
    "walk",
    "medicine",
    "film",
    "thing",
    "rock",
    "tap",
    "total",
    "competition",
    "ease",
    "south",
    "establishment",
    "gather",
    "parking",
    "world",
    "plenty",
    "breath",
    "claim",
    "alcohol",
    "trade",
    "dear",
    "highlight",
    "street",
    "matter",
    "decision",
    "mess",
    "agreement",
    "studio",
    "coach",
    "assist",
    "brain",
    "wing",
    "style",
    "private",
    "top",
    "brown",
    "leg",
    "buy",
    "procedure",
    "method",
    "speed",
    "high",
    "company",
    "valuable",
    "pie",
    "analyst",
    "session",
    "pattern",
    "district",
    "pleasure",
    "dinner",
    "swimming",
    "joke",
    "order",
    "plate",
    "department",
    "motor",
    "cell",
    "spend",
    "cabinet",
    "difference",
    "power",
    "examination",
    "engine",
    "horse",
    "dimension",
    "pay",
    "toe",
    "curve",
    "literature",
    "bother",
    "fire",
    "possibility",
    "debate",
    "activity",
    "passage",
    "hello",
    "cycle",
    "background",
    "quiet",
    "author",
    "effect",
    "actor",
    "page",
    "bicycle",
    "error",
    "throat",
    "attack",
    "character",
    "phone",
    "tea",
    "increase",
    "outcome",
    "file",
    "specific",
    "inspector",
    "internal",
    "potential",
    "staff",
    "building",
    "employer",
    "shoe",
    "hand",
    "direction",
    "garden",
    "purchase",
    "interview",
    "study",
    "recognition",
    "member",
    "spiritual",
    "oven",
    "sandwich",
    "weird",
    "passenger",
    "particular",
    "response",
    "reaction",
    "size",
    "variation",
    "a",
    "cancel",
    "candy",
    "exit",
    "guest",
    "condition",
    "fly",
    "price",
    "weakness",
    "convert",
    "hotel",
    "great",
    "mouth",
    "mind",
    "song",
    "sugar",
    "suspect",
    "telephone",
    "ear",
    "roof",
    "paint",
    "refrigerator",
    "organization",
    "jury",
    "reward",
    "engineering",
    "day",
    "possession",
    "crew",
    "bar",
    "road",
    "description",
    "celebration",
    "score",
    "mark",
    "letter",
    "shower",
    "suggestion",
    "sir",
    "luck",
    "national",
    "progress",
    "hall",
    "stroke",
    "theory",
    "offer",
    "story",
    "tax",
    "definition",
    "history",
    "ride",
    "medium",
    "opening",
    "glass",
    "elevator",
    "stomach",
    "question",
    "ability",
    "leading",
    "village",
    "computer",
    "city",
    "grand",
    "confidence",
    "candle",
    "priest",
    "recommendation",
    "point",
    "necessary",
    "body",
    "desk",
    "secret",
    "horror",
    "noise",
    "culture",
    "warning",
    "water",
    "round",
    "diet",
    "flower",
    "bus",
    "tough",
    "permission",
    "week",
    "prompt",
    "connection",
    "abuse",
    "height",
    "save",
    "corner",
    "border",
    "stress",
    "drive",
    "stop",
    "rip",
    "meal",
    "listen",
    "confusion",
    "girlfriend",
    "living",
    "relation",
    "significance",
    "plan",
    "creative",
    "atmosphere",
    "blame",
    "invite",
    "housing",
    "paper",
    "drink",
    "roll",
    "silver",
    "drunk",
    "age",
    "damage",
    "smoke",
    "environment",
    "pack",
    "savings",
    "influence",
    "tourist",
    "rain",
    "post",
    "sign",
    "grandmother",
    "run",
    "profit",
    "push",
    "clerk",
    "final",
    "wine",
    "swim",
    "pause",
    "stuff",
    "singer",
    "funeral",
    "average",
    "source",
    "scene",
    "tradition",
    "personal",
    "snow",
    "nobody",
    "distance",
    "sort",
    "sensitive",
    "animal",
    "major",
    "negotiation",
    "click",
    "mood",
    "period",
    "arrival",
    "expression",
    "holiday",
    "repeat",
    "dust",
    "closet",
    "gold",
    "bad",
    "sail",
    "combination",
    "clothes",
    "emphasis",
    "duty",
    "black",
    "step",
    "school",
    "jump",
    "document",
    "professional",
    "lip",
    "chemical",
    "front",
    "wake",
    "while",
    "inside",
    "watch",
    "row",
    "subject",
    "penalty",
    "balance",
    "possible",
    "adult",
    "aside",
    "sample",
    "appeal",
    "wedding",
    "depth",
    "king",
    "award",
    "wife",
    "blow",
    "site",
    "camp",
    "music",
    "safe",
    "gift",
    "fault",
    "guess",
    "act",
    "shame",
    "drama",
    "capital",
    "exam",
    "stupid",
    "record",
    "sound",
    "swing",
    "novel",
    "minimum",
    "ratio",
    "machine",
    "shape",
    "lead",
    "operation",
    "salary",
    "cloud",
    "affair",
    "hit",
    "chapter",
    "stage",
    "quantity",
    "access",
    "army",
    "chain",
    "traffic",
    "kick",
    "analysis",
    "airport",
    "time",
    "vacation",
    "philosophy",
    "ball",
    "chest",
    "thanks",
    "place",
    "mountain",
    "advertising",
    "red",
    "past",
    "rent",
    "return",
    "tour",
    "house",
    "construction",
    "net",
    "native",
    "war",
    "figure",
    "fee",
    "spray",
    "user",
    "dirt",
    "shot",
    "task",
    "stick",
    "friend",
    "software",
    "promotion",
    "interaction",
    "surround",
    "block",
    "purpose",
    "practice",
    "conflict",
    "routine",
    "requirement",
    "bonus",
    "hole",
    "state",
    "junior",
    "sweet",
    "catch",
    "tear",
    "fold",
    "wall",
    "editor",
    "life",
    "position",
    "pound",
    "respect",
    "bathroom",
    "coat",
    "script",
    "job",
    "teach",
    "birth",
    "view",
    "resolve",
    "theme",
    "employee",
    "doubt",
    "market",
    "education",
    "serve",
    "recover",
    "tone",
    "harm",
    "miss",
    "union",
    "understanding",
    "cow",
    "river",
    "association",
    "concept",
    "training",
    "recipe",
    "relationship",
    "reserve",
    "depression",
    "proof",
    "hair",
    "revenue",
    "independent",
    "lift",
    "assignment",
    "temporary",
    "amount",
    "loss",
    "edge",
    "track",
    "check",
    "rope",
    "estimate",
    "pollution",
    "stable",
    "message",
    "delivery",
    "perspective",
    "mirror",
    "assistant",
    "representative",
    "witness",
    "nature",
    "judge",
    "fruit",
    "tip",
    "devil",
    "town",
    "emergency",
    "upper",
    "drop",
    "stay",
    "human",
    "neck",
    "speaker",
    "network",
    "sing",
    "resist",
    "league",
    "trip",
    "signature",
    "lawyer",
    "importance",
    "gas",
    "choice",
    "engineer",
    "success",
    "part",
    "external",
    "worker",
    "simple",
    "quarter",
    "student",
    "heart",
    "pass",
    "spite",
    "shift",
    "rough",
    "lady",
    "grass",
    "community",
    "garage",
    "youth",
    "standard",
    "skirt",
    "promise",
    "blind",
    "television",
    "disease",
    "commission",
    "positive",
    "energy",
    "calm",
    "presence",
    "tune",
    "basis",
    "preference",
    "head",
    "common",
    "cut",
    "somewhere",
    "presentation",
    "current",
    "thought",
    "revolution",
    "effort",
    "master",
    "implement",
    "republic",
    "floor",
    "principle",
    "stranger",
    "shoulder",
    "grade",
    "button",
    "tennis",
    "police",
    "collection",
    "account",
    "register",
    "glove",
    "divide",
    "professor",
    "chair",
    "priority",
    "combine",
    "peace",
    "extension",
    "maybe",
    "evening",
    "frame",
    "sister",
    "wave",
    "code",
    "application",
    "mouse",
    "match",
    "counter",
    "bottle",
    "half",
    "cheek",
    "resolution",
    "back",
    "knowledge",
    "make",
    "discussion",
    "screw",
    "length",
    "accident",
    "battle",
    "dress",
    "knee",
    "log",
    "package",
    "it",
    "turn",
    "hearing",
    "newspaper",
    "layer",
    "wealth",
    "profile",
    "imagination",
    "answer",
    "weekend",
    "teacher",
    "appearance",
    "meet",
    "bike",
    "rise",
    "belt",
    "crash",
    "bowl",
    "equivalent",
    "support",
    "image",
    "poem",
    "risk",
    "excitement",
    "remote",
    "secretary",
    "public",
    "produce",
    "plane",
    "display",
    "money",
    "sand",
    "situation",
    "punch",
    "customer",
    "title",
    "shake",
    "mortgage",
    "option",
    "number",
    "pop",
    "window",
    "extent",
    "nothing",
    "experience",
    "opinion",
    "departure",
    "dance",
    "indication",
    "boy",
    "material",
    "band",
    "leader",
    "sun",
    "beautiful",
    "muscle",
    "farmer",
    "variety",
    "fat",
    "handle",
    "director",
    "opportunity",
    "calendar",
    "outside",
    "pace",
    "bath",
    "fish",
    "consequence",
    "put",
    "owner",
    "go",
    "doctor",
    "information",
    "share",
    "hurt",
    "protection",
    "career",
    "finance",
    "force",
    "golf",
    "garbage",
    "aspect",
    "kid",
    "food",
    "boot",
    "milk",
    "respond",
    "objective",
    "reality",
    "raw",
    "ring",
    "mall",
    "one",
    "impact",
    "area",
    "news",
    "international",
    "series",
    "impress",
    "mother",
    "shelter",
    "strike",
    "loan",
    "month",
    "seat",
    "anything",
    "entertainment",
    "familiar",
    "clue",
    "year",
    "glad",
    "supermarket",
    "natural",
    "god",
    "cost",
    "conversation",
    "tie",
    "ruin",
    "comfort",
    "earth",
    "storm",
    "percentage",
    "assistance",
    "budget",
    "strength",
    "beginning",
    "sleep",
    "other",
    "young",
    "unit",
    "fill",
    "store",
    "desire",
    "hide",
    "value",
    "cup",
    "maintenance",
    "nurse",
    "function",
    "tower",
    "role",
    "class",
    "camera",
    "database",
    "panic",
    "nation",
    "basket",
    "ice",
    "art",
    "spirit",
    "chart",
    "exchange",
    "feedback",
    "statement",
    "reputation",
    "search",
    "hunt",
    "exercise",
    "nasty",
    "notice",
    "male",
    "yard",
    "annual",
    "collar",
    "date",
    "platform",
    "plant",
    "fortune",
    "passion",
    "friendship",
    "spread",
    "cancer",
    "ticket",
    "attitude",
    "island",
    "active",
    "object",
    "service",
    "buyer",
    "bite",
    "card",
    "face",
    "steak",
    "proposal",
    "patient",
    "heat",
    "rule",
    "resident",
    "broad",
    "politics",
    "west",
    "knife",
    "expert",
    "girl",
    "design",
    "salt",
    "baseball",
    "grab",
    "inspection",
    "cousin",
    "couple",
    "magazine",
    "cook",
    "dependent",
    "security",
    "chicken",
    "version",
    "currency",
    "ladder",
    "scheme",
    "kitchen",
    "employment",
    "local",
    "attention",
    "manager",
    "fact",
    "cover",
    "sad",
    "guard",
    "relative",
    "county",
    "rate",
    "lunch",
    "program",
    "initiative",
    "gear",
    "bridge",
    "breast",
    "talk",
    "dish",
    "guarantee",
    "beer",
    "vehicle",
    "reception",
    "woman",
    "substance",
    "copy",
    "lecture",
    "advantage",
    "park",
    "cold",
    "death",
    "mix",
    "hold",
    "scale",
    "tomorrow",
    "blood",
    "request",
    "green",
    "cookie",
    "church",
    "strip",
    "forever",
    "beyond",
    "debt",
    "tackle",
    "wash",
    "following",
    "feel",
    "maximum",
    "sector",
    "sea",
    "property",
    "economics",
    "menu",
    "bench",
    "try",
    "language",
    "start",
    "call",
    "solid",
    "address",
    "income",
    "foot",
    "senior",
    "honey",
    "few",
    "mixture",
    "cash",
    "grocery",
    "link",
    "map",
    "form",
    "factor",
    "pot",
    "model",
    "writer",
    "farm",
    "winter",
    "skill",
    "anywhere",
    "birthday",
    "policy",
    "release",
    "husband",
    "lab",
    "hurry",
    "mail",
    "equipment",
    "sink",
    "pair",
    "driver",
    "consideration",
    "leather",
    "skin",
    "blue",
    "boat",
    "sale",
    "brick",
    "two",
    "feed",
    "square",
    "dot",
    "rush",
    "dream",
    "location",
    "afternoon",
    "manufacturer",
    "control",
    "occasion",
    "trouble",
    "introduction",
    "advice",
    "bet",
    "eat",
    "kill",
    "category",
    "manner",
    "office",
    "estate",
    "pride",
    "awareness",
    "slip",
    "crack",
    "client",
    "nail",
    "shoot",
    "membership",
    "soft",
    "anybody",
    "web",
    "official",
    "individual",
    "pizza",
    "interest",
    "bag",
    "spell",
    "profession",
    "queen",
    "deal",
    "resource",
    "ship",
    "guy",
    "chocolate",
    "joint",
    "formal",
    "upstairs",
    "car",
    "resort",
    "abroad",
    "dealer",
    "associate",
    "finger",
    "surgery",
    "comment",
    "team",
    "detail",
    "crazy",
    "path",
    "tale",
    "initial",
    "arm",
    "radio",
    "demand",
    "single",
    "draw",
    "yellow",
    "contest",
    "piece",
    "quote",
    "pull",
    "commercial",
    "shirt",
    "contribution",
    "cream",
    "channel",
    "suit",
    "discipline",
    "instruction",
    "concert",
    "speech",
    "low",
    "effective",
    "hang",
    "scratch",
    "industry",
    "breakfast",
    "lay",
    "join",
    "metal",
    "bedroom",
    "minute",
    "product",
    "rest",
    "temperature",
    "many",
    "give",
    "argument",
    "print",
    "purple",
    "laugh",
    "health",
    "credit",
    "investment",
    "sell",
    "setting",
    "lesson",
    "egg",
    "middle",
    "marriage",
    "level",
    "evidence",
    "phrase",
    "love",
    "self",
    "benefit",
    "guidance",
    "affect",
    "you",
    "dad",
    "anxiety",
    "special",
    "boyfriend",
    "test",
    "blank",
    "payment",
    "soup",
    "obligation",
    "reply",
    "smile",
    "deep",
    "complaint",
    "addition",
    "review",
    "box",
    "towel",
    "minor",
    "fun",
    "soil",
    "issue",
    "cigarette",
    "internet",
    "gain",
    "tell",
    "entry",
    "spare",
    "incident",
    "family",
    "refuse",
    "branch",
    "can",
    "pen",
    "grandfather",
    "constant",
    "tank",
    "uncle",
    "climate",
    "ground",
    "volume",
    "communication",
    "kind",
    "poet",
    "child",
    "screen",
    "mine",
    "quit",
    "gene",
    "lack",
    "charity",
    "memory",
    "tooth",
    "fear",
    "mention",
    "marketing",
    "reveal",
    "reason",
    "court",
    "season",
    "freedom",
    "land",
    "sport",
    "audience",
    "classroom",
    "law",
    "hook",
    "win",
    "carry",
    "eye",
    "smell",
    "distribution",
    "research",
    "country",
    "dare",
    "hope",
    "whereas",
    "stretch",
    "library",
    "if",
    "delay",
    "college",
    "plastic",
    "book",
    "present",
    "use",
    "worry",
    "champion",
    "goal",
    "economy",
    "march",
    "election",
    "reflection",
    "midnight",
    "slide",
    "inflation",
    "action",
    "challenge",
    "guitar",
    "coast",
    "apple",
    "campaign",
    "field",
    "jacket",
    "sense",
    "way",
    "visual",
    "remove",
    "weather",
    "trash",
    "cable",
    "regret",
    "buddy",
    "beach",
    "historian",
    "courage",
    "sympathy",
    "truck",
    "tension",
    "permit",
    "nose",
    "bed",
    "son",
    "person",
    "base",
    "meat",
    "usual",
    "air",
    "meeting",
    "worth",
    "game",
    "independence",
    "physical",
    "brief",
    "play",
    "raise",
    "board",
    "she",
    "key",
    "writing",
    "pick",
    "command",
    "party",
    "yesterday",
    "spring",
    "candidate",
    "physics",
    "university",
    "concern",
    "development",
    "change",
    "string",
    "target",
    "instance",
    "room",
    "bitter",
    "bird",
    "football",
    "normal",
    "split",
    "impression",
    "wood",
    "long",
    "meaning",
    "stock",
    "cap",
    "leadership",
    "media",
    "ambition",
    "fishing",
    "essay",
    "salad",
    "repair",
    "today",
    "designer",
    "night",
    "bank",
    "drawing",
    "inevitable",
    "phase",
    "vast",
    "chip",
    "anger",
    "switch",
    "cry",
    "twist",
    "personality",
    "attempt",
    "storage",
    "being",
    "preparation",
    "bat",
    "selection",
    "white",
    "technology",
    "contract",
    "side",
    "section",
    "station",
    "till",
    "structure",
    "tongue",
    "taste",
    "truth",
    "difficulty",
    "group",
    "limit",
    "main",
    "move",
    "feeling",
    "light",
    "example",
    "mission",
    "might",
    "wait",
    "wheel",
    "shop",
    "host",
    "classic",
    "alternative",
    "cause",
    "agent",
    "consist",
    "table",
    "airline",
    "text",
    "pool",
    "craft",
    "range",
    "fuel",
    "tool",
    "partner",
    "load",
    "entrance",
    "deposit",
    "hate",
    "article",
    "video",
    "summer",
    "feature",
    "extreme",
    "mobile",
    "hospital",
    "flight",
    "fall",
    "pension",
    "piano",
    "fail",
    "result",
    "rub",
    "gap",
    "system",
    "report",
    "suck",
    "ordinary",
    "wind",
    "nerve",
    "ask",
    "shine",
    "note",
    "line",
    "mom",
    "perception",
    "brother",
    "reference",
    "bend",
    "charge",
    "treat",
    "trick",
    "term",
    "homework",
    "bake",
    "bid",
    "status",
    "project",
    "strategy",
    "orange",
    "let",
    "enthusiasm",
    "parent",
    "concentrate",
    "device",
    "travel",
    "poetry",
    "business",
    "society",
    "kiss",
    "end",
    "vegetable",
    "employ",
    "schedule",
    "hour",
    "brave",
    "focus",
    "process",
    "movie",
    "illegal",
    "general",
    "coffee",
    "ad",
    "highway",
    "chemistry",
    "psychology",
    "hire",
    "bell",
    "conference",
    "relief",
    "show",
    "neat",
    "funny",
    "weight",
    "quality",
    "club",
    "daughter",
    "zone",
    "touch",
    "tonight",
    "shock",
    "burn",
    "excuse",
    "name",
    "survey",
    "landscape",
    "advance",
    "satisfaction",
    "bread",
    "disaster",
    "item",
    "hat",
    "prior",
    "shopping",
    "visit",
    "east",
    "photo",
    "home",
    "idea",
    "father",
    "comparison",
    "cat",
    "pipe",
    "winner",
    "count",
    "lake",
    "fight",
    "prize",
    "foundation",
    "dog",
    "keep",
    "ideal",
    "fan",
    "struggle",
    "peak",
    "safety",
    "solution",
    "hell",
    "conclusion",
    "population",
    "strain",
    "alarm",
    "measurement",
    "second",
    "train",
    "race",
    "due",
    "insurance",
    "boss",
    "tree",
    "monitor",
    "sick",
    "course",
    "drag",
    "appointment",
    "slice",
    "still",
    "care",
    "patience",
    "rich",
    "escape",
    "emotion",
    "royal",
    "female",
    "childhood",
    "government",
    "picture",
    "will",
    "sock",
    "big",
    "gate",
    "oil",
    "cross",
    "pin",
    "improvement",
    "championship",
    "silly",
    "help",
    "sky",
    "pitch",
    "man",
    "diamond",
    "most",
    "transition",
    "work",
    "science",
    "committee",
    "moment",
    "fix",
    "teaching",
    "dig",
    "specialist",
    "complex",
    "guide",
    "people",
    "dead",
    "voice",
    "original",
    "break",
    "topic",
    "data",
    "degree",
    "reading",
    "recording",
    "bunch",
    "reach",
    "judgment",
    "lie",
    "regular",
    "set",
    "painting",
    "mode",
    "list",
    "player",
    "bear",
    "north",
    "wonder",
    "carpet",
    "heavy",
    "officer",
    "negative",
    "clock",
    "unique",
    "baby",
    "pain",
    "assumption",
    "disk",
    "iron",
    "bill",
    "drawer",
    "look",
    "double",
    "mistake",
    "finish",
    "future",
    "brilliant",
    "contact",
    "math",
    "rice",
    "leave",
    "restaurant",
    "discount",
    "sex",
    "virus",
    "bit",
    "trust",
    "event",
    "wear",
    "juice",
    "failure",
    "bug",
    "context",
    "mud",
    "whole",
    "wrap",
    "intention",
    "draft",
    "pressure",
    "cake",
    "dark",
    "explanation",
    "space",
    "angle",
    "word",
    "efficiency",
    "management",
    "habit",
    "star",
    "chance",
    "finding",
    "transportation",
    "stand",
    "criticism",
    "flow",
    "door",
    "injury",
    "insect",
    "surprise",
    "apartment",
]  # pylint: disable=line-too-long

# Read-only mapping from ISO 639-1 two-letter language codes to the English
# names of the corresponding languages.
LANGUAGE_CODES = immutabledict.immutabledict(
    {
        "en": "English",
        "es": "Spanish",
        "pt": "Portuguese",
        "ar": "Arabic",
        "hi": "Hindi",
        "fr": "French",
        "ru": "Russian",
        "de": "German",
        "ja": "Japanese",
        "it": "Italian",
        "bn": "Bengali",
        "uk": "Ukrainian",
        "th": "Thai",
        "ur": "Urdu",
        "ta": "Tamil",
        "te": "Telugu",
        "bg": "Bulgarian",
        "ko": "Korean",
        "pl": "Polish",
        "he": "Hebrew",
        "fa": "Persian",
        "vi": "Vietnamese",
        "ne": "Nepali",
        "sw": "Swahili",
        "kn": "Kannada",
        "mr": "Marathi",
        "gu": "Gujarati",
        "pa": "Punjabi",
        "ml": "Malayalam",
        "fi": "Finnish",
    }
)

# Regex fragments used by `split_into_sentences`.
# A single ASCII letter (captured).
_ALPHABETS = "([A-Za-z])"
# A title abbreviation together with its period.
_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]"
# Abbreviations that follow a name; the period is appended where it is used.
_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
# Words that, after an acronym or suffix, are treated as opening a new sentence.
_STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
# Dotted upper-case acronyms of two or three letters, e.g. "U.S." or "U.S.A.".
_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
# A period followed by one of a few top-level domains.
_WEBSITES = "[.](com|net|org|io|gov|edu|me)"
# A single decimal digit (captured).
_DIGITS = "([0-9])"
# A run of two or more periods (ellipsis).
_MULTIPLE_DOTS = r"\.{2,}"


def split_into_sentences(text):
    """Break `text` into sentences with regex heuristics.

    Periods judged not to end a sentence (titles, domain names, decimals,
    initials, acronyms, name suffixes) are temporarily rewritten as ``<prd>``.
    A run of dots is protected the same way but is followed by a ``<stop>``
    marker. Every remaining ``.``, ``?`` and ``!`` then gets a ``<stop>``
    marker, the placeholders are restored and the text is split on the
    markers.

    Args:
      text: A string containing one or more sentences.

    Returns:
      A list of whitespace-stripped sentence strings. If the final piece is
      empty (for example when the text ends with punctuation) it is dropped.
    """

    def _protect_dot_run(match):
        # Keep every dot of the run, then mark the end of the sentence.
        return "<prd>" * len(match.group(0)) + "<stop>"

    # Pad so patterns that require surrounding spaces also match at the edges.
    marked = (" " + text + "  ").replace("\n", " ")

    # The order of the substitutions matters: later patterns rely on earlier
    # ones already having replaced their periods.
    early_rules = (
        (_PREFIXES, r"\1<prd>"),
        (_WEBSITES, r"<prd>\1"),
        (_DIGITS + "[.]" + _DIGITS, r"\1<prd>\2"),
        (_MULTIPLE_DOTS, _protect_dot_run),
    )
    for pattern, replacement in early_rules:
        marked = re.sub(pattern, replacement, marked)

    marked = marked.replace("Ph.D.", "Ph<prd>D<prd>")

    late_rules = (
        (r"\s" + _ALPHABETS + "[.] ", r" \1<prd> "),
        (_ACRONYMS + " " + _STARTERS, r"\1<stop> \2"),
        (
            _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]",
            r"\1<prd>\2<prd>\3<prd>",
        ),
        (_ALPHABETS + "[.]" + _ALPHABETS + "[.]", r"\1<prd>\2<prd>"),
        (" " + _SUFFIXES + "[.] " + _STARTERS, r" \1<stop> \2"),
        (" " + _SUFFIXES + "[.]", r" \1<prd>"),
        (" " + _ALPHABETS + "[.]", r" \1<prd>"),
    )
    for pattern, replacement in late_rules:
        marked = re.sub(pattern, replacement, marked)

    # Move sentence-final punctuation outside a closing quote so the quote
    # stays attached to the sentence it closes.
    quote_swaps = (
        (".”", "”."),
        ('."', '".'),
        ('!"', '"!'),
        ('?"', '"?'),
    )
    for before, after in quote_swaps:
        marked = marked.replace(before, after)

    # Whatever terminators are left really do end a sentence.
    for terminator in (".", "?", "!"):
        marked = marked.replace(terminator, terminator + "<stop>")

    marked = marked.replace("<prd>", ".")

    pieces = [piece.strip() for piece in marked.split("<stop>")]
    if pieces and not pieces[-1]:
        del pieces[-1]
    return pieces


def count_words(text):
    """Return the number of word tokens found in `text`.

    Tokens are the matches that NLTK's `RegexpTokenizer` produces for the
    word-character pattern below.
    """
    word_pattern = r"\w+"
    return len(nltk.tokenize.RegexpTokenizer(word_pattern).tokenize(text))


@functools.lru_cache(maxsize=None)
def _get_sentence_tokenizer():
    """Load the English Punkt sentence tokenizer through `nltk.data.load`.

    The function takes no arguments and is wrapped in `lru_cache`, so the
    loaded object is cached and handed back on later calls.
    """
    resource_url = "nltk:tokenizers/punkt/english.pickle"
    punkt_tokenizer = nltk.data.load(resource_url)
    return punkt_tokenizer


def count_sentences(text):
    """Return how many sentences the cached sentence tokenizer finds in `text`."""
    return len(_get_sentence_tokenizer().tokenize(text))


def generate_keywords(num_keywords):
    """Draw random keywords from WORD_LIST.

    Args:
      num_keywords: How many keywords to draw.

    Returns:
      A list of `num_keywords` entries sampled without replacement from
      WORD_LIST.
    """
    sampled = random.sample(WORD_LIST, num_keywords)
    return sampled
