"""
Copyright (c) Facebook, Inc. and its affiliates.
All rights reserved.

This source code is licensed under the MIT-style license found in the
LICENSE file in the root directory of this source tree.

----------

Script to prepare the recipe to train/eval a model on WSJ with wav2letter++ pipelines.
Please install `sph2pipe` on your own
(see https://www.ldc.upenn.edu/language-resources/tools/sphere-conversion-tools),
for example with the commands:

  wget https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/ctools/sph2pipe_v2.5.tar.gz
  tar -xzf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5
  gcc -o sph2pipe *.c -lm

Command : python3 prepare_data.py --wsj0 [...]/WSJ0/media \
    --wsj1 [...]/WSJ1/media --data_dst [...] --model_dst [...] \
    --sph2pipe [...]/sph2pipe_v2.5/sph2pipe

Replace [...] with appropriate paths
"""

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import functools
import os
import re
import sys
from collections import defaultdict

import numpy

FILE_DIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(FILE_DIR, "../utilities"))
from utils import convert_words_to_letters_asg_rep2


def compare(x, y):
    # sort by count in descending order; break ties in ascending lexicographic order
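    # e.g. (hypothetical counts):
    #   sorted([("b", 2), ("a", 2), ("c", 5)], key=functools.cmp_to_key(compare))
    #   -> [("c", 5), ("a", 2), ("b", 2)]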
    if x[1] > y[1]:
        return -1
    elif x[1] == y[1]:
        if x[0] < y[0]:
            return -1
        else:
            return 1
    else:
        return 1


def remap_words_with_same_spelling(data_dst, decoder_dst):
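    """Map every word to a canonical variant with the same spelling.

    Among all words sharing one stripped spelling (characters outside
    [a-z'.] removed), the most frequent word in si284 (ties broken
    lexicographically) becomes the canonical form, except for the few
    spellings pinned in `special_mapping`. The full mapping is written to
    <decoder_dst>/dict-remap.txt and returned as a dict.
    """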
    words_dict = defaultdict(int)
    spellings_dict = defaultdict(set)
    spellings_appearance_dict = defaultdict(int)

    with open(os.path.join(data_dst, "lists/si284.lst"), "r") as flist:
        for line in flist:
            for word in line.strip().split(" ")[3:]:
                word = re.sub(r"\(\S+\)", "", word)  # not pronounced
                words_dict[word] += 1
                spelling = re.sub("[^a-z'.]+", "", word)
                spellings_dict[spelling].update([word])
                spellings_appearance_dict[spelling] += 1

    with open(os.path.join(data_dst, "text/lm.txt"), "r") as flm:
        for line in flm:
            for word in line.strip().split(" "):
                word = re.sub(r"\(\S+\)", "", word)  # not pronounced
                spelling = re.sub("[^a-z'.]+", "", word)
                spellings_dict[spelling].update([word])
                spellings_appearance_dict[spelling] += 1

    sorted_spellings = sorted(
        spellings_appearance_dict.items(), key=functools.cmp_to_key(compare)
    )

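    # spellings whose canonical form is pinned explicitly instead of chosen by frequency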
    special_mapping = {"al": "al-", "st": "st", "nd": "nd", "rd": "rd"}
    remap_result = dict()
    with open(os.path.join(decoder_dst, "dict-remap.txt"), "w") as fmap:
        for spelling, _ in sorted_spellings:
            words_count = {w: words_dict[w] for w in spellings_dict[spelling]}
            sorted_words = sorted(
                words_count.items(), key=functools.cmp_to_key(compare)
            )
            for word, _ in sorted_words:
                remap_result[word] = (
                    sorted_words[0][0]
                    if spelling not in special_mapping
                    else special_mapping[spelling]
                )
                fmap.write("{} {}\n".format(word, remap_result[word]))
    return remap_result


def get_spelling(word):
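    """Strip a word down to its token spelling.

    Drops "(...)" markers (not-pronounced parts) and any character outside
    [a-z'.], e.g. get_spelling("don't(2)") -> "don't" (hypothetical input).
    """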
    spelling = re.sub(r"\(\S+\)", "", word)  # not pronounced
    spelling = re.sub("[^a-z'.]+", "", spelling)

    return spelling


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="WSJ dataset creation.")
    parser.add_argument("--wsj0", help="top level directory containing all WSJ0 discs")
    parser.add_argument("--wsj1", help="top level directory containing all WSJ1 discs")
    parser.add_argument(
        "--data_dst", help="data destination directory", default="./wsj"
    )
    parser.add_argument(
        "--model_dst",
        help="model auxiliary files destination directory",
        default="./conv_glu_wsj_char",
    )
    parser.add_argument(
        "--wsj1_type",
        help="if you are using larger corpus LDC94S13A, set parameter to `LDC94S13A`",
        default="LDC94S13B",
    )
    parser.add_argument(
        "--sph2pipe",
        help="path to sph2pipe executable",
        default="./sph2pipe_v2.5/sph2pipe",
    )
    parser.add_argument("--kenlm", help="location to installed kenlm directory")
    parser.add_argument(
        "-p", "--process", help="# of process for Multiprocessing", default=8, type=int
    )
    args = parser.parse_args()
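
    # First run the shared WSJ preparation script (it uses sph2pipe for audio
    # conversion and generates lists/ and text/ under --data_dst,
    # e.g. lists/si284.lst, text/lm.txt, text/nov93dev.txt used below)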
    os.system(
        "python3 {}/../../../data/wsj/prepare.py "
        "--wsj0 {} --wsj1 {} --sph2pipe {} --wsj1_type {} --dst {} -p {}".format(
            os.path.dirname(os.path.abspath(__file__)),
            args.wsj0,
            args.wsj1,
            args.sph2pipe,
            args.wsj1_type,
            args.data_dst,
            args.process,
        )
    )
    lists_path = os.path.join(args.data_dst, "lists")
    am_path = os.path.join(args.model_dst, "am")
    lm_data_path = os.path.join(args.data_dst, "text/lm.txt")
    decoder_path = os.path.join(args.model_dst, "decoder")
    os.makedirs(am_path, exist_ok=True)
    os.makedirs(decoder_path, exist_ok=True)

    # Generating am/*
    print("Generating tokens.lst for acoustic model training", flush=True)
    with open(os.path.join(am_path, "tokens.lst"), "w") as f_tokens:
        f_tokens.write("|\n")
        f_tokens.write("'\n")
        f_tokens.write(".\n")
        for alphabet in range(ord("a"), ord("z") + 1):
            f_tokens.write(chr(alphabet) + "\n")

    print(
        "Generating lexicon.lst (word -> tokens) for acoustic model training",
        flush=True,
    )

    # generate a remapping for words:
    # among words with the same spelling, take the most frequent word
    # and use only this word in the lexicon;
    # apply the same substitution to the dev set during
    # acoustic model training for WER computation
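    # e.g. (hypothetical words) digits are stripped by get_spelling, so "3rd"
    # and "23rd" both reduce to the spelling "rd" and are rewritten to one
    # canonical form (here pinned to "rd" via special_mapping)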
    remap_dict = remap_words_with_same_spelling(args.data_dst, decoder_path)
    with open(os.path.join(lists_path, "si284.lst"), "r") as fin, open(
        os.path.join(am_path, "si284.lst.remap"), "w"
    ) as fout:
        for line in fin:
            line = line.strip().split(" ")
            for index in range(3, len(line)):
                word = re.sub(r"\(\S+\)", "", line[index])
                line[index] = remap_dict[word]
            fout.write(" ".join(line) + "\n")

    # words used in training/eval to prepare spelling
    words_set = set()

    for name in [
        os.path.join(am_path, "si284.lst.remap"),
        os.path.join(lists_path, "nov93dev.lst"),
    ]:
        with open(name, "r") as flist:
            for line in flist:
                transcription = line.strip().split(" ")[3:]
                words_set.update(transcription)

    print(
        "Writing lexicon file - {}...".format(
            os.path.join(am_path, "lexicon_si284+nov93dev.txt")
        ),
        flush=True,
    )
    with open(os.path.join(am_path, "lexicon_si284+nov93dev.txt"), "w") as f:
        for word in words_set:
            spelling = get_spelling(word)
            assert re.match(
                r"^[a-z'.]+$", spelling
            ), "invalid spelling for word '{}'".format(word)

            f.write(
                "{word}\t{tokens} |\n".format(word=word, tokens=" ".join(spelling))
            )

    # Generating decoder/*
    # prepare lexicon: word -> token spelling
    # write words to the lexicon.lst file
    print("Generating lexicon.lst (word -> tokens) for decoding", flush=True)

    lex_file = os.path.join(decoder_path, "lexicon.lst")
    print("Writing lexicon file - {}...".format(lex_file), flush=True)
    with open(lex_file, "w") as f:
        for word in numpy.unique(list(remap_dict.values())):
            if re.search(r"\d", word):  # skip words containing digits
                continue
            spelling = get_spelling(word)
            if spelling != "":
                if re.match("^[a-z'.]+$", spelling):
                    f.write("{w}\t{s} |\n".format(w=word, s=" ".join(spelling)))
                else:
                    print('Ignoring word "{}" in lexicon'.format(word))

    # Prepare data for char lm training/evaluation
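    # convert_words_to_letters_asg_rep2 (from ../utilities/utils.py) rewrites word
    # transcripts as letter sequences, using ASG-style repetition tokens as its
    # name suggests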
    if os.path.exists(os.path.join(decoder_path, "char_lm_data.train")):
        print(
            "Skip generation of {}. Please remove the file to regenerate it".format(
                os.path.join(decoder_path, "char_lm_data.train")
            )
        )
    else:
        convert_words_to_letters_asg_rep2(
            os.path.join(args.data_dst, "text/lm.txt"),
            os.path.join(decoder_path, "char_lm_data.train"),
        )
    convert_words_to_letters_asg_rep2(
        os.path.join(args.data_dst, "text/nov93dev.txt"),
        os.path.join(decoder_path, "char_lm_data.nov93dev"),
    )

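    # Also write nov93dev as plain words (characters outside [a-z'.] stripped)
    # for word-level LM evaluation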
    with open(os.path.join(args.data_dst, "text/nov93dev.txt"), "r") as f, open(
        os.path.join(decoder_path, "word_lm_data.nov93dev"), "w"
    ) as fout:
        for line in f:
            result = []
            for word in line.strip().split(" "):
                word = re.sub("[^a-z'.]+", "", word)
                if word != "":
                    result.append(word)
            fout.write(" ".join(result) + "\n")
    print("Done!", flush=True)
