def read_subset_imdb2dic()

in data_handler/imdb/imdb_data_loader.py [0:0]


def read_subset_imdb2dic(nlp_model='en_fr_lang', IMDB_DIR='../data/imdb_data/'):
    #_download_imdb()
    id2numer_info_dic = {}
    id2str_info_dic = {}
    id2genre_dic = {}
    # TODO load first to dictionary and then process in parallel for the nlp model ...
    nlps_title, nlps_characters, nlps_primary_profession = load_nlp_models(nlp_model)

    with open(os.path.join(IMDB_DIR, "title.basics.tsv"), newline='', encoding='utf-8') as csvfile:
        IMDB_title_name =  (csv.reader(csvfile, delimiter='\t'))
        next(IMDB_title_name)
        genre_embed_dict={}
        count=0 #30 total
        for row in IMDB_title_name:
            #if count== 10:
                # for debugging
            #    break
            if len(row)==9:
                str_id = row[0]
                title_type = row[1].lower()
                title1 = row[2].lower()
                title2 = row[3].lower()
                assert "\n" not in title1 and "\n" not in title2
                is_adult=row[4]
                start_year = row[5]
                end_year = row[6]
                runtime=row[7]
                #print(row)
                start_year = None if start_year == '\\N' else float(start_year)
                end_year = None if end_year == '\\N' else float(end_year)
                is_adult = None if is_adult == '\\N' else float(is_adult)
                runtime = None if runtime == '\\N' else float(runtime)
                if start_year is not None and len(row) == 9:
                    if str_id not in id2numer_info_dic:
                        id2numer_info_dic[str_id] = [start_year, end_year, title_type, (is_adult), (runtime)]
                        id2str_info_dic[str_id] =title1+' '+title2

                    if str_id not in id2genre_dic:
                        id2genre_dic[str_id] = row[8].lower().split(",")
                        for gen in id2genre_dic[str_id]:
                            if gen not in genre_embed_dict:
                                genre_embed_dict[gen]=count
                                count+=1
                else:
                    continue
            else:
                continue
    s=time.time()
    id2genre_dic=embed_genre_dic(genre_embed_dict, id2genre_dic)
    e=time.time()
    print('Genre embedding runtime')
    print(e - s)
    s = time.time()
    # get the real dict from the proxy dictionary of the multiprocessing
    id2str_info_dic=dict(parallel_dict_nlp_processing(id2str_info_dic,nlps=nlps_title))
    e = time.time()
    print('Parallel processing runtime')
    print(e - s)
    print(len(id2str_info_dic.values()))
    #print(id2str_info_dic.keys())

    #print(id2str_info_dic)


    with open(os.path.join(IMDB_DIR, "title.ratings.tsv"), newline='', encoding='utf-8') as csvfile:
        IMDB_title_rating =  (csv.reader(csvfile, delimiter='\t'))
        next(IMDB_title_rating)
        for row in IMDB_title_rating:
            str_id = row[0]
            average_rating = row[1]
            num_votes = row[2]
        # TODO Check if this title is important or not.. does it exist in other dictionaries

            if str_id in id2numer_info_dic:
                id2numer_info_dic[str_id] +=\
                    [float(average_rating),float(num_votes)]
            else:
                id2numer_info_dic[str_id] = [float(average_rating), float(num_votes)]



    print("#movie id: {}".format(len(id2numer_info_dic)))

    with open(os.path.join(IMDB_DIR, '_id2numer_info_dic.pkl'), 'wb') as f:
        pickle.dump(id2numer_info_dic, f)
    with open(os.path.join(IMDB_DIR, '_id2str_info_dic.pkl'), 'wb') as f:
        pickle.dump(id2str_info_dic, f)   
    with open(os.path.join(IMDB_DIR, '_id2genre_dic.pkl'), 'wb') as f:
        pickle.dump(id2genre_dic, f)
    sys.stdout.flush()
    ###################################################################################
    ###################################################################################
    id2l_director_dic = {}
    id2l_writer_dic = {}
    id2l_principal_dic={}
    # for different categories
    mlb = MultiLabelBinarizer(classes=list(range(15)), sparse_output=True)
    s = time.time()
    with open(os.path.join(IMDB_DIR, "title.principals.tsv"), newline='', encoding='utf-8') as csvfile:
        IMDB_title_principals =  (csv.reader(csvfile, delimiter='\t'))
        next(IMDB_title_principals)
        total_category = 15
        job_category_count = 0
        job_category_dict = {}
        job_title_dict = {}
        zero_category=csr_matrix((total_category), dtype=np.int8)
        for row in IMDB_title_principals:
            if len(row)==6:
                str_id = row[0]
                ordering = row[1]
                person_id = row[2]
                job_category=row[3].lower()
                job_title = row[4]
                job_title = None if job_title == '\\N' else job_title.lower()
                character = row[5]
                character = None if character == '\\N' else character.lower()
                # TODO is it an entry per person and movie or list of persons
                if job_category is not None:
                    if job_category not in job_category_dict:
                        job_category_dict[job_category] = job_category_count
                        job_category_count += 1
                if job_category_count>total_category:
                    print(job_category_count)
                    total_category=job_category_count

                if  job_category is not None:
                    #s=time.time()
                    job_category_one_hot=mlb.fit_transform([set(([job_category_dict[job_category]]))])
                    #e=time.time()
                    #print(e-s)
                else:
                    job_category_one_hot=zero_category
                job_category=job_category_one_hot
                # TODO should we include job title and character played ? job category, more meaningfull
                if  str_id not in id2l_principal_dic:
                    id2l_principal_dic[str_id] = [[person_id,job_category]]#,job_title,character ]]
                else:
                    id2l_principal_dic[str_id] += [[person_id, job_category]]#, job_title,character]]

            else:
                continue
    e = time.time()
    print('Read and process principals')
    print(e - s)
    #print(len(id2str_info_dic.values()))

    # TODO Possible Bug. How about when only one or 2 directors or writers exists?
    with open(os.path.join(IMDB_DIR, "title.crew.tsv"), newline='', encoding='utf-8') as csvfile:
        file_rows =  (csv.reader(csvfile, delimiter='\t'))
        next(file_rows)
        for row in file_rows:
            id = row[0]
            director_str = row[1]
            writer_str = row[2]

            if id in id2l_director_dic:
                print(id, id2l_director_dic[id])
            else:
                if director_str != "\\N" and len(director_str) > 2:
                    director_vec = director_str.split(",")
                    id2l_director_dic[id] = director_vec


            if id in id2l_writer_dic:
                print(id, id2l_writer_dic[id])
            else:
                if writer_str != "\\N" and len(writer_str) > 2:
                    writer_vec = writer_str.split(",")
                    id2l_writer_dic[id] = writer_vec
    with open(os.path.join(IMDB_DIR, '_id2director_dic.pkl'), 'wb') as f:
        pickle.dump(id2l_director_dic, f)
    with open(os.path.join(IMDB_DIR, '_id2writer_dic.pkl'), 'wb') as f:
        pickle.dump(id2l_writer_dic, f)
    # TODO test data dumping and loading
    with open(os.path.join(IMDB_DIR, '_id2_principal_dic.pkl'), 'wb') as f:
        pickle.dump(id2l_principal_dic, f)
    sys.stdout.flush()
    ###################################################################################
    ###################################################################################

    people_id2name_dic = {}
    people_id2primaryProfession={}
    # for different proffesions
    mlb = MultiLabelBinarizer(classes=list(range(43)), sparse_output=True)
    s=time.time()
    with open(os.path.join(IMDB_DIR, "name.basics.tsv"), newline='', encoding='utf-8') as csvfile:
        file_rows =  (csv.reader(csvfile, delimiter='\t'))
        count_professions = 0
        professions_dict = {}
        next(file_rows)
        for row in file_rows:
            id = row[0]
            name = row[1]
            birthyear=row[2]
            deathyear=row[3]
            primaryProfession=row[4]
            knownfortitles=row[5]
            birthyear = None if birthyear == '\\N' else float(birthyear)
            deathyear = None if deathyear == '\\N' else float(deathyear)
            for prof in primaryProfession.split(","):
                if prof not in professions_dict:
                    professions_dict[prof] = count_professions
                    count_professions += 1
            #primaryProfession= None if len(primaryProfession)==0 else embed_word2vec(primaryProfession,nlps_primary_profession)
            if id in people_id2name_dic:
                print(id, people_id2name_dic[id])
            else:
                people_id2name_dic[id] = [name, birthyear, deathyear, knownfortitles]
                iprof=[professions_dict[prof] for prof in primaryProfession.split(',')]
                people_id2primaryProfession[id]=mlb.fit_transform([set(iprof)])
    with open(os.path.join(IMDB_DIR, '_people_id2name_dic.pkl'), 'wb') as f:
        pickle.dump(people_id2name_dic, f)
    with open(os.path.join(IMDB_DIR, '_people_id2primaryProfession.pkl'), 'wb') as f:
        pickle.dump(people_id2primaryProfession, f)
    print('Read and process people info')
    e = time.time()
    print(e - s)

    #print(len(id2str_info_dic.values()))

    print("IMDb dics generated ...")
    sys.stdout.flush()
    return  id2numer_info_dic,id2str_info_dic, id2genre_dic, id2l_director_dic, id2l_writer_dic,\
            id2l_principal_dic, people_id2name_dic, people_id2primaryProfession