# parquet_flask/io_logic/query_v4.py

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from datetime import datetime

import pyspark.sql.functions as F
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import lit
from pyspark.sql.session import SparkSession
from pyspark.sql.types import Row

from parquet_flask.io_logic.cdms_constants import CDMSConstants
from parquet_flask.io_logic.cdms_schema import CdmsSchema
from parquet_flask.io_logic.parquet_query_condition_management_v4 import ParquetQueryConditionManagementV4
from parquet_flask.io_logic.partitioned_parquet_path import PartitionedParquetPath
from parquet_flask.io_logic.query_v2 import QueryProps
from parquet_flask.utils.config import Config
from parquet_flask.utils.file_utils import FileUtils
from parquet_flask.utils.general_utils import GeneralUtils

LOGGER = logging.getLogger(__name__)


class QueryV4:
    def __init__(self, props=None):
        # avoid a mutable default argument: a shared QueryProps() default
        # would leak pagination state across QueryV4 instances
        self.__props = props if props is not None else QueryProps()
        config = Config()
        self.__app_name = config.get_spark_app_name()
        self.__master_spark = config.get_value(Config.master_spark_url)
        self.__parquet_name = config.get_value(Config.parquet_file_name)
        self.__es_config = {
            'es_url': config.get_value(Config.es_url),
            'es_index': CDMSConstants.es_index_parquet_stats,
            'es_port': int(config.get_value(Config.es_port, '443')),
        }
        self.__parquet_name = self.__parquet_name if not self.__parquet_name.endswith('/') else self.__parquet_name[:-1]
        self.__missing_depth_value = CDMSConstants.missing_depth_value
        self.__conditions = []
        self.__sorting_columns = [CDMSConstants.time_col, CDMSConstants.platform_code_col, CDMSConstants.depth_col,
                                  CDMSConstants.lat_col, CDMSConstants.lon_col]
        self.__set_missing_depth_val()

    def __set_missing_depth_val(self):
        possible_missing_depth = Config().get_value(Config.missing_depth_value)
        if GeneralUtils.is_int(possible_missing_depth):
            self.__missing_depth_value = int(possible_missing_depth)
        return

    def __retrieve_spark(self):
        from parquet_flask.io_logic.retrieve_spark_session import RetrieveSparkSession
        spark = RetrieveSparkSession().retrieve_spark_session(self.__app_name, self.__master_spark)
        return spark

    def __strip_duplicates_maintain_order(self, condition_manager: ParquetQueryConditionManagementV4):
        LOGGER.warning(f'length of parquet_names: {len(condition_manager.parquet_names)}')
        distinct_list = []
        distinct_set = set()
        for each in condition_manager.parquet_names:
            each: PartitionedParquetPath = each
            parquet_path = each.generate_path()
            if parquet_path in distinct_set:
                continue
            distinct_set.add(parquet_path)
            distinct_list.append(each)
        LOGGER.warning(f'length of distinct_parquet_names: {len(distinct_list)}')
        LOGGER.warning(f'distinct_parquet_names: {distinct_set}')
        return distinct_list
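    # Note: __strip_duplicates_maintain_order keeps only the first occurrence
    # of each generated path, so the DataFrame union built below follows the
    # order in which condition_manager collected the partition paths.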
    def get_unioned_read_df(self, condition_manager: ParquetQueryConditionManagementV4, spark: SparkSession) -> DataFrame:
        cdms_spark_struct = CdmsSchema().get_schema_from_json(FileUtils.read_json(Config().get_value(Config.in_situ_schema)))
        if len(condition_manager.parquet_names) < 1:
            LOGGER.fatal('cannot find any parquet paths in ES. returning None instead of searching the entire parquet directory for now.')
            return None
        # read_df: DataFrame = spark.read.schema(cdms_spark_struct).parquet(condition_manager.parquet_name)
        # return read_df
        read_df_list = []
        distinct_parquet_names = self.__strip_duplicates_maintain_order(condition_manager)
        for each in distinct_parquet_names:
            each: PartitionedParquetPath = each
            try:
                # read each partition with the explicit schema, then re-attach
                # the partition values as literal columns
                temp_df: DataFrame = spark.read.schema(cdms_spark_struct).parquet(each.generate_path())
                for k, v in each.get_df_columns().items():
                    temp_df = temp_df.withColumn(k, lit(v))
                read_df_list.append(temp_df)
            except Exception:
                LOGGER.exception(f'failed to retrieve data from spark for: {each.generate_path()}')
        if len(read_df_list) < 1:
            return None
        main_read_df: DataFrame = read_df_list[0]
        for each in read_df_list[1:]:
            main_read_df = main_read_df.union(each)
        return main_read_df

    def __get_paged_result(self, result_df: DataFrame, total_result: int):
        remaining_size = total_result - self.__props.start_at
        current_page_size = remaining_size if remaining_size < self.__props.size else self.__props.size
        result = result_df.limit(self.__props.start_at + current_page_size).tail(current_page_size)
        return result

    def __get_paged_result_v2(self, result_df: DataFrame):
        # NOTE: currently unused; the call site in __get_page is commented out
        # in favor of __get_paged_result.
        offset = self.__props.start_at + self.__props.size
        limit = self.__props.size
        df = result_df.withColumn('_id', F.monotonically_increasing_id())
        df = df.where(F.col('_id').between(offset, offset + limit))
        return df.collect()

    def __is_in_old_page(self, current_item: dict) -> bool:
        # NOTE: currently unused helper from the older pagination logic.
        return current_item[CDMSConstants.time_col] == self.__props.min_datetime and current_item[CDMSConstants.platform_col]['code'] <= self.__props.marker_platform_code

    def __get_sorting_params(self, query_result: DataFrame):
        return [query_result[k].asc() for k in self.__sorting_columns]

    def __get_nth_first_page(self, query_result: DataFrame):
        # marker pagination: find the last row of the previous page (by sha256
        # of the row dict) among rows at min_datetime, then return the rows
        # that follow it
        result_head = query_result.where(f"{CDMSConstants.time_col} = '{self.__props.min_datetime}'").sort(self.__get_sorting_params(query_result)).collect()
        new_index = -1
        for i, each_row in enumerate(result_head):
            each_row: Row = each_row
            each_sha_256 = GeneralUtils.gen_sha_256_json_obj(each_row.asDict())
            if each_sha_256 == self.__props.marker_platform_code:
                new_index = i
                break
        if new_index < 0:
            LOGGER.warning(f'comparing sha256: {self.__props.marker_platform_code}')
            for each_row in result_head:
                each_row: Row = each_row
                each_sha_256 = GeneralUtils.gen_sha_256_json_obj(each_row.asDict())
                LOGGER.warning(f'each row: {str(each_row)}. each_sha_256: {each_sha_256}')
            raise ValueError('cannot find existing row. It should not happen.')
        result_page = query_result.take(self.__props.size + new_index + 1)
        result_tail = result_page[new_index + 1:]
        return result_tail

    def __get_page(self, query_result: DataFrame, total_result: int):
        if self.__props.size == 0:
            return []
        if self.__props.marker_platform_code is not None:  # pagination: new logic
            return self.__get_nth_first_page(query_result)
        if total_result < 0:
            raise ValueError('total_result is not calculated for old pagination logic. This should not happen. Something has gone horribly wrong.')
        # result = self.__get_paged_result_v2(query_result)
        return self.__get_paged_result(query_result, total_result)
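    # Pagination overview: __get_page supports two modes. Old logic is offset
    # pagination via __get_paged_result, which requires a pre-computed total
    # count. New logic is marker pagination, where marker_platform_code
    # carries the sha256 of the last row of the previous page and
    # __get_nth_first_page locates it among the rows at min_datetime.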
    def __get_total_count(self, query_result: DataFrame):
        if self.__props.marker_platform_code is not None:
            LOGGER.debug('not counting total since this is an Nth page')
            return -1
        LOGGER.debug('counting total')
        return int(query_result.count())

    def search(self, spark_session=None):
        LOGGER.debug('<delay_check> query_v4_search started')
        condition_manager = ParquetQueryConditionManagementV4(self.__parquet_name, self.__missing_depth_value, self.__es_config, self.__props)
        condition_manager.manage_query_props()
        conditions = ' AND '.join(condition_manager.conditions)
        query_begin_time = datetime.now()
        LOGGER.debug(f'<delay_check> query begins at {query_begin_time}')
        spark = self.__retrieve_spark() if spark_session is None else spark_session
        created_spark_session_time = datetime.now()
        LOGGER.debug(f'<delay_check> spark session created at {created_spark_session_time}. duration: {created_spark_session_time - query_begin_time}')
        LOGGER.debug(f'__parquet_name: {condition_manager.parquet_name}')
        read_df: DataFrame = self.get_unioned_read_df(condition_manager, spark)
        if read_df is None:
            return {
                'total': 0,
                'results': [],
            }
        read_df_time = datetime.now()
        LOGGER.debug(f'<delay_check> parquet read created at {read_df_time}. duration: {read_df_time - created_spark_session_time}')
        query_result = read_df.where(conditions)
        query_result = query_result.sort(self.__get_sorting_params(query_result))
        query_time = datetime.now()
        LOGGER.debug(f'<delay_check> parquet read filtered at {query_time}. duration: {query_time - read_df_time}')
        LOGGER.debug(f'<delay_check> total duration: {query_time - query_begin_time}')
        total_result = self.__get_total_count(query_result)
        LOGGER.debug(f'<delay_check> total calc count duration: {datetime.now() - query_time}')
        if self.__props.size < 1:
            LOGGER.debug(f'returning only the size: {total_result}')
            return {
                'total': total_result,
                'results': [],
            }
        query_time = datetime.now()
        # result = query_result.withColumn('_id', F.monotonically_increasing_id())
        removing_cols = [CDMSConstants.time_obj_col, CDMSConstants.year_col, CDMSConstants.month_col]
        # result = result.where(F.col('_id').between(self.__props.start_at, self.__props.start_at + self.__props.size)).drop(*removing_cols)
        if len(condition_manager.columns) > 0:
            query_result = query_result.select(condition_manager.columns)
        else:
            query_result = query_result.drop(*removing_cols)
        LOGGER.debug(f'<delay_check> returning size: {total_result}')
        result = self.__get_page(query_result, total_result)
        query_result.unpersist()
        LOGGER.debug(f'<delay_check> total retrieval duration: {datetime.now() - query_time}')
        # spark.stop()
        return {
            'total': total_result,
            'results': [k.asDict() for k in result],
        }
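
# Illustrative usage sketch (not part of the original module). The QueryProps
# attribute names below (start_at, size) are assumptions inferred from how
# this class reads them; consult query_v2.QueryProps for the real API.
#
#     from parquet_flask.io_logic.query_v2 import QueryProps
#     from parquet_flask.io_logic.query_v4 import QueryV4
#
#     props = QueryProps()
#     props.start_at = 0   # offset pagination ("old logic")
#     props.size = 20      # page size; a size < 1 returns only the total count
#     response = QueryV4(props).search()
#     # response == {'total': <int>, 'results': [<row dicts>]}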