marketing-analytics/predicting/ml-data-windowing-pipeline/DataVisualizationPipeline.java (59 lines of code) (raw):

// Copyright 2019 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package com.google.corp.gtech.ads.datacatalyst.components.mldatawindowingpipeline; import com.google.corp.gtech.ads.datacatalyst.components.mldatawindowingpipeline.model.Session; import com.google.corp.gtech.ads.datacatalyst.components.mldatawindowingpipeline.transform.MapFactToTableRow; import com.google.corp.gtech.ads.datacatalyst.components.mldatawindowingpipeline.transform.MapSessionToFacts; import com.google.corp.gtech.ads.datacatalyst.components.mldatawindowingpipeline.transform.MapSortedSessionsToUserActivities; import com.google.corp.gtech.ads.datacatalyst.components.mldatawindowingpipeline.transform.MapUserActivityToTableRow; import com.google.corp.gtech.ads.datacatalyst.components.mldatawindowingpipeline.transform.MapUserIdToSession; import com.google.corp.gtech.ads.datacatalyst.components.mldatawindowingpipeline.transform.SortSessionsByTime; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.io.AvroIO; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.transforms.GroupByKey; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.values.PCollection; /** * Pipeline for outputting the Facts and UserActivity data tables from the input user Sessions. 
*/ public class DataVisualizationPipeline { public static void main(String[] args) { DataVisualizationPipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as( DataVisualizationPipelineOptions.class); Pipeline pipeline = Pipeline.create(options); PCollection<Session> sessions = pipeline .apply(AvroIO.read(Session.class).from(options.getInputAvroSessionsLocation())); // Output Facts BigQuery table. sessions .apply("MapSessionToFacts", ParDo.of(new MapSessionToFacts())) .apply("MapFactToTableRow", ParDo.of(new MapFactToTableRow())) .apply("WriteFactsToBigQuery", BigQueryIO.writeTableRows() .to(options.getOutputBigQueryFactsTable()) .withoutValidation() .withSchema(MapFactToTableRow.getTableSchema()) .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE)); // Output UserActivity BigQuery table. sessions .apply("MapUserIdToSession", ParDo.of(new MapUserIdToSession())) .apply(GroupByKey.<String, Session>create()) .apply("SortSessionsByTime", ParDo.of(new SortSessionsByTime())) .apply( "MapSortedSessionsToUserActivities", ParDo.of( new MapSortedSessionsToUserActivities( options.getSnapshotStartDate(), options.getSnapshotEndDate(), options.getSlideTimeInSeconds(), options.getMinimumLookaheadTimeInSeconds(), options.getMaximumLookaheadTimeInSeconds(), options.getStopOnFirstPositiveLabel()))) .apply("MapUserActivityToTableRow", ParDo.of(new MapUserActivityToTableRow())) .apply( "WriteUserActivitiesToBigQuery", BigQueryIO.writeTableRows() .to(options.getOutputBigQueryUserActivityTable()) .withoutValidation() .withSchema(MapUserActivityToTableRow.getTableSchema()) .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE)); pipeline.run(); } }