courses/streaming/publish/send_sensor_data.py (63 lines of code) (raw):
#!/usr/bin/env python3
# Copyright 2018 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import gzip
import logging
import argparse
import datetime
from google.cloud import pubsub
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
TOPIC = 'sandiego'
INPUT = 'sensor_obs2008.csv.gz'
def publish(publisher, topic, events):
numobs = len(events)
if numobs > 0:
logging.info('Publishing {0} events from {1}'.format(numobs, get_timestamp(events[0])))
for event_data in events:
publisher.publish(topic,event_data)
def get_timestamp(line):
## convert from bytes to str
line = line.decode('utf-8')
# look at first field of row
timestamp = line.split(',')[0]
return datetime.datetime.strptime(timestamp, TIME_FORMAT)
def simulate(topic, ifp, firstObsTime, programStart, speedFactor):
# sleep computation
def compute_sleep_secs(obs_time):
time_elapsed = (datetime.datetime.utcnow() - programStart).seconds
sim_time_elapsed = ((obs_time - firstObsTime).days * 86400.0 + (obs_time - firstObsTime).seconds) / speedFactor
to_sleep_secs = sim_time_elapsed - time_elapsed
return to_sleep_secs
topublish = list()
for line in ifp:
event_data = line # entire line of input CSV is the message
obs_time = get_timestamp(line) # from first column
# how much time should we sleep?
if compute_sleep_secs(obs_time) > 1:
# notify the accumulated topublish
publish(publisher, topic, topublish) # notify accumulated messages
topublish = list() # empty out list
# recompute sleep, since notification takes a while
to_sleep_secs = compute_sleep_secs(obs_time)
if to_sleep_secs > 0:
logging.info('Sleeping {} seconds'.format(to_sleep_secs))
time.sleep(to_sleep_secs)
topublish.append(event_data)
# left-over records; notify again
publish(publisher, topic, topublish)
def peek_timestamp(ifp):
# peek ahead to next line, get timestamp and go back
pos = ifp.tell()
line = ifp.readline()
ifp.seek(pos)
return get_timestamp(line)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Send sensor data to Cloud Pub/Sub in small groups, simulating real-time behavior')
parser.add_argument('--speedFactor', help='Example: 60 implies 1 hour of data sent to Cloud Pub/Sub in 1 minute', required=True, type=float)
parser.add_argument('--project', help='Example: --project $DEVSHELL_PROJECT_ID', required=True)
args = parser.parse_args()
# create Pub/Sub notification topic
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
publisher = pubsub.PublisherClient()
event_type = publisher.topic_path(args.project,TOPIC)
try:
publisher.get_topic(event_type)
logging.info('Reusing pub/sub topic {}'.format(TOPIC))
except:
publisher.create_topic(event_type)
logging.info('Creating pub/sub topic {}'.format(TOPIC))
# notify about each line in the input file
programStartTime = datetime.datetime.utcnow()
with gzip.open(INPUT, 'rb') as ifp:
header = ifp.readline() # skip header
firstObsTime = peek_timestamp(ifp)
logging.info('Sending sensor data from {}'.format(firstObsTime))
simulate(event_type, ifp, firstObsTime, programStartTime, args.speedFactor)