courses/streaming/fromSHRP/to_messages.py (39 lines of code) (raw):
#!/usr/bin/env python
import datetime
import gzip
INPUT='raw.csv'
OUTPUT='messages.csv.gz'
def notify(line, colnames, ofp):
# DATE,TIME,STATION_ID,LATITUDE,LONGITUDE,DISTRICT,FREEWAY_ID,FREEWAY_DIR,STATION_TYPE,LENGTH,SAMPLES,PCT_OBSERVED,FLOW,OCC,SPEED,LANE_1_SAMPLES,LANE_1_FLOW,LANE_1_OCC,LANE_1_SPEED,LANE_1_OBS,LANE_2_SAMPLES,LANE_2_FLOW,LANE_2_OCC,LANE_2_SPEED,LANE_2_OBS,LANE_3_SAMPLES,LANE_3_FLOW,LANE_3_OCC,LANE_3_SPEED,LANE_3_OBS,LANE_4_SAMPLES,LANE_4_FLOW,LANE_4_OCC,LANE_4_SPEED,LANE_4_OBS,LANE_5_SAMPLES,LANE_5_FLOW,LANE_5_OCC,LANE_5_SPEED,LANE_5_OBS,LANE_6_SAMPLES,LANE_6_FLOW,LANE_6_OCC,LANE_6_SPEED,LANE_6_OBS,LANE_7_SAMPLES,LANE_7_FLOW,LANE_7_OCC,LANE_7_SPEED,LANE_7_OBS,LANE_8_SAMPLES,LANE_8_FLOW,LANE_8_OCC,LANE_8_SPEED,LANE_8_OBS
incols = line.strip().split(',')
indict = {}
for name, value in zip(colnames, incols):
indict[name] = value
outcols = []
INTIME_FORMAT = '%m/%d/%Y %H:%M:%S'
intime = indict['DATE'] + ' ' + indict['TIME']
intime = datetime.datetime.strptime(intime, INTIME_FORMAT)
outcols.append( str(intime) )
if str(intime) > '2008-12-31':
exit(0) # limited to one year of data
for colname in 'LATITUDE,LONGITUDE,FREEWAY_ID,FREEWAY_DIR'.split(','):
outcols.append(indict[colname])
for laneno in range(1,9):
lane = str(laneno)
samples = indict['LANE_' + lane + '_SAMPLES']
occ = indict['LANE_' + lane + '_OCC']
speed = indict['LANE_' + lane + '_SPEED']
obs = indict['LANE_' + lane + '_OBS']
if len(samples) > 0 and len(occ) > 0 and len(speed) > 0 and len(obs) > 0:
lanemsg = list(outcols) # copy
lanemsg.append(lane)
# lanemsg.append(samples) # number of measurements
# lanemsg.append(occ) # number of occupants
lanemsg.append(speed) # speed
# lanemsg.append(obs)
ofp.write(','.join(lanemsg))
ofp.write('\n')
if __name__ == '__main__':
with open(INPUT, 'r') as ifp:
with gzip.open(OUTPUT, 'wb') as ofp:
ofp.write('TIMESTAMP,LATITUDE,LONGITUDE,FREEWAY_ID,FREEWAY_DIR,LANE,SPEED\n')
colnames = ifp.readline().strip().split(',')
for lineno, line in enumerate(ifp):
notify(line, colnames, ofp)
if lineno%10000 == 0:
print(lineno, line[:10])