courses/unstructured/PySpark-Test-Solution.ipynb (132 lines of code) (raw):
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pyspark import SparkContext\n",
"from pyspark.sql import SparkSession\n",
"\n",
"# The kernel may already provide `sc` and `spark`; only create them when they\n",
"# are missing so the notebook also runs top to bottom on a plain Python kernel.\n",
"if 'sc' not in locals():\n",
"    sc = SparkContext(\"local\")\n",
"\n",
"# `spark` is required by the Spark SQL cell further down. The builder picks up\n",
"# the already-active SparkContext rather than starting a second one.\n",
"if 'spark' not in locals():\n",
"    spark = SparkSession.builder.getOrCreate()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"499.5\n"
]
}
],
"source": [
"# Distribute the integers 0..999 over 10 partitions and print their mean.\n",
"rdd = sc.parallelize(range(1000), numSlices=10)\n",
"print(rdd.mean())"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"231\n"
]
}
],
"source": [
"# Sum a small list with Spark: reduce() combines elements pairwise using\n",
"# the supplied two-argument function.\n",
"data = [1, 2, 3, 5, 8, 13, 21, 34, 55, 89]\n",
"distData = sc.parallelize(data)\n",
"res = distData.reduce(lambda x, y: x + y)\n",
"print(res)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Row(animal=u'Frog', count(1)=2), Row(animal=u'Pig', count(1)=3), Row(animal=u'Cat', count(1)=5), Row(animal=u'Dog', count(1)=6)]\n"
]
}
],
"source": [
"# Import only the type classes used here instead of a wildcard import.\n",
"from pyspark.sql.types import StringType, StructField, StructType\n",
"\n",
"# Build one nullable string column per name in the comma-separated header.\n",
"header = 'animal,name'\n",
"schema = StructType([StructField(colname, StringType(), True) for colname in header.split(',')])\n",
"pets = spark.read.schema(schema).csv('gs://BUCKET_NAME/unstructured/pets.txt')\n",
"\n",
"# Register the DataFrame as a temp view so it can be queried with SQL,\n",
"# then count the rows for each animal.\n",
"pets.createOrReplaceTempView('pets')\n",
"countsByPet = spark.sql('SELECT animal, COUNT(*) from pets GROUP BY animal')\n",
"\n",
"print(countsByPet.take(10))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(u'Frog', [u'Kermit', u'Hoppy']), (u'Pig', [u'Bacon', u'Babe', u'Tasty']), (u'Dog', [u'Noir', u'Bree', u'Pickles', u'Sparky', u'Gigi', u'Fred']), (u'Cat', [u'Tom', u'Alley', u'Cleo', u'George', u'Suzy'])]\n"
]
}
],
"source": [
"file = sc.textFile(\"gs://BUCKET_NAME/unstructured/pets.txt\")\n",
"\n",
"# Split each line on commas, then key the second field (wrapped in a\n",
"# one-element list) by the first field.\n",
"pets = (\n",
"    file\n",
"    .map(lambda line: line.split(\",\"))\n",
"    .map(lambda fields: (fields[0], [fields[1]]))\n",
")\n",
"\n",
"# Concatenate the per-record lists so each key ends up with all its values.\n",
"petsByType = pets.reduceByKey(lambda left, right: left + right)\n",
"print(petsByType.take(10))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}