notebooks/load_and_run.ipynb (236 lines of code) (raw):
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# This comes from https://github.com/harterrt/cookiecutter-python-etl/\n",
"# Thanks Harter!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import boto3\n",
"import botocore\n",
"import os\n",
"\n",
"from io import BytesIO\n",
"from gzip import GzipFile"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Directory the probe-scraper repository is cloned into (a relative path,\n",
"# so it is resolved against the notebook's working directory).\n",
"repo_dir = \"probe-scraper\"\n",
"# Passed to the runner as --outdir; every file found under it is uploaded to S3.\n",
"output_dir = \"/home/hadoop/analyses/probe_data\"\n",
"# Passed to the runner as --tempdir.\n",
"cache_dir = \"/home/hadoop/analyses/probe_cache\"\n",
"# Remote the repository is cloned from.\n",
"repo_https_url = \"https://github.com/mozilla/probe-scraper\"\n",
"\n",
"# Destination bucket and key prefix used when uploading the output files.\n",
"S3_PUBLIC_BUCKET = \"telemetry-public-analysis-2\"\n",
"S3_DATA_PATH = \"probe-scraper/data-rest/\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Remove any checkout left over from a previous run.\n",
"!rm -rf {repo_dir}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Remove output from previous runs; everything under output_dir gets uploaded later.\n",
"!rm -rf {output_dir}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Remove the directory that is handed to the runner as --tempdir.\n",
"!rm -rf {cache_dir}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Set the current user's global git identity: user.email first, then user.name.\n",
"!git config --global user.email \"gfritzsche@mozilla.com\" && git config --global user.name \"Georg Fritzsche\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Fetch a fresh copy of the repository into repo_dir.\n",
"!git clone {repo_https_url} {repo_dir}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Update the checkout from origin/master, then build an egg with setup.py.\n",
"!cd {repo_dir} && git pull origin master && python setup.py bdist_egg"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Create the output and temp directories. -p also creates missing parent\n",
"# directories and does not fail when a directory already exists, so this\n",
"# cell can be re-run safely.\n",
"!mkdir -p {output_dir} {cache_dir}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Install the scraper's requirements, then run it with output_dir as --outdir\n",
"# and cache_dir as --tempdir.\n",
"!cd {repo_dir} && pip install -r requirements.txt && python probe_scraper/runner.py --outdir {output_dir} --tempdir {cache_dir}"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Upload the output to S3."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Create the S3 client (us-west-2 region) used for the uploads.\n",
"client = boto3.client(service_name='s3', region_name='us-west-2')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def gzip_compress(source_file):\n",
"    \"\"\"Return the gzip-compressed content of a file.\n",
"\n",
"    The file is read in binary mode and its bytes are compressed unchanged.\n",
"    This avoids a text-mode read followed by a decode/encode round trip and\n",
"    works on both Python 2 and Python 3.\n",
"\n",
"    :param source_file: path of the file to compress.\n",
"    :return: the gzip-compressed content of the input file, as bytes.\n",
"    :raises UnicodeDecodeError: if the file content is not valid UTF-8.\n",
"    \"\"\"\n",
"    with open(source_file, 'rb') as fi:\n",
"        raw_body = fi.read()\n",
"\n",
"    # Validation only: reject non-UTF-8 content up front. The bytes that\n",
"    # get compressed are the original ones.\n",
"    raw_body.decode('utf-8')\n",
"\n",
"    gz_body = BytesIO()\n",
"    # The context manager closes the gzip stream (which writes its trailer)\n",
"    # even if the write raises.\n",
"    with GzipFile(fileobj=gz_body, mode='wb', compresslevel=9) as gz:\n",
"        gz.write(raw_body)\n",
"\n",
"    return gz_body.getvalue()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Upload every file under output_dir to S3, mirroring the directory layout\n",
"# below the S3_DATA_PATH key prefix.\n",
"for path, subdirs, files in os.walk(output_dir):\n",
"    relative_path = os.path.relpath(path, output_dir)\n",
"    # relpath() returns \".\" for output_dir itself; leave it out so that\n",
"    # top-level files do not get a literal \"./\" segment in their S3 key.\n",
"    if relative_path == os.curdir:\n",
"        key_prefix = S3_DATA_PATH\n",
"    else:\n",
"        key_prefix = os.path.join(S3_DATA_PATH, relative_path)\n",
"\n",
"    # GZIP-compress the files, then copy them to S3. Allow caching for 8 hours.\n",
"    for file_name in files:\n",
"        source_path = os.path.join(path, file_name)\n",
"        key_path = os.path.join(key_prefix, file_name)\n",
"        # Parenthesised single-argument form prints the same on Python 2 and 3.\n",
"        print(\"uploading \" + file_name + \" to s3: \" + key_path)\n",
"        client.put_object(ACL='public-read', Bucket=S3_PUBLIC_BUCKET,\n",
"                          Key=key_path, Body=gzip_compress(source_path),\n",
"                          ContentEncoding='gzip', CacheControl='max-age=28800',\n",
"                          ContentType='application/json')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda root]",
"language": "python",
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}