notebooks/load_and_run.ipynb (236 lines of code) (raw):

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Run probe-scraper and publish its output\n",
    "\n",
    "This notebook clones `probe-scraper`, runs its runner into a local output directory,\n",
    "and then uploads every output file, gzip-compressed, to the public S3 bucket configured below.\n",
    "\n",
    "**Note:** the checkout, output and cache directories configured below are deleted and recreated on every run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# This comes from https://github.com/harterrt/cookiecutter-python-etl/\n",
    "# Thanks Harter!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import boto3\n",
    "import botocore\n",
    "import os\n",
    "\n",
    "from io import BytesIO\n",
    "from gzip import GzipFile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "repo_dir = \"probe-scraper\"\n",
    "output_dir = \"/home/hadoop/analyses/probe_data\"\n",
    "cache_dir = \"/home/hadoop/analyses/probe_cache\"\n",
    "repo_https_url = \"https://github.com/mozilla/probe-scraper\"\n",
    "\n",
    "S3_PUBLIC_BUCKET = \"telemetry-public-analysis-2\"\n",
    "S3_DATA_PATH = \"probe-scraper/data-rest/\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetch, build and run probe-scraper\n",
    "\n",
    "Start from a clean slate: remove any previous checkout, output and cache, then clone, build and run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "!rm -rf $repo_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "!rm -rf $output_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "!rm -rf $cache_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "!git config --global user.email \"gfritzsche@mozilla.com\" && \\\n",
    "git config --global user.name \"Georg Fritzsche\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!git clone $repo_https_url $repo_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!cd $repo_dir && git pull origin master && python setup.py bdist_egg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# -p also creates missing parent directories, so this works on a fresh machine.\n",
    "!mkdir -p $output_dir && mkdir -p $cache_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!cd $repo_dir && pip install -r requirements.txt && python probe_scraper/runner.py --outdir $output_dir --tempdir $cache_dir"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Upload the output to S3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Get access to the S3 connect API.\n",
    "client = boto3.client('s3', 'us-west-2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def gzip_compress(source_file):\n",
    "    \"\"\" Apply GZIP compression to the content of the provided file.\n",
    "\n",
    "    The file is read as raw bytes, so this works on both Python 2 and 3.\n",
    "    The content is checked to be valid UTF-8 before it is compressed.\n",
    "\n",
    "    :param source_file: the absolute path of the file to compress.\n",
    "    :return: The gzip compressed content of the input file, as bytes.\n",
    "    :raises UnicodeDecodeError: if the file content is not valid UTF-8.\n",
    "    \"\"\"\n",
    "    with open(source_file, 'rb') as fi:\n",
    "        raw_body = fi.read()\n",
    "\n",
    "    # Validation only: reject content that is not UTF-8 text.\n",
    "    raw_body.decode('utf-8')\n",
    "\n",
    "    gz_body = BytesIO()\n",
    "    # Leaving the with-block closes the gzip stream, which writes the\n",
    "    # trailer into gz_body; it is also closed if the write fails.\n",
    "    with GzipFile(None, 'wb', 9, gz_body) as gz:\n",
    "        gz.write(raw_body)\n",
    "\n",
    "    return gz_body.getvalue()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for path, subdirs, files in os.walk(output_dir):\n",
    "    relative_path = os.path.relpath(path, output_dir)\n",
    "    # GZIP-compress the files, then copy them to S3. Allow caching for 8 hours.\n",
    "    for file_name in files:\n",
    "        source_path = os.path.join(path, file_name)\n",
    "        # relpath() yields \".\" for the top-level directory; normalize the key\n",
    "        # so it does not contain a literal \"./\" segment.\n",
    "        key_path = os.path.normpath(\n",
    "            os.path.join(S3_DATA_PATH, relative_path, file_name))\n",
    "        # Single-argument call form prints the same text on Python 2 and 3.\n",
    "        print(\"uploading \" + file_name + \" to s3: \" + key_path)\n",
    "        client.put_object(ACL='public-read', Bucket=S3_PUBLIC_BUCKET,\n",
    "                          Key=key_path, Body=gzip_compress(source_path),\n",
    "                          ContentEncoding='gzip', CacheControl='max-age=28800',\n",
    "                          ContentType='application/json')"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}