# mrjob.conf.example

# This is basically the config file we use in production at Yelp, with some
# strategic edits. ;)
#
# If you don't have the yaml module installed, you'll have to use JSON instead,
# which would look something like this:
#
# {"runners": {
#    "emr": {
#      "aws_access_key_id": "HADOOPHADOOPBOBADOOP",
#      "aws_region": "us-west-1",
#      "aws_secret_access_key": "MEMIMOMADOOPBANANAFANAFOFADOOPHADOOP",
#      "base_tmp_dir": "/scratch/$USER",
#      "bootstrap_python_packages": [
#        "$BT/aws/python-packages/*.tar.gz"
#      ],
#      ...
#
runners:
  emr:
    aws_access_key_id: 'HADOOPHADOOPBOBADOOP'
    # We run in the west region because we're located on the west coast,
    # and there are no eventual consistency issues with newly created S3 keys.
    aws_region: 'us-west-1'
    aws_secret_access_key: 'MEMIMOMADOOPBANANAFANAFOFADOOPHADOOP'
    # A temporary security token can be used instead of the keys above.
    # aws_security_token: AREALLYLONGHASH
    # Alternate tmp dir.
    base_tmp_dir: '/scratch/$USER'
    # $BT is the path to our source tree. Any module tarball dumped into
    # this directory gets installed on EMR, thanks to the glob below.
    bootstrap_python_packages:
      - '$BT/aws/python-packages/*.tar.gz'
    # Specifying an ssh key pair allows us to ssh tunnel to the job tracker
    # and to fetch logs via ssh.
    ec2_key_pair: 'EMR'
    ec2_key_pair_file: '$BT/config/EMR.pem'
    # Use beefier instances in production...
    ec2_instance_type: 'c1.xlarge'
    # ...but only use one of them unless overridden.
    num_ec2_instances: 1
    # Use our local time zone (this is important for deciding when days
    # start and end, for instance).
    cmdenv:
      TZ: 'America/Los_Angeles'
    # The src-tree.tar.gz tarball is created with a Makefile. It only
    # contains a subset of our code.
    # (Anchored as &python_archives so the hadoop runner can alias it.)
    python_archives: &python_archives
      - '$BT/aws/src-tree.tar.gz'
    # Our bucket also lives in the us-west region.
    s3_log_uri: 's3://walrus/tmp/logs/'
    s3_scratch_uri: 's3://walrus/tmp/'
    # (Anchored as &setup_cmds so the hadoop runner can alias it.)
    setup_cmds: &setup_cmds
      # config.py and secret.py are different between dev and production,
      # so they're uploaded separately and symlinked into the source tree.
      # Copying them into place isn't safe because src-tree.tar.gz is
      # actually shared between several mappers/reducers. Another safe
      # approach would be to add a rule to Makefile.emr that copies these
      # files if they haven't already been copied (setup_cmds from two
      # mappers/reducers won't run simultaneously on the same machine).
      - 'ln -sf $(readlink -f config.py) src-tree.tar.gz/config/config.py'
      - 'ln -sf $(readlink -f secret.py) src-tree.tar.gz/config/secret.py'
      # Run Makefile.emr to compile C code (EMR has a different
      # architecture, so we can't just upload the .so files).
      - 'cd src-tree.tar.gz; make -f Makefile.emr'
    # Generally, we run jobs on a Linux server separate from our desktop
    # machine, so the SSH tunnel needs to be open for a browser on the
    # desktop machine to be able to connect to it.
    ssh_tunnel_is_open: true
    ssh_tunnel_to_job_tracker: true
    # Upload these particular files on the fly because they're different
    # between development and production.
    # (Anchored as &upload_files so the hadoop runner can alias it.)
    upload_files: &upload_files
      - '$BT/config/config.py'
      - '$BT/config/secret.py'
  hadoop:
    # We don't currently run our own hadoop cluster, so this section is
    # pretty boring.
    base_tmp_dir: '/scratch/$USER'
    # Note the use of YAML references: each *alias below re-uses the value
    # anchored under the same name (&name) in the EMR config.
    python_archives: *python_archives
    setup_cmds: *setup_cmds
    upload_files: *upload_files
  local:
    # Production machines don't have gcc installed, so if we have to run an
    # MRJob in local mode there, don't run the Makefile and whatnot; just
    # fall back on the original copy of the code.
    base_tmp_dir: '/scratch/$USER'