From dab668a1adaee21c9bfc41bb1b146227f1844f7d Mon Sep 17 00:00:00 2001 From: Ed Lewis Date: Fri, 11 Dec 2015 14:09:11 +0000 Subject: [PATCH] EmrEtlRunner: added support for ndjson loader format (closes #2251) WIP runner_spec remove pp EmrEtlRunner: add urbanairship example to sample config.yml --- 3-enrich/emr-etl-runner/config/config.yml.sample | 2 +- .../lib/snowplow-emr-etl-runner/runner.rb | 2 +- .../spec/snowplow-emr-etl-runner/runner_spec.rb | 13 ++++++++++++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/3-enrich/emr-etl-runner/config/config.yml.sample b/3-enrich/emr-etl-runner/config/config.yml.sample index 2e4777d469..1adcec3a2d 100644 --- a/3-enrich/emr-etl-runner/config/config.yml.sample +++ b/3-enrich/emr-etl-runner/config/config.yml.sample @@ -46,7 +46,7 @@ aws: task_instance_bid: 0.015 # In USD. Adjust bid, or leave blank for non-spot-priced (i.e. on-demand) task instances bootstrap_failure_tries: 3 # Number of times to attempt the job in the event of bootstrap failures collectors: - format: cloudfront # Or 'clj-tomcat' for the Clojure Collector, or 'thrift' for Thrift records, or 'tsv/com.amazon.aws.cloudfront/wd_access_log' for Cloudfront access logs + format: cloudfront # For example: 'clj-tomcat' for the Clojure Collector, 'thrift' for Thrift records, 'tsv/com.amazon.aws.cloudfront/wd_access_log' for Cloudfront access logs or 'ndjson/urbanairship.connect/v1' for UrbanAirship Connect events enrich: job_name: Snowplow ETL # Give your job a name versions: diff --git a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/runner.rb b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/runner.rb index 1fbfc30603..2187b2b052 100644 --- a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/runner.rb +++ b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/runner.rb @@ -22,7 +22,7 @@ class Runner include Contracts # Supported options - @@collector_format_regex = /^(?:cloudfront|clj-tomcat|thrift|(?:json\/.+\/.+)|(?:tsv\/.+\/.+))$/ + @@collector_format_regex = /^(?:cloudfront|clj-tomcat|thrift|(?:json\/.+\/.+)|(?:tsv\/.+\/.+)|(?:ndjson\/.+\/.+))$/ @@skip_options = Set.new(%w(staging s3distcp emr enrich shred elasticsearch archive_raw)) include Monitoring::Logging diff --git a/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/runner_spec.rb b/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/runner_spec.rb index e840bb5365..1208896f8d 100644 --- a/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/runner_spec.rb +++ b/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/runner_spec.rb @@ -14,7 +14,6 @@ # License:: Apache License Version 2.0 require 'spec_helper' -require 'pp' Runner = Snowplow::EmrEtlRunner::Runner Cli = Snowplow::EmrEtlRunner::Cli @@ -155,4 +154,16 @@ def get_mock_config Runner.new args, config, enrichments, resolver end + it 'should accept the ndjson collector format' do + args, config, enrichments, resolver = get_mock_config + config[:collectors][:format] = "ndjson" + expect {Runner.new args, config, enrichments, resolver}.to raise_exception(ConfigError, "collector_format 'ndjson' not supported") + config[:collectors][:format] = "ndjson/something" + expect {Runner.new args, config, enrichments, resolver}.to raise_exception(ConfigError, "collector_format 'ndjson/something' not supported") + config[:collectors][:format] = "ndjson/something/" + expect {Runner.new args, config, enrichments, resolver}.to raise_exception(ConfigError, "collector_format 'ndjson/something/' not supported") + config[:collectors][:format] = "ndjson/something/something" + Runner.new args, config, enrichments, resolver + end + end