# Morphline configuration: reads tweet cells from HBase, splits the
# pipe-delimited multi-value columns into multi-valued fields, drops fields
# Solr does not know about, and normalizes the created_at timestamp.

# Location of the target Solr collection; referenced below as ${SOLR_LOCATOR}.
SOLR_LOCATOR : {
  # Name of solr collection
  collection : tweets

  # ZooKeeper ensemble
  zkHost : "node1.dlrl:2181/solr"
}

morphlines: [
  {
    id: morphline1

    # Packages searched for morphline command implementations.
    importCommands: [
      "org.kitesdk.morphline.**",
      "com.ngdata.**",
      "com.cloudera.cdk.morphline.**",
      "org.apache.solr.**"
    ]

    commands: [
      # Step 1: copy HBase cell values ("family:qualifier") into record fields.
      # Output fields ending in "_multiple" are intermediate: each one is
      # split on "|" in step 2 into the field of the same name without the
      # suffix.
      {
        extractHBaseCells {
          mappings: [
            # --- "original" column family ---
            { inputColumn: "original:tweet_id", outputField: "tweet_id", type: long, source: value }
            { inputColumn: "original:text_clean", outputField: "text", type: string, source: value }
            # Read as a string here; converted to Solr date format in step 4.
            { inputColumn: "original:created_at", outputField: "created_at", type: string, source: value }
            { inputColumn: "original:collection", outputField: "collection", type: string, source: value }
            { inputColumn: "original:source", outputField: "source", type: string, source: value }
            { inputColumn: "original:user_screen_name", outputField: "user_screen_name", type: string, source: value }
            { inputColumn: "original:user_id", outputField: "user_id", type: string, source: value }
            { inputColumn: "original:lang", outputField: "lang", type: string, source: value }
            { inputColumn: "original:retweet_count", outputField: "retweet_count", type: int, source: value }
            { inputColumn: "original:favorite_count", outputField: "favorite_count", type: int, source: value }
            { inputColumn: "original:contributors_id", outputField: "contributors_id", type: string, source: value }
            { inputColumn: "original:coordinates", outputField: "coordinates", type: string, source: value }
            { inputColumn: "original:urls", outputField: "urls_multiple", type: string, source: value }
            { inputColumn: "original:hashtags", outputField: "hashtags_multiple", type: string, source: value }
            { inputColumn: "original:user_mentions_id", outputField: "user_mentions_id_multiple", type: string, source: value }
            { inputColumn: "original:in_reply_to_user_id", outputField: "in_reply_to_user_id", type: string, source: value }
            { inputColumn: "original:in_reply_to_status_id", outputField: "in_reply_to_status_id", type: string, source: value }

            # --- "analysis" column family ---
            { inputColumn: "analysis:cluster_label", outputField: "cluster_label", type: string, source: value }
            { inputColumn: "analysis:cluster_id", outputField: "cluster_id", type: string, source: value }
            { inputColumn: "analysis:ner_people", outputField: "ner_people_multiple", type: string, source: value }
            { inputColumn: "analysis:ner_locations", outputField: "ner_locations_multiple", type: string, source: value }
            { inputColumn: "analysis:ner_dates", outputField: "ner_dates_multiple", type: string, source: value }
            { inputColumn: "analysis:ner_organizations", outputField: "ner_organizations_multiple", type: string, source: value }
            # NOTE(review): column "importance" feeds field "social_vector_json";
            # the names differ -- confirm this mapping is intended.
            { inputColumn: "analysis:importance", outputField: "social_vector_json", type: string, source: value }
            { inputColumn: "analysis:class", outputField: "classification_labels_multiple", type: string, source: value }
            { inputColumn: "analysis:lda_topics", outputField: "lda_topics_multiple", type: string, source: value }
          ]
        }
      }

      # Step 2: split each "|"-separated "*_multiple" field into its
      # multi-valued counterpart.
      { split { inputField: "urls_multiple", outputField: "urls", separator: "|" } }
      { split { inputField: "hashtags_multiple", outputField: "hashtags", separator: "|" } }
      { split { inputField: "user_mentions_id_multiple", outputField: "user_mentions_id", separator: "|" } }
      { split { inputField: "ner_people_multiple", outputField: "ner_people", separator: "|" } }
      { split { inputField: "ner_locations_multiple", outputField: "ner_locations", separator: "|" } }
      { split { inputField: "ner_dates_multiple", outputField: "ner_dates", separator: "|" } }
      { split { inputField: "ner_organizations_multiple", outputField: "ner_organizations", separator: "|" } }
      { split { inputField: "classification_labels_multiple", outputField: "classification_labels", separator: "|" } }
      { split { inputField: "lda_topics_multiple", outputField: "lda_topics", separator: "|" } }

      # Step 3: this command deletes record fields that are unknown to Solr
      # schema.xml. Solr throws an exception on any attempt to load a
      # document that contains a field that is not specified in schema.xml.
      # NOTE(review): presumably this is also what discards the intermediate
      # "*_multiple" fields -- confirm they are absent from schema.xml.
      {
        sanitizeUnknownSolrFields {
          # Location from which to fetch Solr schema
          solrLocator : ${SOLR_LOCATOR}
        }
      }

      # Step 4: convert created_at from Unix epoch seconds (UTC) to the
      # native Solr timestamp format,
      # e.g. 1346915674 becomes 2012-09-06T07:14:34.000Z
      {
        convertTimestamp {
          field : created_at
          inputFormats : ["unixTimeInSeconds"]
          inputTimezone : UTC
          outputFormat : "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"
          outputTimezone : UTC
        }
      }

      # Step 5: emit the final record at TRACE level for debugging.
      { logTrace { format : "output record: {}", args : ["@{}"] } }
    ]
  }
]