# Morphline configuration (HOCON syntax) that turns HBase cells into Solr
# document fields for the "webpages" collection.
#
# NOTE: '#' comments run to the end of the physical line, so every setting
# must stay on its own line; collapsing this file onto a single line
# comments out everything that follows the first '#'.

# Shared Solr connection settings, referenced below via ${SOLR_LOCATOR}.
SOLR_LOCATOR : {
  # Name of solr collection
  collection : webpages

  # ZooKeeper ensemble
  zkHost : "node1.dlrl:2181/solr"
}

morphlines : [
  {
    id : morphline1

    # Packages searched for the morphline command implementations used below.
    importCommands : [
      "org.kitesdk.morphline.**",
      "com.ngdata.**",
      "com.cloudera.cdk.morphline.**",
      "org.apache.solr.**"
    ]

    commands : [
      # Step 1: copy HBase cells into record fields. Each mapping names an
      # input column and the record field that receives it as a string.
      # NOTE(review): inputColumn looks like "<column family>:<qualifier>" and
      # "source: value" presumably selects the cell value rather than the
      # qualifier -- confirm against the HBase Indexer documentation.
      {
        extractHBaseCells {
          mappings : [
            # Columns prefixed "original:".
            {
              inputColumn : "original:title"
              outputField : "title"
              type : string
              source : value
            }
            {
              inputColumn : "original:domain"
              outputField : "domain"
              type : string
              source : value
            }
            {
              inputColumn : "original:url"
              outputField : "url"
              type : string
              source : value
            }
            {
              inputColumn : "original:collection"
              outputField : "collection"
              type : string
              source : value
            }
            {
              inputColumn : "original:text"
              outputField : "text"
              type : string
              source : value
            }

            # Columns prefixed "analysis:".
            {
              inputColumn : "analysis:cluster_label"
              outputField : "cluster_label"
              type : string
              source : value
            }
            {
              inputColumn : "analysis:cluster_id"
              outputField : "cluster_id"
              type : string
              source : value
            }

            # The "*_multiple" fields below are intermediate: each one is fed
            # to a split command in step 2.
            {
              inputColumn : "analysis:ner_people"
              outputField : "ner_people_multiple"
              type : string
              source : value
            }
            {
              inputColumn : "analysis:ner_locations"
              outputField : "ner_locations_multiple"
              type : string
              source : value
            }
            {
              inputColumn : "analysis:ner_dates"
              outputField : "ner_dates_multiple"
              type : string
              source : value
            }
            {
              inputColumn : "analysis:ner_organizations"
              outputField : "ner_organizations_multiple"
              type : string
              source : value
            }
            {
              inputColumn : "analysis:importance"
              outputField : "social_vector_json"
              type : string
              source : value
            }
            {
              inputColumn : "analysis:class"
              outputField : "classification_labels_multiple"
              type : string
              source : value
            }
            {
              inputColumn : "analysis:lda_topics"
              outputField : "lda_topics_multiple"
              type : string
              source : value
            }
          ]
        }
      }

      # Step 2: break each "*_multiple" field apart on "|" and write the
      # pieces to the matching field without the "_multiple" suffix.
      # NOTE(review): isRegex is not set, so "|" is presumably matched as a
      # literal pipe rather than as regex alternation -- confirm against the
      # Kite version in use.
      {
        split {
          inputField : "ner_people_multiple"
          outputField : "ner_people"
          separator : "|"
        }
      }
      {
        split {
          inputField : "ner_locations_multiple"
          outputField : "ner_locations"
          separator : "|"
        }
      }
      {
        split {
          inputField : "ner_dates_multiple"
          outputField : "ner_dates"
          separator : "|"
        }
      }
      {
        split {
          inputField : "ner_organizations_multiple"
          outputField : "ner_organizations"
          separator : "|"
        }
      }
      {
        split {
          inputField : "classification_labels_multiple"
          outputField : "classification_labels"
          separator : "|"
        }
      }
      {
        split {
          inputField : "lda_topics_multiple"
          outputField : "lda_topics"
          separator : "|"
        }
      }

      # This command deletes record fields that are unknown to Solr
      # schema.xml. Solr throws an exception on any attempt to load a
      # document that contains a field that is not specified in schema.xml.
      # NOTE(review): the intermediate "*_multiple" fields are presumably
      # dropped here unless schema.xml declares them -- confirm.
      {
        sanitizeUnknownSolrFields {
          # Location from which to fetch Solr schema
          solrLocator : ${SOLR_LOCATOR}
        }
      }

      # Convert the created_at field from Unix epoch seconds to the native
      # Solr timestamp format, e.g. 1346915674 to 2012-09-06T07:14:34.000Z
      # NOTE(review): no mapping in this file produces created_at, and this
      # step runs after sanitizeUnknownSolrFields -- confirm the field is
      # still expected to be present at this point.
      {
        convertTimestamp {
          field : created_at
          inputFormats : ["unixTimeInSeconds"]
          inputTimezone : UTC
          outputFormat : "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"
          outputTimezone : UTC
        }
      }

      # Log the finished record at TRACE level.
      {
        logTrace {
          format : "output record: {}"
          args : ["@{}"]
        }
      }
    ]
  }
]