{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "// imports\n", "import de.l3s.archivespark._\n", "import de.l3s.archivespark.implicits._\n", "import de.l3s.archivespark.enrich._\n", "import de.l3s.archivespark.enrich.functions._\n", "import de.l3s.archivespark.specific.warc.implicits._\n", "import de.l3s.archivespark.specific.warc._\n", "import de.l3s.archivespark.specific.warc.specs._" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "// load CDX and WARC (lazily) from the local filesystem, using the HDFS-based spec\n", "val rdd = ArchiveSpark.load(sc, WarcCdxHdfsSpec(\"/cdx/*.cdx\", \"/warc\"))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "3" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// count records\n", "rdd.count" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{\n", " \"record\":{\n", " \"redirectUrl\":\"-\",\n", " \"timestamp\":\"20140103030321\",\n", " \"digest\":\"B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A\",\n", " \"originalUrl\":\"http://example.com?example=1\",\n", " \"surtUrl\":\"com,example)/?example=1\",\n", " \"mime\":\"text/html\",\n", " \"compressedSize\":1043,\n", " \"meta\":\"-\",\n", " \"status\":200\n", " }\n", "}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// fetch the first record as a JSON string (CDX fields only, no WARC content)\n", "rdd.first.toJsonString" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{\n", " \"httpStatusLine\":\"HTTP/1.1 200 OK\",\n", " \"recordHeader\":{\n", " \"WARC-Target-URI\":\"http://example.com?example=1\",\n", " \"WARC-Date\":\"2014-01-03T03:03:21Z\",\n", " \"WARC-Type\":\"response\",\n", " 
\"Content-Length\":\"1610\",\n", " \"WARC-Payload-Digest\":\"sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A\",\n", " \"Content-Type\":\"application/http; msgtype=response\",\n", " \"absolute-offset\":\"0\",\n", " \"WARC-Record-ID\":\"\",\n", " \"reader-identifier\":\"example.warc.gz\",\n", " \"WARC-Warcinfo-ID\":\"\"\n", " },\n", " \"payload\":\"bytes(length: 1270)\",\n", " \"httpHeader\":{\n", " \"Last-Modified\":\"Fri, 09 Aug 2013 23:54:35 GMT\",\n", " \"X-Cache\":\"HIT\",\n", " \"Server\":\"ECS (sjc/4FCE)\",\n", " \"Accept-Ranges\":\"bytes\",\n", " \"Etag\":\"\\\"359670651\\\"\",\n", " \"Expires\":..." ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// retrieve corresponding WARC records\n", "rdd.enrich(WarcPayload).first.toJsonString" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// get html records with status code = 200\n", "val htmlOnline = rdd.filter(r => r.status == 200 && r.mime == \"text/html\")\n", "htmlOnline.count" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{\n", " \"record\":{\n", " \"redirectUrl\":\"-\",\n", " \"timestamp\":\"20140103030321\",\n", " \"digest\":\"B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A\",\n", " \"originalUrl\":\"http://example.com?example=1\",\n", " \"surtUrl\":\"com,example)/?example=1\",\n", " \"mime\":\"text/html\",\n", " \"compressedSize\":1043,\n", " \"meta\":\"-\",\n", " \"status\":200\n", " },\n", " \"payload\":{\n", " \"string\":\"\\n\\n\\n Example Domain\\n\\n \\n \\n \\n