# Extracts the vertices and edges from the cleaned web pages Avro file.
# The vertices are written to vertex.scala; the edges to edges.scala.
from avro import schema, datafile, io
import re

# Trailing numeric suffix of a doc_id (e.g. "doc_00042" -> "00042").
# Compiled once instead of re-searching with a literal pattern per record.
_DOC_ID_SUFFIX = re.compile(r'\d+$')


def read_avro_file(
    input_path="/Users/vcedeno/Desktop/webpages/social_00000_v2",
    vertex_path="/Users/vcedeno/Desktop/webpages/vertex.scala",
    edges_path="/Users/vcedeno/Desktop/webpages/edges.scala",
    start_page_id=100000000,
):
    """Read cleaned web-page records from an Avro container file and emit
    Spark GraphX vertex/edge declarations as Scala source snippets.

    Each record contributes one vertex for its ``doc_id`` (the numeric
    suffix of the id, as a Long) plus one vertex and one edge (weight 200)
    per non-blank URL in its pipe-separated ``urls`` field.  URL vertices
    are assigned sequential ids starting at ``start_page_id``.

    Parameters
    ----------
    input_path : str
        Path of the Avro data file to read.
    vertex_path : str
        Output path for the Scala vertex-RDD snippet.
    edges_path : str
        Output path for the Scala edge-RDD snippet.
    start_page_id : int
        First synthetic vertex id handed out to URLs; chosen high so it
        cannot collide with doc-id-derived vertex ids.

    Raises
    ------
    AttributeError
        If a record's ``doc_id`` has no trailing digits (the regex search
        returns None) — behavior inherited from the original script.
    """
    pages = start_page_id
    rec_reader = io.DatumReader()
    # Context managers replace the original's bogus `file.close` /
    # `file2.close` (bare method references — missing `()` — that never
    # actually closed anything): the handles are now guaranteed to be
    # flushed and closed even if a record raises mid-loop.
    with open(input_path) as avro_in, \
         open(vertex_path, 'w') as vertex_out, \
         open(edges_path, 'w') as edges_out:
        df_reader = datafile.DataFileReader(avro_in, rec_reader)
        try:
            vertex_out.write(
                "import org.apache.spark._\n"
                "import org.apache.spark.graphx._\n"
                "import org.apache.spark.rdd.RDD\n"
                "val vertexRDD: RDD[(Long, (Int, Int, Int))] = "
                "sc.parallelize(Array("
            )
            edges_out.write(
                "val edgeRDD: RDD[Edge[Int]] = sc.parallelize(Array(")
            for record in df_reader:
                # `doc_num` instead of the original `id`, which shadowed
                # the builtin.  Crashes (AttributeError) if doc_id has no
                # numeric suffix, exactly as the original did.
                doc_num = int(_DOC_ID_SUFFIX.search(record['doc_id']).group())
                vertex_out.write("(" + str(doc_num) + "L,(0,0,0)),")
                for url in record['urls'].split("|"):
                    if url.strip():  # skip blank fragments between pipes
                        vertex_out.write("(" + str(pages) + "L,(0,0,0)),")
                        edges_out.write(
                            "Edge(" + str(doc_num) + "L,"
                            + str(pages) + "L,200),")
                        pages = pages + 1
        finally:
            df_reader.close()
    # NOTE(review): the emitted Scala `Array(` literals are never closed
    # with `))` and end with a trailing comma — presumably a later step
    # (or manual edit) finishes the files; confirm before relying on them.


if __name__ == '__main__':
    read_avro_file()