# Writes the id and url of the webpages from the avro file to a text file
# that is used by the graphEdges.py script.
from avro import schema, datafile, io
import re

# Default input/output locations, used when read_avro_file() is called
# without arguments (as the __main__ guard below does).
DEFAULT_AVRO_PATH = "/Users/vcedeno/Desktop/webpages/social_00000_v2"
DEFAULT_OUTPUT_PATH = "/Users/vcedeno/Desktop/webpages/vertexID.scala"

# Matches the run of digits at the very end of a doc_id; compiled once so the
# per-record loop does not repeat the pattern lookup.
_TRAILING_DIGITS = re.compile(r'\d+$')


def read_avro_file(avro_path=DEFAULT_AVRO_PATH, output_path=DEFAULT_OUTPUT_PATH):
    """Dump "<id> <url>" lines for every record of an Avro data file.

    For each record, the numeric id is taken from the trailing digits of
    record['doc_id']. One line is written for the record's own
    record['url'], followed by one line for every non-blank entry of
    record['urls'] (a "|"-separated string). Every line has the form
    "<id> <url>" and ends with a newline.

    Args:
        avro_path: Path of the Avro data file to read.
        output_path: Path of the text file to create or overwrite.

    Raises:
        AttributeError: If a record's doc_id does not end in digits
            (re.search returns None in that case).
    """
    rec_reader = io.DatumReader()
    # The Avro file is opened in binary mode because it is not text; the
    # `with` block guarantees both handles are closed even if a record fails.
    with open(avro_path, "rb") as avro_file, open(output_path, "w") as out:
        df_reader = datafile.DataFileReader(avro_file, rec_reader)
        for record in df_reader:
            doc_id = int(_TRAILING_DIGITS.search(record['doc_id']).group())
            out.write(str(doc_id) + " " + str(record['url']) + "\n")
            for url in record['urls'].split("|"):
                # Skip empty / whitespace-only entries; the url itself is
                # written unstripped.
                if url.strip():
                    out.write(str(doc_id) + " " + str(url) + "\n")


if __name__ == '__main__':
    read_avro_file()