# Writes the social importance score of each webpage into an Avro file.
#
# Input is a text file with one webpage per line: the webpage identifier and
# its score, separated by a single space. Output is a deflate-compressed Avro
# container holding one PageSocial record per input line.
from avro import schema, datafile, io

INFILE_NAME = '/Users/vcedeno/Desktop/webpages/webpages.txt'
OUTFILE_NAME = '/Users/vcedeno/Desktop/webpages/pagesocial.avro'

# Prefix prepended to the identifier read from the input file to build doc_id.
DOC_ID_PREFIX = "shooting_B--webpage--"

SCHEMA_STR = """{
    "namespace": "cs5604.tweet.social",
    "type": "record",
    "name": "PageSocial",
    "fields": [
        { "name": "doc_id" , "type": "string" },
        { "doc": "analysis" , "name": "social_importance", "type": ["double", "null"]}
    ]
}"""

SCHEMA = schema.parse(SCHEMA_STR)


def makeObject(doc_id, social_importance):
    """Build the dict for one PageSocial record.

    Args:
        doc_id: Value stored under the 'doc_id' key.
        social_importance: Value stored under the 'social_importance' key.

    Returns:
        A dict with the keys 'doc_id' and 'social_importance'.
    """
    return {'doc_id': doc_id, 'social_importance': social_importance}


def write_avro_file(input_path=INFILE_NAME, output_path=OUTFILE_NAME):
    """Convert the space-separated score file into an Avro container file.

    Each non-blank input line is split on single spaces; the first field is
    the webpage identifier (DOC_ID_PREFIX is prepended to it) and the second
    field is parsed as a float score. One record is appended per line.

    Args:
        input_path: Path of the text file to read. Defaults to INFILE_NAME.
        output_path: Path of the Avro file to write. Defaults to OUTFILE_NAME.

    Raises:
        IndexError: If a non-blank line has no second field.
        ValueError: If the second field is not a valid float.
    """
    rec_writer = io.DatumWriter(SCHEMA)
    # `with` guarantees the input handle is released even if parsing fails.
    with open(input_path, 'r') as in_file:
        df_writer = datafile.DataFileWriter(
            open(output_path, 'wb'),
            rec_writer,
            writers_schema=SCHEMA,
            codec='deflate',
        )
        try:
            for line in in_file:
                # Skip blank lines (e.g. a trailing newline at end of file)
                # instead of failing on the missing score field.
                if not line.strip():
                    continue
                fields = line.split(" ")
                doc_id = DOC_ID_PREFIX + fields[0]
                # float() ignores the trailing newline left on the field.
                df_writer.append(makeObject(doc_id, float(fields[1])))
        finally:
            # Always close the writer so buffered records are flushed and the
            # underlying output file is closed, even when a line is malformed.
            df_writer.close()


if __name__ == '__main__':
    # Write an AVRO file first
    write_avro_file()