# 2015. May 4th
# Author: Jin, Yilong jin28@vt.edu
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from basic_classes import mention
import time
import json, codecs, operator
import avro
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
import sys
from multiprocessing import Process, Manager, Value

############# constants used to calculate tweet importance ###################
# NOTE(review): the MIN/MAX values below are only defaults; they are presumably
# overwritten elsewhere once the collection has been scanned -- confirm.
TERM_WEIGHT_CONSTANT = 0.2
NUM_OF_TERMS = 5
CONST_MIN_FOLLOW = 0
CONST_MAX_FOLLOW = 0
CONST_MIN_FAV = 0
CONST_MAX_FAV = 1
CONST_MIN_LIST = 0
CONST_MAX_LIST = 1
CONST_MIN_RT = 0
CONST_MAX_RT = 0
SCRIPT_RUN_TIME = 0.0
#############################################################################

# dictionary contains tweet messages
TWEETS = dict()

# dictionary contains user importance values
UIV = dict()

# TIV is updated by multiple worker processes, therefore it is an instance of
# a Manager dictionary (a proxy that is shared between processes)
m = Manager()
TIV = m.dict()

# dictionary that contains edge information from one user to another
EDGE = dict()

# dictionary that contains user follower count and list count
USER = dict()

# "@me_at_vt" : 0x32fdsaffdsa (a reference to the object)
username_userstruct = dict()


def showTop10():
    """Print the ten tweets with the highest importance score.

    Reads the module-level TIV, TWEETS, UIV and USER dictionaries and prints,
    for each of the top ten entries, the tweet content, its author and the
    statistics that went into the score.
    """
    # Sort on the explicit 'importance' key.  Sorting / indexing by
    # dict.values() position depends on dict ordering and could silently
    # rank by doc_id instead of by score.
    top10 = sorted(TIV.values(),
                   key=operator.itemgetter('importance'),
                   reverse=True)[:10]
    for entry in top10:
        tweet_meta = TWEETS.get(entry['doc_id'])
        user_name = tweet_meta.get('user_screen_name')
        fav_count = tweet_meta.get('fav_count')
        retweet_count = tweet_meta.get('retweet_count')
        total_mention = mention.get_total_mentions_by_users(user_name)
        user_stats = USER.get(user_name)
        print(tweet_meta.get('content'))
        print(user_name)
        print('score %f' % entry['importance'])
        print('UIV: %r' % UIV.get(user_name))
        print('# of followers: %d' % user_stats.get('follower_count'))
        print('list count: %d' % user_stats.get('list_count'))
        print('mentioned by users in the collection %d times' % total_mention)
        print('retweet count: %d' % retweet_count)
        print('fav_count: %d' % fav_count)
        print('----------------------\n')


def output_tweet_importance(output_name, output_schema_path):
    """Write every entry of TIV to an Avro data file.

    output_name        -- path of the Avro file to create
    output_schema_path -- path of the Avro schema (JSON) describing a record

    Returns the number of records written.
    """
    with open(output_schema_path) as schema_file:
        output_schema = avro.schema.parse(schema_file.read())

    records = TIV.values()
    total = len(records)
    written = 0
    # Avro container files are binary, so the file must be opened in "wb".
    output_avro_writer = DataFileWriter(open(output_name, "wb"),
                                        DatumWriter(), output_schema)
    try:
        for record in records:
            written += 1
            # the original format string had no arguments and printed a
            # literal "%d/%d"
            sys.stdout.write('\rwriting %d/%d output' % (written, total))
            sys.stdout.flush()
            output_avro_writer.append(record)
    finally:
        # close even when append() fails so buffered records are flushed
        output_avro_writer.close()
    return written


def _scaled_term(value, lower, upper):
    """Return (value - lower) / (upper - lower) + 1 as a float.

    When upper == lower the range is empty and the ratio is undefined; the
    normalised part is then taken as 0, so the term is 1.0 instead of
    raising ZeroDivisionError.
    """
    span = float(upper - lower)
    if span == 0:
        return 1.0
    return float(value - lower) / span + 1


def calculate_tweet_importance(verbose, start, end, proc, TIV):
    """Compute the importance of the tweets in TWEETS.items()[start:end].

    verbose -- unused, kept for interface compatibility
    start   -- index of the first tweet (inclusive) handled by this worker
    end     -- index one past the last tweet handled by this worker
    proc    -- worker number, only used in the progress message
    TIV     -- mapping that receives {'doc_id', 'importance'} per tweet id

    Returns the number of tweets processed.
    """
    total = 0
    # list() keeps the slice working where items() returns a view
    for tweet_id, meta in list(TWEETS.items())[start:end]:
        fav_count = meta.get('fav_count')
        retweet_count = meta.get('retweet_count')
        user_screen_name = meta.get('user_screen_name')
        user_stats = USER[user_screen_name]
        follower_count = user_stats.get('follower_count')
        list_count = user_stats.get('list_count')

        # term1 Favorite Count -- (Fav(i) - Fav(min)) / (Fav(max) - Fav(min))
        term1 = _scaled_term(fav_count, CONST_MIN_FAV, CONST_MAX_FAV)
        # term2 Retweet Count -- (RT(i) - RT(min)) / (RT(max) - RT(min))
        term2 = _scaled_term(retweet_count, CONST_MIN_RT, CONST_MAX_RT)
        # term3 List Count -- (List(i) - List(min)) / (List(max) - List(min))
        term3 = _scaled_term(list_count, CONST_MIN_LIST, CONST_MAX_LIST)
        # term4 Number of Followers --
        #   (Followers(i) - Followers(min)) / (Followers(max) - Followers(min))
        term4 = _scaled_term(follower_count,
                             CONST_MIN_FOLLOW, CONST_MAX_FOLLOW)
        # term5 user importance value; users without one contribute 0
        term5 = UIV.get(user_screen_name)
        if term5 is None:
            term5 = 0

        # importance = sum(term_i) / # of terms used * TERM_WEIGHT_CONSTANT
        term_total = term1 + term2 + term3 + term4 + term5
        importance = float(term_total) / NUM_OF_TERMS * TERM_WEIGHT_CONSTANT
        TIV[tweet_id] = {'doc_id': tweet_id, 'importance': importance}
        total += 1
    print('proc_%d processed %d tweets' % (proc, total))
    return total


def update_EDGE(u1, u2, verbose=False):
    """Update the EDGE weights between u1 and u2 in both directions.

    EDGE[u1][u2] becomes (mentions of u1 by u2) / (total mentions of u2) and
    EDGE[u2][u1] becomes (mentions of u2 by u1) / (total mentions of u1).
    A weight stays 0.0 when the corresponding total is 0.
    """
    mention_of_u1_by_u2 = mention.get_mentioned_by_stat(u2, u1)
    total_mention_u2 = mention.get_total_mention(u2)
    mention_of_u2_by_u1 = mention.get_mentioned_by_stat(u1, u2)
    total_mention_u1 = mention.get_total_mention(u1)

    weight1 = 0.0
    if total_mention_u2 != 0:
        weight1 = float(mention_of_u1_by_u2) / float(total_mention_u2)
    weight2 = 0.0
    if total_mention_u1 != 0:
        weight2 = float(mention_of_u2_by_u1) / float(total_mention_u1)

    # create the per-user edge dictionaries on first use
    EDGE.setdefault(u1, dict())[u2] = weight1   # edge u1 -> u2
    EDGE.setdefault(u2, dict())[u1] = weight2   # edge u2 -> u1

    if verbose:
        print('\n=========================')
        print('mention_of_u1_by_u2: %d' % mention_of_u1_by_u2)
        print('mention_of_u2_by_u1: %d' % mention_of_u2_by_u1)
        print('total mention u1 %d' % total_mention_u1)
        print('total mention u2 %d' % total_mention_u2)
        print('weight1 : %f' % weight1)
        print('weight2 : %f' % weight2)
        print('=========================')


def calculate_UIV():
    """Compute the user importance value of every user seen in a mention.

    The user set is the union of the keys of mention.mention_user_dict and
    mention.mentioned_by_user_dict.
    """
    all_user_set = (set(mention.mention_user_dict.keys())
                    | set(mention.mentioned_by_user_dict.keys()))
    user_total = len(all_user_set)
    for position, user in enumerate(all_user_set, 1):
        printProgress(position, user_total, 'calculating UIV')
        UIV_helper(user)


def UIV_helper(u_name, verbose=False):
    """Store the user importance value of u_name in UIV.

    The value is the sum of the weights in EDGE[u_name] divided by the number
    returned by mention.get_total_mentions_by_users(u_name); it is 0.0 when
    that number is 0 or when the user has no EDGE entry.
    """
    inlink_edges = mention.get_total_mentions_by_users(u_name)
    edge_weights = None
    new_UIV = 0.0
    if inlink_edges != 0:
        edge_weights = EDGE.get(u_name)
        weight_sum = 0.0
        if edge_weights is not None:
            weight_sum = sum(edge_weights.values())
        new_UIV = float(weight_sum) / float(inlink_edges)
    UIV[u_name] = new_UIV

    if verbose:
        print('\n=========================')
        print('UIV(%s) is: %r' % (u_name, new_UIV))
        if inlink_edges != 0:
            # a user can have inlinks without an EDGE entry; treat that as
            # "no weights" instead of calling .values() on None
            weights = edge_weights.values() if edge_weights is not None else []
            print(''.join('%f+ ' % weight for weight in weights))
            print('--------------------')
            print(' %d' % inlink_edges)
        print('inlink_edge of %s is %d' % (u_name, inlink_edges))
        print('=========================')


############## global functions #########
def print_tweet_mention():
    """Print how many mentioned user handles belong to the collection.

    Walks tweet_array and counts every mentioned handle (total) and those
    that are keys of username_userstruct (count).
    """
    total = 0
    count = 0
    for tweet in tweet_array:
        if tweet.get_user_mentioned_count() == 0:
            continue
        for user_handle in tweet.get_user_mentioned_list():
            total += 1
            # membership test on the dict itself avoids building a key list
            if user_handle in username_userstruct:
                count += 1
    print('%d/%d users in collection' % (count, total))


def print_tweet_stat():
    """Print every tweet in tweet_array that was retweeted, plus a summary."""
    counter = 0
    total = len(tweet_array)
    for tweet in tweet_array:
        rt_count = tweet.get_rt_count()
        if rt_count != 0:
            counter += 1
            print('tweet_id_%s has %d RT' % (tweet.get_tweet_id(), rt_count))
    print('%d/%d has RT' % (counter, total))