Merge branch 'paul'

paul-chelarescu · paul-chelarescu · commit b5a046a70b88 · 2016-11-27T11:29:55.000Z
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,13 @@
 *.pyc
 *.json
+*.db
+raw_tweets.db
+utils.pyc
+FEATURES_DB
+donald.py
+twitter_stream_download.py
+twitterstream.py
+extracted_features.db
+raw_tweets.db
+streaming_new_tweets.py
+
diff --git a/search_for_words.py b/search_for_words.py
@@ -136,3 +136,37 @@ def add_tweets_in_db(tweets, keyword):
     # Join stuff
     for proc in jobs:
         proc.join()
+    # keyword = TEST_PARAMS['ANALYSER_COMPANY']
+    # max_tweets = TEST_PARAMS['MAX_TWEETS']
+    # tweets = get_tweets_from_keyword(keyword, max_tweets)
+
+    # keywords = COMPANIES['Astrazeneca']
+    keywords = "Microsoft"
+    max_tweets = TEST_PARAMS['MAX_TWEETS']
+
+    tweets = get_tweets_from_keyword(keywords, max_tweets)
+    tweets = tweets + get_tweets_from_keyword('#' + keywords, max_tweets)
+    tweets = tweets + get_tweets_from_keyword('@' + keywords, max_tweets)
+    add_tweets_in_db(tweets, keywords)
+
+    # The old way to print it out
+    # for keyword in keywords:
+        # tweets = get_tweets_from_keyword(keyword[0], max_tweets)
+# =======
+    # # keywords = COMPANIES['Astrazeneca']
+    # keywords = "Microsoft"
+    # max_tweets = TEST_PARAMS['MAX_TWEETS']
+
+    # tweets = get_tweets_from_keyword(keywords, max_tweets)
+    # tweets = tweets + get_tweets_from_keyword('#' + keywords, max_tweets)
+    # tweets = tweets + get_tweets_from_keyword('@' + keywords, max_tweets)
+    # add_tweets_in_db(tweets, keywords)
+
+    # # The old way to print it out
+    # # for keyword in keywords:
+        # # tweets = get_tweets_from_keyword(keyword[0], max_tweets)
+# >>>>>>> switching branches
+
+        # # for tweet in tweets:
+            # # print tweet
+# >>>>>>> master
diff --git a/streaming_new_tweets.py b/streaming_new_tweets.py
@@ -5,49 +5,119 @@
 import json
 import config
 from utils import *
+from analyser import *
+import time
 
 # This is the listener, responsible for receiving data
 class DBListener(tweepy.StreamListener):
+    def __init__(self ,company_name):
+        self.company_name = company_name
+        create_tweets_table(DATABASES['RAW_TWEETS_DB'], self.company_name)
+
     def on_data(self, data):
-        # Parsing 
+        # Parsing
+        # print data
         decoded = json.loads(data)
+        # print "PASSES"
+        # file = open('NEW_JSON.json', 'wb')
+        # json.dump(decoded,file,sort_keys = True,indent = 4)
+    
+
+        if "id" not in decoded:
+            print "ID NOT IN DECODED"
+            return True
+        the_id = decoded['id']
+        tweet = decoded['text']
+        created_at = decoded['created_at']
+        created_at = time.mktime(time.strptime(created_at,"%a %b %d %H:%M:%S +0000 %Y"))
+        # print created_at
         # #open a file to store the status objects
-        # file = open('streaming_new_tweets.json', 'wb')  
         #write json to file
 
-        write_to_DB(decoded)
+        self.write_tweets_to_DB(the_id, tweet, created_at)
 
-        json.dump(decoded,file,sort_keys = True,indent = 4)
+        # json.dump(decoded,file,sort_keys = True,indent = 4)
         #show progress
-        print "Writing tweets to file,CTRL+C to terminate the program"
+        # print "Writing tweets to file,CTRL+C to terminate the program"
+
 
-        
         return True
 
     def on_error(self, status):
         print "Error with status " + status
 
-def write_to_DB(decoded):
+    def write_tweets_to_DB(self, the_id, tweet, timestamp):
+
+        self.conn, self.c = get_database_connection(DATABASES['RAW_TWEETS_DB'])
+        self.c.execute('''INSERT OR IGNORE INTO ''' + self.company_name + \
+                ''' VALUES (?,?,?)''' ,(the_id, tweet, timestamp))
+
+        self.conn.commit()
+        self.conn.close()
+
+
+        try:
+            self.conn, self.c = get_database_connection(DATABASES['FEATURES_DB'])
+            self.c.execute('''CREATE TABLE IF NOT EXISTS ''' + self.company_name + \
+              ''' (hash INTEGER PRIMARY KEY, neg REAL, neu REAL, pos REAL, com REAL)''')
+
+            feature = get_sentiment(tweet)
+            features = []
+            features.append((the_id, feature['neg'], feature['neu'], \
+                            feature['pos'], feature['compound']))
+
+            print features
+            for index, feature in enumerate(features):
+                self.c.execute('''INSERT OR IGNORE INTO ''' + self.company_name + \
+                          ''' VALUES(?, ?, ?, ?, ?)''', \
+                          (feature[0], feature[1], feature[2], feature[3], feature[4]))
+
+        except UnicodeEncodeError:
+            print "UnicodeEncodeError"
+            pass
+        finally:
+            self.conn.commit()
+            self.conn.close()
+            print tweet
+            print "--------------------------------------------------"
+
+
+
+
+    def get_filtered_tweets_features(self, company):
+        # Get all tweets from db
+        conn, c = read_tweets(company)
+        # Filter them
+        features = []
+        for tweet in c.fetchall():
+            filtered_tweet = filter_tweet(tweet[1])
+            if filtered_tweet == '':
+                continue
+            feature = get_sentiment(filtered_tweet)
+            features.append((tweet[0], feature['pos']))
+        # Close connection
+        conn.close()
+        return company, features
 
-    return
 
 
 
 if __name__ == '__main__':
-    l = DBListener()
-    #authorize twitter, initialize tweepy
-    auth = tweepy.OAuthHandler(config.consumer_key, config.consumer_secret)
-    auth.set_access_token(config.access_token, config.access_secret)
 
     # There are different kinds of streams: public stream, user stream, multi-user streams
     # For more details refer to https://dev.twitter.com/docs/streaming-apis
 
 
-    search_query = COMPANIES["Microsoft"]
+    company_name = "Microsoft"
+    search_query = COMPANIES[company_name]
+    # create_tweets_table('FEATURES_DB', company_name )
 
-    create_tweets_table()
+    l = DBListener(company_name)
+    #authorize twitter, initialize tweepy
+    auth = tweepy.OAuthHandler(config.consumer_key, config.consumer_secret)
+    auth.set_access_token(config.access_token, config.access_secret)
 
     stream = tweepy.Stream(auth, l)
 
     #Hashtag to stream
-    stream.filter(track=search_query)  #Replace with your favorite hashtag or query
+    stream.filter(track=[company_name])  #Replace with your favorite hashtag or query
diff --git a/utils.py b/utils.py
@@ -16,7 +16,7 @@
                 'MAX_TWEETS' : 1100 }
 
 COMPANIES = {
-        "Apple" : [
+        "Apple" : 
             "Apple OR "
             +"#Apple OR "
             +"iPad OR "
@@ -26,35 +26,35 @@
             +"mac OR "
             +"macbook OR "
             +"iMac"
-            ],
-        "Facebook" : [
+            ,
+        "Facebook" : 
             "Facebook OR "
             +"#Facebook OR "
             +"news feed OR "
             +"poke OR "
             +"status OR "
             +"timeline OR "
             +"messenger"
-            ],
-        "Costa" : [
+            ,
+        "Costa" : 
             'Costa OR '
             +'#Costa OR '
             +'Whitbread '
-            ],
-        "Microsoft" : [
+            ,
+        "Microsoft" : 
             'Mircosoft OR '
             +'#Mircosoft OR '
-            # +'Surface OR '
-            # +'power point OR '
-            # +'excel OR '
-            # +'vista OR '
-            # +'kinect OR '
-            # +'bing OR '
-            # +'visual basic OR '
-            # +'visual studio OR '
+            +'Surface OR '
+            +'power point OR '
+            +'excel OR '
+            +'vista OR '
+            +'kinect OR '
+            +'bing OR '
+            +'visual basic OR '
+            +'visual studio OR '
             +'ms-dos'
-            ],
-        "Astrazeneca" : [
+            ,
+        "Astrazeneca" : 
             "Astrazeneca OR "
             +"ARIMIDEX OR "
             +"ATACAND OR "
@@ -75,33 +75,33 @@
             +"Astrazeneca OR "
             +"Astrazeneca OR "
             +"Astrazeneca OR "
-            +"Carbocaine  OR "
-            +"Citanes OR "
-            +"Diprivan OR "
-            +"EMLA OR "
-            +"MarcaineOR "
-            +"Naropin OR "
-            +"Xylocaine OR "
-            +"Xyloproct OR "
-            +"Atacand OR "
-            +"Betaloc OR "
-            +"BrilintaOR "
-            +"Crestor OR "
-            +"ExantaOR "
-            +"Epanova OR "
-            +"Imdur OR "
-            +"Inderal OR "
-            +"Lexxel OR "
-            +"Logimax OR "
-            +"Nif-Ten OR "
-            +"Plendil OR "
-            +"Ramace OR "
-            +"SelokenOR "
-            +"Tenoretic OR "
-            +"Tenormin OR "
-            +"Unimax OR "
-            +"Zestoretic OR "
-            +"Zestril OR "
+            # +"Carbocaine  OR "
+            # +"Citanes OR "
+            # +"Diprivan OR "
+            # +"EMLA OR "
+            # +"MarcaineOR "
+            # +"Naropin OR "
+            # +"Xylocaine OR "
+            # +"Xyloproct OR "
+            # +"Atacand OR "
+            # +"Betaloc OR "
+            # +"BrilintaOR "
+            # +"Crestor OR "
+            # +"ExantaOR "
+            # +"Epanova OR "
+            # +"Imdur OR "
+            # +"Inderal OR "
+            # +"Lexxel OR "
+            # +"Logimax OR "
+            # +"Nif-Ten OR "
+            # +"Plendil OR "
+            # +"Ramace OR "
+            # +"SelokenOR "
+            # +"Tenoretic OR "
+            # +"Tenormin OR "
+            # +"Unimax OR "
+            # +"Zestoretic OR "
+            # +"Zestril OR "
             # +"Bydureon OR "
             # +"Byetta OR "
             # +"FarxigaOR "
@@ -143,7 +143,10 @@
             # +"Bricanyl OR "
             # +"Oxis OR "
             +"Pulmicort"
-            ]
+            ,
+            "ytytye9393jenmebrhriue" : 
+                    "ytytye9393jenmebrhriue"
+                    
 }
 
 DATABASES = {'RAW_TWEETS_DB': 'raw_tweets.db',