mirror of https://github.com/Decentralized-ID/decentralized-id.github.io.git, synced 2024-12-17 03:24:37 -05:00
remove printlines, doubles, don't scrape twitter links
This commit is contained in:
parent
4e93a9b17d
commit
d4c51c30a4
twscraper.py (50 changed lines)
@@ -30,9 +30,13 @@ fname = 'ssitw' + date_time + '.csv'
 csvFile = open(fname, 'w+')
 csvWriter = csv.writer(csvFile)
 csvWriter.writerow(["Time", "Link", "Text", "Hashtags", "User", "Favorites", "Following", "Followers", "Retweets", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL","QuoteID","QuoteText","QuoteImg","QuoteUrl"])
+
+lines_seen = []

 for keyword in keywords:
+# Search hashtags\keywords
 for tweet in tweepy.Cursor(api.search,q=keyword + '-filter:retweets',count=100, tweet_mode="extended", lang="en", since=days_before).items():
+### Set \ Reset Variables
 medias = []
 lnks = []
 replink = ""
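The lines_seen list added above is the bookkeeping for the "doubles" part of the commit: the second hunk appends each tweet's status URL to it and skips rows whose URL has already been recorded. A minimal sketch of that check follows; the helper name is illustrative, and a set is assumed here purely so membership tests stay cheap (the script itself uses a plain list).

    # Sketch of the duplicate check this commit introduces (not the commit's code verbatim).
    seen_ids = set()

    def already_scraped(status_url):
        """Return True if this tweet URL has already been written to the CSV."""
        if status_url in seen_ids:
            return True
        seen_ids.add(status_url)
        return False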
@@ -51,74 +55,62 @@ for keyword in keywords:
 followers = tweet.user.followers_count
 username = tweet.user.screen_name
 id = "https://twitter.com/" + username + "/status/" + tweet.id_str
-print(id)
 text = tweet.full_text
-print(tweet.full_text)
 hashtags = [hashtag['text'] for hashtag in tweet.entities["hashtags"]]
 created = tweet.created_at
-print(hashtags)
 try:
 reply = tweet.in_reply_to_status_id_str
 user = tweet.in_reply_to_user_id_str
 replink = "https://twitter.com/" + user + "/" + reply
-print(reply)
 except:
 print("no reply url")
 if 'media' in tweet.entities:
 for media in tweet.extended_entities['media']:
-print(media['media_url_https'])
 medias.append(media['media_url_https'])
+### Add URLs to array
 if 'urls' in tweet.entities:
-for url in tweet.entities['urls']:
-print(url['expanded_url'])
-lnks.append(url['expanded_url'])
 quoted = 'FALSE'
-if hasattr(tweet, 'quoted_status'):
-qtuser = tweet.quoted_status.user.screen_name
-qtid = "https://twitter.com/" + username + "/status/" + tweet.quoted_status.id_str
-if qtid == url:
-quoted = 'TRUE'
-print("quoted")
-else:
-print("notquoted")
+for url in tweet.entities['urls']:
+lkn = url['expanded_url']
+lnks.append(url['expanded_url'])
 from webpreview import web_preview
-if username == "Docbasia" or quoted == 'TRUE':
-print("Docbasia")
+if username == "Docbasia" or 'twitter.com' in lkn: continue
 else:
 try:
+### get title img description
 headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
-tit, desc, ima = web_preview(url['expanded_url'],timeout=10,headers=headers,parser='lxml')
+tit, desc, ima = web_preview(lkn,timeout=10,headers=headers,parser='lxml')
 title.append(tit)
 description.append(desc)
 image.append(ima)
 except:
 print("broken link")
-# If it's a quote-tweet, get original stats
+### If it's a quote-tweet, get original stats
 if hasattr(tweet, 'quoted_status'):
 qtmedia = ['']
 qturls = ['']
 qttext = tweet.quoted_status.full_text
-print(qttext)
 qtuser = tweet.quoted_status.user.screen_name
 qtid = "https://twitter.com/" + qtuser + "/status/" + tweet.quoted_status.id_str
 if 'media' in tweet.quoted_status.entities:
 for media in tweet.quoted_status.extended_entities['media']:
-print(media['media_url_https'])
 qtmedia.append(media['media_url_https'])
 if 'urls' in tweet.quoted_status.entities:
 for url in tweet.quoted_status.entities['urls']:
-print(url['expanded_url'])
 qturls.append(url['expanded_url'])
-line = [created, id, text, hashtags, username, favorites, following, followers, retweetcount, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls]
-lin = "created + ',' + id + ',' + text + ',' + hashtags + ',' + username + ',' + favorites + ',' + following + ',' + followers + ',' + retweetcount + ',' + lnks + ',' + title + ',' + description + ',' + image + ',' + medias + ',' + replink + ',' + qtid + ',' + qttext + ',' + qtmedia + ',' + qturls + ',"
 seen = 'FALSE'
-for row in csvFile:
-if lin == row:
-print("seen")
+count = 0
+for y in lines_seen:
+if id == y:
 seen = 'TRUE'

+#### Column attributes
+line = [created, id, text, hashtags, username, favorites, following, followers, retweetcount, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls]

+#### Only add line to csv if it's not already been added
+lines_seen.append(id)
 if seen == 'TRUE' or username == "Docbasia" :
-print("seen")
+print("Seen. Don't Save.")
 else:
 csvWriter.writerow(line)
 print(line)
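Net effect of this hunk: the debugging print lines are gone, rows are now deduplicated by tweet URL against the in-memory lines_seen list (instead of the earlier comparison of a concatenated string against the open CSV file), and twitter.com links are no longer fed to the link-preview scraper. Below is a standalone sketch of the new link handling; the helper name and None return convention are illustrative assumptions, while the web_preview call, timeout, parser, and User-Agent header are taken directly from the diff.

    from webpreview import web_preview

    HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

    def preview_link(expanded_url):
        """Fetch preview metadata for a tweet's outbound link, skipping twitter.com links."""
        if 'twitter.com' in expanded_url:
            return None  # don't scrape twitter links
        try:
            # web_preview returns a (title, description, image_url) tuple
            return web_preview(expanded_url, timeout=10, headers=HEADERS, parser='lxml')
        except Exception:
            print("broken link")
            return None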