diff --git a/twscraper.py b/twscraper.py index 2a1f9828..1c1daf7a 100644 --- a/twscraper.py +++ b/twscraper.py @@ -30,9 +30,13 @@ fname = 'ssitw' + date_time + '.csv' csvFile = open(fname, 'w+') csvWriter = csv.writer(csvFile) csvWriter.writerow(["Time", "Link", "Text", "Hashtags", "User", "Favorites", "Following", "Followers", "Retweets", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL","QuoteID","QuoteText","QuoteImg","QuoteUrl"]) +lines_seen = [] + for keyword in keywords: + # Search hashtags\keywords for tweet in tweepy.Cursor(api.search,q=keyword + '-filter:retweets',count=100, tweet_mode="extended", lang="en", since=days_before).items(): + ### Set \ Reset Variables medias = [] lnks = [] replink = "" @@ -51,74 +55,62 @@ for keyword in keywords: followers = tweet.user.followers_count username = tweet.user.screen_name id = "https://twitter.com/" + username + "/status/" + tweet.id_str - print(id) text = tweet.full_text - print(tweet.full_text) hashtags = [hashtag['text'] for hashtag in tweet.entities["hashtags"]] created = tweet.created_at - print(hashtags) try: reply = tweet.in_reply_to_status_id_str user = tweet.in_reply_to_user_id_str replink = "https://twitter.com/" + user + "/" + reply - print(reply) except: print("no reply url") if 'media' in tweet.entities: for media in tweet.extended_entities['media']: - print(media['media_url_https']) medias.append(media['media_url_https']) + ### Add URLs to array if 'urls' in tweet.entities: + quoted = 'FALSE' for url in tweet.entities['urls']: - print(url['expanded_url']) + lkn = url['expanded_url'] lnks.append(url['expanded_url']) - quoted = 'FALSE' - if hasattr(tweet, 'quoted_status'): - qtuser = tweet.quoted_status.user.screen_name - qtid = "https://twitter.com/" + username + "/status/" + tweet.quoted_status.id_str - if qtid == url: - quoted = 'TRUE' - print("quoted") - else: - print("notquoted") from webpreview import web_preview - if username == "Docbasia" or quoted == 'TRUE': - print("Docbasia") + if username == "Docbasia" or 'twitter.com' in lkn: continue else: try: + ### get title img description headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'} - tit, desc, ima = web_preview(url['expanded_url'],timeout=10,headers=headers,parser='lxml') + tit, desc, ima = web_preview(lkn,timeout=10,headers=headers,parser='lxml') title.append(tit) description.append(desc) image.append(ima) except: print("broken link") - # If it's a quote-tweet, get original stats + ### If it's a quote-tweet, get original stats if hasattr(tweet, 'quoted_status'): qtmedia = [''] qturls = [''] qttext = tweet.quoted_status.full_text - print(qttext) qtuser = tweet.quoted_status.user.screen_name qtid = "https://twitter.com/" + qtuser + "/status/" + tweet.quoted_status.id_str if 'media' in tweet.quoted_status.entities: for media in tweet.quoted_status.extended_entities['media']: - print(media['media_url_https']) qtmedia.append(media['media_url_https']) if 'urls' in tweet.quoted_status.entities: for url in tweet.quoted_status.entities['urls']: - print(url['expanded_url']) qturls.append(url['expanded_url']) - line = [created, id, text, hashtags, username, favorites, following, followers, retweetcount, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls] - lin = "created + ',' + id + ',' + text + ',' + hashtags + ',' + username + ',' + favorites + ',' + following + ',' + followers + ',' + retweetcount + ',' + lnks + ',' + title + ',' + description + ',' + image + ',' + medias + ',' + replink + ',' + qtid + ',' + qttext + ',' + qtmedia + ',' + qturls + '," seen = 'FALSE' - for row in csvFile: - if lin == row: - print("seen") + count = 0 + for y in lines_seen: + if id == y: seen = 'TRUE' + #### Column attributes + line = [created, id, text, hashtags, username, favorites, following, followers, retweetcount, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls] + + #### Only add line to csv if it's not already been added + lines_seen.append(id) if seen == 'TRUE' or username == "Docbasia" : - print("seen") + print("Seen. Don't Save.") else: csvWriter.writerow(line) print(line)