remove printlines, doubles, don't scrape twitter links

infominer 2020-10-29 17:32:18 -04:00
parent 4e93a9b17d
commit d4c51c30a4


@@ -30,9 +30,13 @@ fname = 'ssitw' + date_time + '.csv'
 csvFile = open(fname, 'w+')
 csvWriter = csv.writer(csvFile)
 csvWriter.writerow(["Time", "Link", "Text", "Hashtags", "User", "Favorites", "Following", "Followers", "Retweets", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL","QuoteID","QuoteText","QuoteImg","QuoteUrl"])
+lines_seen = []
 for keyword in keywords:
+    # Search hashtags\keywords
     for tweet in tweepy.Cursor(api.search,q=keyword + '-filter:retweets',count=100, tweet_mode="extended", lang="en", since=days_before).items():
+        ### Set \ Reset Variables
         medias = []
         lnks = []
         replink = ""
@@ -51,74 +55,62 @@ for keyword in keywords:
         followers = tweet.user.followers_count
         username = tweet.user.screen_name
         id = "https://twitter.com/" + username + "/status/" + tweet.id_str
-        print(id)
         text = tweet.full_text
-        print(tweet.full_text)
         hashtags = [hashtag['text'] for hashtag in tweet.entities["hashtags"]]
         created = tweet.created_at
-        print(hashtags)
         try:
             reply = tweet.in_reply_to_status_id_str
             user = tweet.in_reply_to_user_id_str
             replink = "https://twitter.com/" + user + "/" + reply
-            print(reply)
         except:
             print("no reply url")
         if 'media' in tweet.entities:
             for media in tweet.extended_entities['media']:
-                print(media['media_url_https'])
                 medias.append(media['media_url_https'])
+        ### Add URLs to array
         if 'urls' in tweet.entities:
+            quoted = 'FALSE'
             for url in tweet.entities['urls']:
-                print(url['expanded_url'])
+                lkn = url['expanded_url']
                 lnks.append(url['expanded_url'])
-                quoted = 'FALSE'
-                if hasattr(tweet, 'quoted_status'):
-                    qtuser = tweet.quoted_status.user.screen_name
-                    qtid = "https://twitter.com/" + username + "/status/" + tweet.quoted_status.id_str
-                    if qtid == url:
-                        quoted = 'TRUE'
-                        print("quoted")
-                    else:
-                        print("notquoted")
                 from webpreview import web_preview
-                if username == "Docbasia" or quoted == 'TRUE':
-                    print("Docbasia")
+                if username == "Docbasia" or 'twitter.com' in lkn: continue
                 else:
                     try:
                         ### get title img description
                         headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
-                        tit, desc, ima = web_preview(url['expanded_url'],timeout=10,headers=headers,parser='lxml')
+                        tit, desc, ima = web_preview(lkn,timeout=10,headers=headers,parser='lxml')
                         title.append(tit)
                         description.append(desc)
                         image.append(ima)
                     except:
                         print("broken link")
-        # If it's a quote-tweet, get original stats
+        ### If it's a quote-tweet, get original stats
         if hasattr(tweet, 'quoted_status'):
             qtmedia = ['']
             qturls = ['']
             qttext = tweet.quoted_status.full_text
-            print(qttext)
             qtuser = tweet.quoted_status.user.screen_name
             qtid = "https://twitter.com/" + qtuser + "/status/" + tweet.quoted_status.id_str
             if 'media' in tweet.quoted_status.entities:
                 for media in tweet.quoted_status.extended_entities['media']:
-                    print(media['media_url_https'])
                     qtmedia.append(media['media_url_https'])
             if 'urls' in tweet.quoted_status.entities:
                 for url in tweet.quoted_status.entities['urls']:
-                    print(url['expanded_url'])
                     qturls.append(url['expanded_url'])
-        line = [created, id, text, hashtags, username, favorites, following, followers, retweetcount, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls]
-        lin = "created + ',' + id + ',' + text + ',' + hashtags + ',' + username + ',' + favorites + ',' + following + ',' + followers + ',' + retweetcount + ',' + lnks + ',' + title + ',' + description + ',' + image + ',' + medias + ',' + replink + ',' + qtid + ',' + qttext + ',' + qtmedia + ',' + qturls + ',"
         seen = 'FALSE'
-        for row in csvFile:
-            if lin == row:
-                print("seen")
+        count = 0
+        for y in lines_seen:
+            if id == y:
                 seen = 'TRUE'
+        #### Column attributes
+        line = [created, id, text, hashtags, username, favorites, following, followers, retweetcount, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls]
+        #### Only add line to csv if it's not already been added
+        lines_seen.append(id)
         if seen == 'TRUE' or username == "Docbasia" :
-            print("seen")
+            print("Seen. Don't Save.")
         else:
             csvWriter.writerow(line)
             print(line)