mirror of
https://github.com/Decentralized-ID/decentralized-id.github.io.git
synced 2024-12-17 03:24:37 -05:00
remove printlines, doubles, don't scrape twitter links
This commit is contained in:
parent
4e93a9b17d
commit
d4c51c30a4
48
twscraper.py
48
twscraper.py
@ -30,9 +30,13 @@ fname = 'ssitw' + date_time + '.csv'
|
|||||||
csvFile = open(fname, 'w+')
|
csvFile = open(fname, 'w+')
|
||||||
csvWriter = csv.writer(csvFile)
|
csvWriter = csv.writer(csvFile)
|
||||||
csvWriter.writerow(["Time", "Link", "Text", "Hashtags", "User", "Favorites", "Following", "Followers", "Retweets", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL","QuoteID","QuoteText","QuoteImg","QuoteUrl"])
|
csvWriter.writerow(["Time", "Link", "Text", "Hashtags", "User", "Favorites", "Following", "Followers", "Retweets", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL","QuoteID","QuoteText","QuoteImg","QuoteUrl"])
|
||||||
|
lines_seen = []
|
||||||
|
|
||||||
|
|
||||||
for keyword in keywords:
|
for keyword in keywords:
|
||||||
|
# Search hashtags\keywords
|
||||||
for tweet in tweepy.Cursor(api.search,q=keyword + '-filter:retweets',count=100, tweet_mode="extended", lang="en", since=days_before).items():
|
for tweet in tweepy.Cursor(api.search,q=keyword + '-filter:retweets',count=100, tweet_mode="extended", lang="en", since=days_before).items():
|
||||||
|
### Set \ Reset Variables
|
||||||
medias = []
|
medias = []
|
||||||
lnks = []
|
lnks = []
|
||||||
replink = ""
|
replink = ""
|
||||||
@ -51,74 +55,62 @@ for keyword in keywords:
|
|||||||
followers = tweet.user.followers_count
|
followers = tweet.user.followers_count
|
||||||
username = tweet.user.screen_name
|
username = tweet.user.screen_name
|
||||||
id = "https://twitter.com/" + username + "/status/" + tweet.id_str
|
id = "https://twitter.com/" + username + "/status/" + tweet.id_str
|
||||||
print(id)
|
|
||||||
text = tweet.full_text
|
text = tweet.full_text
|
||||||
print(tweet.full_text)
|
|
||||||
hashtags = [hashtag['text'] for hashtag in tweet.entities["hashtags"]]
|
hashtags = [hashtag['text'] for hashtag in tweet.entities["hashtags"]]
|
||||||
created = tweet.created_at
|
created = tweet.created_at
|
||||||
print(hashtags)
|
|
||||||
try:
|
try:
|
||||||
reply = tweet.in_reply_to_status_id_str
|
reply = tweet.in_reply_to_status_id_str
|
||||||
user = tweet.in_reply_to_user_id_str
|
user = tweet.in_reply_to_user_id_str
|
||||||
replink = "https://twitter.com/" + user + "/" + reply
|
replink = "https://twitter.com/" + user + "/" + reply
|
||||||
print(reply)
|
|
||||||
except:
|
except:
|
||||||
print("no reply url")
|
print("no reply url")
|
||||||
if 'media' in tweet.entities:
|
if 'media' in tweet.entities:
|
||||||
for media in tweet.extended_entities['media']:
|
for media in tweet.extended_entities['media']:
|
||||||
print(media['media_url_https'])
|
|
||||||
medias.append(media['media_url_https'])
|
medias.append(media['media_url_https'])
|
||||||
|
### Add URLs to array
|
||||||
if 'urls' in tweet.entities:
|
if 'urls' in tweet.entities:
|
||||||
|
quoted = 'FALSE'
|
||||||
for url in tweet.entities['urls']:
|
for url in tweet.entities['urls']:
|
||||||
print(url['expanded_url'])
|
lkn = url['expanded_url']
|
||||||
lnks.append(url['expanded_url'])
|
lnks.append(url['expanded_url'])
|
||||||
quoted = 'FALSE'
|
|
||||||
if hasattr(tweet, 'quoted_status'):
|
|
||||||
qtuser = tweet.quoted_status.user.screen_name
|
|
||||||
qtid = "https://twitter.com/" + username + "/status/" + tweet.quoted_status.id_str
|
|
||||||
if qtid == url:
|
|
||||||
quoted = 'TRUE'
|
|
||||||
print("quoted")
|
|
||||||
else:
|
|
||||||
print("notquoted")
|
|
||||||
from webpreview import web_preview
|
from webpreview import web_preview
|
||||||
if username == "Docbasia" or quoted == 'TRUE':
|
if username == "Docbasia" or 'twitter.com' in lkn: continue
|
||||||
print("Docbasia")
|
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
|
### get title img description
|
||||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
|
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
|
||||||
tit, desc, ima = web_preview(url['expanded_url'],timeout=10,headers=headers,parser='lxml')
|
tit, desc, ima = web_preview(lkn,timeout=10,headers=headers,parser='lxml')
|
||||||
title.append(tit)
|
title.append(tit)
|
||||||
description.append(desc)
|
description.append(desc)
|
||||||
image.append(ima)
|
image.append(ima)
|
||||||
except:
|
except:
|
||||||
print("broken link")
|
print("broken link")
|
||||||
# If it's a quote-tweet, get original stats
|
### If it's a quote-tweet, get original stats
|
||||||
if hasattr(tweet, 'quoted_status'):
|
if hasattr(tweet, 'quoted_status'):
|
||||||
qtmedia = ['']
|
qtmedia = ['']
|
||||||
qturls = ['']
|
qturls = ['']
|
||||||
qttext = tweet.quoted_status.full_text
|
qttext = tweet.quoted_status.full_text
|
||||||
print(qttext)
|
|
||||||
qtuser = tweet.quoted_status.user.screen_name
|
qtuser = tweet.quoted_status.user.screen_name
|
||||||
qtid = "https://twitter.com/" + qtuser + "/status/" + tweet.quoted_status.id_str
|
qtid = "https://twitter.com/" + qtuser + "/status/" + tweet.quoted_status.id_str
|
||||||
if 'media' in tweet.quoted_status.entities:
|
if 'media' in tweet.quoted_status.entities:
|
||||||
for media in tweet.quoted_status.extended_entities['media']:
|
for media in tweet.quoted_status.extended_entities['media']:
|
||||||
print(media['media_url_https'])
|
|
||||||
qtmedia.append(media['media_url_https'])
|
qtmedia.append(media['media_url_https'])
|
||||||
if 'urls' in tweet.quoted_status.entities:
|
if 'urls' in tweet.quoted_status.entities:
|
||||||
for url in tweet.quoted_status.entities['urls']:
|
for url in tweet.quoted_status.entities['urls']:
|
||||||
print(url['expanded_url'])
|
|
||||||
qturls.append(url['expanded_url'])
|
qturls.append(url['expanded_url'])
|
||||||
line = [created, id, text, hashtags, username, favorites, following, followers, retweetcount, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls]
|
|
||||||
lin = "created + ',' + id + ',' + text + ',' + hashtags + ',' + username + ',' + favorites + ',' + following + ',' + followers + ',' + retweetcount + ',' + lnks + ',' + title + ',' + description + ',' + image + ',' + medias + ',' + replink + ',' + qtid + ',' + qttext + ',' + qtmedia + ',' + qturls + ',"
|
|
||||||
seen = 'FALSE'
|
seen = 'FALSE'
|
||||||
for row in csvFile:
|
count = 0
|
||||||
if lin == row:
|
for y in lines_seen:
|
||||||
print("seen")
|
if id == y:
|
||||||
seen = 'TRUE'
|
seen = 'TRUE'
|
||||||
|
|
||||||
|
#### Column attributes
|
||||||
|
line = [created, id, text, hashtags, username, favorites, following, followers, retweetcount, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls]
|
||||||
|
|
||||||
|
#### Only add line to csv if it's not already been added
|
||||||
|
lines_seen.append(id)
|
||||||
if seen == 'TRUE' or username == "Docbasia" :
|
if seen == 'TRUE' or username == "Docbasia" :
|
||||||
print("seen")
|
print("Seen. Don't Save.")
|
||||||
else:
|
else:
|
||||||
csvWriter.writerow(line)
|
csvWriter.writerow(line)
|
||||||
print(line)
|
print(line)
|
||||||
|
Loading…
Reference in New Issue
Block a user