Mirror of https://github.com/Decentralized-ID/decentralized-id.github.io.git, synced 2025-06-07 06:02:47 -04:00
Commit 08ca6cb42e: test twitter collection
Parent: dc9270e1ee
3 changed files with 173 additions and 22 deletions
.github/workflows/twitter.yml (vendored, new file, +46)
@@ -0,0 +1,46 @@
+# This is a basic workflow to help you get started with Actions
+name: twitter collections
+
+# Controls when the action will run: on a weekly schedule (Saturdays at 18:00 UTC)
+# and on pushes that touch the scraper or this workflow
+on:
+  schedule:
+    - cron: '0 18 * * 6'
+  push:
+    paths:
+      - 'twscraper.py'
+      - '.github/workflows/twitter.yml'
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  # This workflow contains a single job called "build"
+  build:
+    # The type of runner that the job will run on
+    runs-on: ubuntu-latest
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+      # Checks out the repository under $GITHUB_WORKSPACE so the job can access it
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: '3.9.0' # exact version or SemVer range of the Python version to use
+      - uses: py-actions/py-dependency-install@v2
+        with:
+          path: requirements.txt
+      - shell: bash
+        env: # Twitter API credentials, passed in from repository secrets
+          CONSUMER_KEY: ${{ secrets.CONSUMER_KEY }}
+          CONSUMER_SECRET: ${{ secrets.CONSUMER_SECRET }}
+          ACCESS_KEY: ${{ secrets.ACCESS_KEY }}
+          ACCESS_SECRET: ${{ secrets.ACCESS_SECRET }}
+        run: python twscraper.py
+      - name: Deploy Changes
+        run: |
+          git remote add gh-token "https://github.com/decentralized-id/decentralized-id.github.io.git"
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git pull
+          git commit -a -m "add twitter collection"
+          git push gh-token master
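Two notes on the last steps: twscraper.py reads the four secrets from the environment, and `git commit -a` exits nonzero when there is nothing to commit, which would fail the Deploy Changes step on a week with no new tweets. A minimal preflight sketch for the credentials (hypothetical, not part of this commit):

import os
import sys

required = ['CONSUMER_KEY', 'CONSUMER_SECRET', 'ACCESS_KEY', 'ACCESS_SECRET']
missing = [name for name in required if not os.environ.get(name)]
if missing:
    sys.exit('Missing Twitter credentials: ' + ', '.join(missing))  # nonzero exit fails the job early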
requirements.txt (new file, +2)
@@ -0,0 +1,2 @@
+tweepy
+webpreview
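The `from requests_oauthlib import OAuth1Session` in twscraper.py resolves only because tweepy declares requests-oauthlib as its own dependency; listing it explicitly, and pinning versions, would make the weekly runs more reproducible. A sketch (version numbers illustrative):

tweepy==3.10.0
webpreview==1.6.0
requests-oauthlib==1.3.0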
twscraper.py (147 changed lines)
@@ -1,13 +1,11 @@
 import tweepy
 import csv
-import pandas as pd
+import os  # needed for the os.environ.get calls below

 #### Credentials
-CONSUMER_KEY = os.environ.get('CONSUMER_KEY')
-CONSUMER_SECRET = os.environ.get('CONSUMER_SECRET')
-ACCESS_KEY = os.environ.get('ACCESS_KEY')
-ACCESS_SECRET = os.environ.get('ACCESS_SECRET')
+consumer_key = os.environ.get('CONSUMER_KEY')
+consumer_secret = os.environ.get('CONSUMER_SECRET')
+access_token = os.environ.get('ACCESS_KEY')
+access_token_secret = os.environ.get('ACCESS_SECRET')

 #### Authorization
 auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
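The lines collapsed after this hunk presumably finish the handshake; for reference, the standard tweepy v3 sequence (a sketch, not the hidden code) is:

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)  # api.search and api.user_timeline are called later in the script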
@@ -24,15 +22,18 @@ from datetime import date, timedelta, datetime
 current_date = date.today()
 days_before = (date.today()-timedelta(days=7))
 now = datetime.now()
-date_time = current_date.strftime("%m%d%y")
+date_time = current_date.strftime("%Y-%m-%d")
+startDate = days_before.strftime("%Y-%m-%d, %H:%M:%S")

 #### Open CSV + Write Column Names
-fname = 'SSI-DID_' + date_time + '.csv'
+fname = '_data/twitter/search_' + date_time + '.csv'  # relative to the checkout; a leading slash would write outside the workspace
 csvFile = open(fname, 'w+')
 csvWriter = csv.writer(csvFile)
-csvWriter.writerow(["Time","ID", "Link", "Likes", "Shares", "User", "Text", "Hashtags", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL", "QuoteID", "QuoteText", "QuoteImg", "QuoteUrl"])
+csvWriter.writerow(["Time", "Link", "Urls", "UrlTitle", "UrlDesc", "UrlImg", "ImageUrls", "ReplyURL", "QuoteID", "QuoteImg", "QuoteUrl"])
 lines_seen = []
 text_seen = []
+tweet_ids = []

 for keyword in keywords:
     # Search hashtags/keywords
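The `keywords` list and the search call itself are collapsed in this diff; since the loop body reads tweet.full_text, the search presumably requests extended tweets. A sketch of the usual shape, with an assumed keyword list:

keywords = ['decentralized identity', 'self sovereign identity', 'SSI']  # assumed; not shown in the diff
for keyword in keywords:
    for tweet in tweepy.Cursor(api.search, q=keyword, tweet_mode='extended').items(100):
        ...  # per-tweet handling as in the hunks below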
@@ -51,21 +52,16 @@ for keyword in keywords:
         seen = 'FALSE'

         ### Set basic tweet attributes
-        retweetcount = tweet.retweet_count
-        favorites = tweet.favorite_count
         username = tweet.user.screen_name
         id = "https://twitter.com/" + username + "/status/" + tweet.id_str
         idstr = tweet.id_str
         text = tweet.full_text
-        hashtags = [hashtag['text'] for hashtag in tweet.entities["hashtags"]]
         created = str(tweet.created_at)

         #### Only add a line to the csv if it hasn't already been added
         if hasattr(tweet, 'quoted_status'):
             quotedid = 'https://twitter.com/' + tweet.quoted_status.user.screen_name + '/status/' + tweet.quoted_status_id_str
-            print("Quoted ID " + quotedid)
             if quotedid in lines_seen:
-                print("Quoted Status Seen")
                 seen = 'TRUE'
         for y in lines_seen:
             if id == y:
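An aside on the dedup bookkeeping here: the lists and 'TRUE'/'FALSE' strings work, but sets and booleans do the same job with O(1) membership tests. An equivalent sketch (not part of the commit):

lines_seen = set()
text_seen = set()
# ... inside the tweet loop:
seen = id in lines_seen or text in text_seen
if not seen:
    lines_seen.add(id)
    text_seen.add(text)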
@@ -73,7 +69,7 @@ for keyword in keywords:
         for q in text_seen:
             if text == q:
                 seen = 'TRUE'
-        if seen == 'TRUE' or username == "Docbasia": continue
+        if seen == 'TRUE': continue
         else:
             ### Keep track of seen lines / tweets
             lines_seen.append(id)
@@ -84,7 +80,7 @@ for keyword in keywords:
                 user = tweet.in_reply_to_user_id_str
                 replink = "https://twitter.com/" + user + "/" + reply
             except:
-                print("no reply url")
+                pass
             ### Check for images in tweet
             if 'media' in tweet.entities:
                 for media in tweet.extended_entities['media']:
@@ -97,7 +93,7 @@ for keyword in keywords:
             ### Look for metadata
             from webpreview import web_preview
             ### Unless the link is an image, pdf, twitter, or insta
-            if username == "Docbasia" or 'twitter.com' in lkn or '.png' in lkn or '.jpg' in lkn or '.pdf' in lkn or 'instagram.com' in lkn or 'linkedin.com' in lkn or 'facebook.com' in lkn: continue
+            if 'twitter.com' in lkn or '.png' in lkn or '.jpg' in lkn or '.pdf' in lkn or 'instagram.com' in lkn or 'linkedin.com' in lkn or 'facebook.com' in lkn: pass
             else:
                 try:
                     ### get the title, image, and description
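webpreview's web_preview() returns a (title, description, image) tuple, which matches the desc/ima appends in the next hunk; the collapsed try-block presumably calls it per link, roughly (a sketch; the timeout keyword keeps a slow page from hanging the Action):

ttl, desc, ima = web_preview(lkn, timeout=10)
title.append(ttl)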
@@ -108,10 +104,9 @@ for keyword in keywords:
                     description.append(desc)
                     image.append(ima)
                 except:
-                    print("broken link")
+                    pass
             ### If it's a quote-tweet, get original stats
             if hasattr(tweet, 'quoted_status'):
-                print("Quoted Status NotSeen")
                 qtmedia = ['']
                 qturls = ['']
                 qttext = tweet.quoted_status.full_text
@@ -124,10 +119,118 @@ for keyword in keywords:
                 for url in tweet.quoted_status.entities['urls']:
                     qturls.append(url['expanded_url'])
             #### Column attributes
-            line = [created, "'"+idstr+"'", id, favorites, retweetcount, username, text, hashtags, lnks, title, description, image, medias, replink, qtid, qttext, qtmedia, qturls]
+            line = [created, id, lnks, title, description, image, medias, replink, qtid, qtmedia, qturls]

             #### Write row to CSV and record the id
             csvWriter.writerow(line)
-            print(line)
+            tweet_ids.append(idstr)
+            print(idstr)
+#### Get USER Tweets
+tweets = []
+ids = []
+tmpTweets = api.user_timeline('DecentralizeID')
+for tweet in tmpTweets:
+    created = tweet.created_at.strftime("%Y-%m-%d, %H:%M:%S")
+    if created < date_time and created > startDate:
+        tweets.append(tweet)
+
+while (tmpTweets[-1].created_at.strftime("%Y-%m-%d, %H:%M:%S") > startDate):
+    print("Last Tweet @", tmpTweets[-1].created_at, " - fetching some more")
+    tmpTweets = api.user_timeline('DecentralizeID', max_id=tmpTweets[-1].id)  # was `username`, a stale value left over from the search loop
+    for tweet in tmpTweets:
+        createdate = tweet.created_at.strftime("%Y-%m-%d, %H:%M:%S")
+        if createdate < date_time and createdate > startDate:  # was `startdate`, an undefined name
+            tweets.append(tweet)
+
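One caveat in the paging loop above: Twitter's max_id is inclusive, so each batch re-fetches the previous oldest tweet (harmless here, since ids are deduplicated below). tweepy's Cursor can handle the paging instead; a sketch:

for tweet in tweepy.Cursor(api.user_timeline, screen_name='DecentralizeID', tweet_mode='extended').items():
    if tweet.created_at.strftime("%Y-%m-%d, %H:%M:%S") <= startDate:
        break  # past the 7-day window
    tweets.append(tweet)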
+for tweet in tweets:
+    created = str(tweet.created_at)
+    username = tweet.user.screen_name  # set before the link is built (the original built the link first, with a stale username)
+    id = "https://twitter.com/" + username + "/status/" + tweet.id_str
+    idstr = str(tweet.id_str)
+    if hasattr(tweet, 'text'):
+        text = tweet.text
+    if hasattr(tweet, 'full_text'):
+        text = tweet.full_text
+    try:
+        username = tweet.retweeted_status.user.screen_name
+        id = "https://twitter.com/" + tweet.retweeted_status.user.screen_name + "/status/" + tweet.retweeted_status.id_str
+        idstr = tweet.retweeted_status.id_str
+    except:
+        pass
+    if id not in ids:
+        ids.append(id)
+        tweet_ids.append(idstr)
+        # note: lnks, title, description, image, medias, replink, qtid still hold values from the search loop above
+        line = [created, id, lnks, title, description, image, medias, replink, qtid, qtmedia, qturls]  # qttext dropped so the row matches the 11-column header
+        #### Write row to CSV
+        csvWriter.writerow(line)
 csvFile.close()
-print("Complete")
+print(tweet_ids)
+
+# Create Collection
+from requests_oauthlib import OAuth1Session
+import json
+## OAuth1Session rather than the tweepy handler above; collection creation didn't work with the tweepy auth
+twitter = OAuth1Session(consumer_key,
+                        client_secret=consumer_secret,
+                        resource_owner_key=access_token,
+                        resource_owner_secret=access_token_secret)
+
+# create
+url = 'https://api.twitter.com/1.1/collections/create.json'
+params_create = {
+    'name': 'Decentralized-ID Curated ' + date_time,
+    'description': 'Decentralized Identity Curated Tweets by @infominer33 via identosphere.net',
+    'timeline_order': 'tweet_chron'
+}
+r = twitter.post(url, data=params_create)
+print(r.json())
+print(r.json()['response'])
+# 'response': {'timeline_id': 'custom-1180945428222595074'}
+## Extract the numeric id from the response
+timeline_id = r.json()['response']['timeline_id']
+response = timeline_id.removeprefix('custom-')  # removeprefix requires Python 3.9+, which the workflow pins
+# the collection can be viewed at, e.g.: https://twitter.com/laurenfratamico/timelines/1180945428222595074
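If the request fails (bad credentials, rate limit), the body has no 'response' key and the extraction above raises KeyError; a defensive sketch:

body = r.json()
if 'response' not in body:
    raise SystemExit('collection create failed: ' + str(body))
timeline_id = body['response']['timeline_id']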
+
+# bulk add
+url = 'https://api.twitter.com/1.1/collections/entries/curate.json'
+# split into batches of 100 for the uploads (curate.json accepts at most 100 changes per request)
+n = 100
+batches = [tweet_ids[i:i + n] for i in range(0, len(tweet_ids), n)]
+print(len(batches))
+
+for batch in batches:
+    params_add = {
+        "id": timeline_id,
+        "changes": []
+    }
+    for tweet_id in batch:
+        sub_params_add = {
+            "tweet_id": str(tweet_id),
+            "op": "add"
+        }
+        params_add['changes'].append(sub_params_add)
+
+    r = twitter.post(url, data=json.dumps(params_add))
+    print(r.json())
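To sanity-check the uploads, the collection can be read back through the entries endpoint (a sketch using the same OAuth1Session):

check = twitter.get('https://api.twitter.com/1.1/collections/entries.json',
                    params={'id': timeline_id, 'count': 10})
print(check.json())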
+
+file_name = "_posts/twitter/" + str(date_time) + '-twitter.md'  # relative to the checkout; a leading slash would write outside the workspace
+f = open(file_name, "w+")
+
+str1 = "---\n"
+str2 = 'title: "Twitter Collection – ' + date_time + '"\n'
+str3 = 'description: "Collection of tweets on decentralized identity – ' + date_time + '"\n'
+str4 = "last_modified_at: " + date_time + '\n'
+str5 = "---\n"
+str6 = "\n\n"
+str7 = '<a class="twitter-timeline" href="https://twitter.com/DecentralizeID/timelines/' + response + '">Decentralized Identity - Curated ' + date_time + '</a> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>'
+
+L = [str1, str2, str3, str4, str5, str6, str7]
+f.writelines(L)
+f.close()
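For reference, a run dated 2021-05-08 (illustrative) would write a post like the following, where <numeric id> is the value extracted into `response` above:

---
title: "Twitter Collection – 2021-05-08"
description: "Collection of tweets on decentralized identity – 2021-05-08"
last_modified_at: 2021-05-08
---


<a class="twitter-timeline" href="https://twitter.com/DecentralizeID/timelines/<numeric id>">Decentralized Identity - Curated 2021-05-08</a> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>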