I am using TwitterAPI to extract replies on tweets using conversation_id by following example code mentioned below: The idea is to extract all replies against a list of thousands of conversation_ids, and I do have academic track credentials so the archive search should not be a problem.
from TwitterAPI import TwitterAPI, TwitterOAuth, TwitterRequestError, TwitterConnectionError, TwitterPager
# NOTE: If conversation is over a week old then it will not get returned.
CONVERSATION_ID = '1369393783482236933'
class TreeNode:
def __init__(self, data):
"""data is a tweet's json object"""
self.data = data
self.children = []
self.replied_to_tweet = None
if 'referenced_tweets' in self.data:
for tweet in self.data['referenced_tweets']:
if tweet['type'] == 'replied_to':
self.replied_to_tweet = tweet['id']
break
def id(self):
"""a node is identified by its tweet id"""
return self.data['id']
def parent(self):
"""the reply-to tweet is the parent of the node"""
return self.replied_to_tweet
def find_parent_of(self, node):
"""append a node to the children of it's parent tweet"""
if node.parent() == self.id():
self.children.append(node)
return True
for child in self.children:
if child.find_parent_of(node):
return True
return False
def print_tree(self, level):
"""level 0 is the root node, then incremented for subsequent generations"""
created_at = self.data['created_at']
username = self.data['author_id']['username']
text_80chars = self.data['text'][0:80].replace('\n', ' ')
print(f'{level*"_"}{level}: [{created_at}][{username}] {text_80chars}')
level = 1
for child in reversed(self.children):
child.print_tree(level)
try:
o = TwitterOAuth.read_file()
api = TwitterAPI(o.consumer_key, o.consumer_secret, auth_type='oAuth2', api_version='2')
# GET ROOT OF THE CONVERSATION
r = api.request(f'tweets/:{CONVERSATION_ID}',
{
'expansions':'author_id',
'tweet.fields':'author_id,conversation_id,created_at,referenced_tweets'
},
hydrate_type=HydrateType.APPEND)
for item in r:
root = TreeNode(item)
print(f'ROOT {root.id()}')
# GET ALL REPLIES IN CONVERSATION
# (RETURNED IN REVERSE CHRONOLOGICAL ORDER)
pager = TwitterPager(api, 'tweets/search/recent',
{
'query':f'conversation_id:{CONVERSATION_ID}',
'expansions':'author_id',
'tweet.fields':'author_id,conversation_id,created_at,referenced_tweets'
},
hydrate_type=HydrateType.APPEND)
# "wait=2" means wait 2 seconds between each request.
# The rate limit is 450 requests per 15 minutes, or
# 15*60/450 = 2 seconds.
orphans = []
for item in pager.get_iterator(wait=2):
node = TreeNode(item)
print(f'{node.id()} => {node.parent()}', item['author_id']['username'])
# COLLECT ANY ORPHANS THAT ARE CHILDREN OF THE NEW NODE
orphans = [orphan for orphan in orphans if not node.find_parent_of(orphan)]
# IF THE NEW NODE CANNOT BE PLACED IN TREE, ORPHAN IT UNTIL ITS PARENT IS FOUND
if not root.find_parent_of(node):
orphans.append(node)
print('\nTREE...')
root.print_tree(0)
assert len(orphans) == 0, f'{len(orphans)} orphaned tweets'
except TwitterRequestError as e:
print(e.status_code)
for msg in iter(e):
print(msg)
except TwitterConnectionError as e:
print(e)
except Exception as e:
print(e)
The error is displayed in detail if I comment the last two lines.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
C:\Users\ANEESB~1\AppData\Local\Temp/ipykernel_18696/4104024841.py in <module>
88
89 print('\nTREE...')
---> 90 root.print_tree(0)
91 assert len(orphans) == 0, f'{len(orphans)} orphaned tweets'
92
C:\Users\ANEESB~1\AppData\Local\Temp/ipykernel_18696/4104024841.py in print_tree(self, level)
37 """level 0 is the root node, then incremented for subsequent generations"""
38 created_at = self.data['created_at']
---> 39 username = self.data['author_id']['username']
40 text_80chars = self.data['text'][0:80].replace('\n', ' ')
41 print(f'{level*"_"}{level}: [{created_at}][{username}] {text_80chars}')
TypeError: string indices must be integers
The code is supposed to work, I don't know what is causing the error. Any help please?
CodePudding user response:
Self.data looks like this:
{
'author_id': '3420477195',
'conversation_id': '1369393783482236933',
'created_at': '2021-03-09T21:04:54.000Z',
'text': "Happy one year anniversary to everyone working from home! Do you feel like if you have one more Zoom meeting you’ll rip your hair out? First of all, please don't do that. Second, we're here to save you from Zoom boredom with these new backgrounds!",
'id': '1369393783482236933',
'author_id_hydrate': {'id': '3420477195', 'name': 'Full Frontal', 'username': 'FullFrontalSamB'}
}
author_id
is just a string, the details about the author are in author_id_hydrate
. So self.data['author_id']['username']
should be self.data['author_id_hydrate']['username']