%config InlineBackend.figure_format = 'retina'
#print ('last file should be "'"tweet.js"'"')
#os.listdir(data_folder)[0:11]
# load account tweets from 'tweet.js'
# cutting json relevant part from file
import pprint as pp
import json
import pandas as pd
import pathlib
# from pandas.io.json import json_normalize #package for flattening json in pandas df
# Twitter's archive stores tweets in 'tweet.js' as a JS assignment wrapping a
# JSON array; slice out the bracketed part and parse it.
path = pathlib.Path.cwd() / data_folder / 'tweet.js'
with open(path, encoding='utf-8') as data_file:  # explicit encoding: archive is UTF-8 JSON
    data = data_file.read()
obj = data[data.find('[') : data.rfind(']') + 1]
jsonObj = json.loads(obj)
print (BOLD + 'Number of tweets found in this account: ' + END, len(jsonObj))
#pp.pprint(jsonObj[2])
# https://github.com/amirziai/flatten
from flatten_json import flatten

# Flatten each nested tweet dict into dot-path columns, then build a DataFrame.
flattened = (flatten(tweet) for tweet in jsonObj)
tweets_df = pd.DataFrame(flattened)
#tweets_df.head(2)
# exploring columns
print()
print(HEADER + 'The dataset includes ', len( tweets_df.columns),' columns:' + END)
print('These are some of the column names: \n', sorted(tweets_df.columns[0:50]))
# counting number of hashtags and adding column 'num_of_tags' to *tweets_df*
print ('The following describes the occurence of hashtags, as they are considered an important element to link and structure tweets')
# Hashtag slots produced by flatten_json; filled sequentially 0..6 per tweet.
tagCols = ['entities_hashtags_0_text','entities_hashtags_1_text',
           'entities_hashtags_2_text','entities_hashtags_3_text','entities_hashtags_4_text',
           'entities_hashtags_5_text','entities_hashtags_6_text']
# Vectorized replacement for the previous per-row loop (which used the
# deprecated `.loc[[i],[col]].notnull().bool()`): cumprod(axis=1) zeroes every
# slot after the first null, reproducing the loop's stop-at-first-gap count.
# Guard against archives whose widest tweet has fewer than 7 hashtags.
present_cols = [c for c in tagCols if c in tweets_df.columns]
tweets_df['num_of_tags'] = (
    tweets_df[present_cols].notna().cumprod(axis=1).sum(axis=1).astype(float)
)  # float to match the dtype the old row-wise assignment produced; cast to int below
# ... ['favorite_count','retweet_count','num_of_tags'] formated without decimals
count_cols = ['favorite_count','retweet_count','num_of_tags']
tweets_df[count_cols] = tweets_df[count_cols].astype(int)
# Tweets carrying at least one hashtag: use notna() rather than the
# non-idiomatic `.isnull() == False`.
tweets_df_tagged = tweets_df[tweets_df['entities_hashtags_0_text'].notna()]
print ('Tweets with hashtags: ', len (tweets_df_tagged))
print ('Tweets w/o hashtags: ', len(tweets_df) - len (tweets_df_tagged))
# Number of tweets per hashtag count (any second column works for count()).
tag_count = (
    tweets_df[['num_of_tags', 'retweet_count']]
    .groupby('num_of_tags')
    .count()
    .reset_index(drop=False)
)
tag_count.columns = ['num_of_tags', 'num_of_tweets']
tag_count = tag_count.reset_index(drop=True)
#tag_count
# Mean / std / max of reactions per hashtag count. Column selection after
# groupby uses double brackets: the old tuple form `[...]['a','b']` is
# deprecated and removed in modern pandas.
react_count = tweets_df.groupby(['num_of_tags'])[['favorite_count', 'retweet_count']].mean().reset_index()
#react_count
std_dev = tweets_df.groupby(['num_of_tags'])[['favorite_count', 'retweet_count']].std().reset_index()
#std_dev
tw_max = tweets_df.groupby(['num_of_tags'])[['favorite_count', 'retweet_count']].max().reset_index()
#tw_max
# Assemble a summary table: per number-of-tags, tweet counts plus mean/std/max
# of likes and retweets. .copy() avoids SettingWithCopyWarning when adding
# columns to a slice; the dead `pd.DataFrame()` pre-assignment is dropped.
tag_overview = tag_count[['num_of_tweets']].copy()
tag_overview[['num_of_tags','averg_favs','averg_retw']] = react_count[['num_of_tags','favorite_count','retweet_count']]
tag_overview[['fav_std','ret_std']] = std_dev[['favorite_count','retweet_count']]
tag_overview = tag_overview[['num_of_tags','num_of_tweets','averg_favs','fav_std','averg_retw','ret_std']]
tag_overview[['max_retweet','max_likes']] = tw_max[['retweet_count','favorite_count']]
tag_overview.reset_index(drop=True)  # NOTE: result not assigned — displays in the notebook only
# Top tweets by likes and by retweets (notebook display; only the last
# expression of the cell renders).
top_liked = tweets_df[['favorite_count', 'full_text']].sort_values(by='favorite_count', ascending=False)
top_liked.head(6).style.hide_index()
top_retweeted = tweets_df[['retweet_count', 'full_text']].sort_values(by='retweet_count', ascending=False)
top_retweeted.head(6).style.hide_index()
import matplotlib.pyplot as plt
import seaborn as sns
# or use sns.set_style("ticks")
sns.set(style="whitegrid", context='notebook')
# Bar chart: how many tweets carry 0, 1, 2, ... hashtags.
fig, ax = plt.subplots(figsize=(7, 4))
ax.set_title ('Total number of tweets including 1-n #Hashtags', size = 13)
ax = sns.countplot(x="num_of_tags", data=tweets_df)
#sns.pairplot(tweets_df[['favorite_count','retweet_count','num_of_tags']])
#compact = df[(df["retweet_count"]<4)&(df["favorite_count"]<4)&(df["num_of_tags"]<=4)]
# Subset used by the follow-up bar plots: tweets with at most 4 hashtags.
compact = tweets_df[(tweets_df["num_of_tags"]<=4)]
len(compact)
# Mean retweets per hashtag count, with bootstrap error bars (seaborn default).
fig, ax = plt.subplots(figsize=(7, 4))
ax.set_title ('Average number of #hastags per Tweet, including error-rate \n (only Tweets with no more than 4 hashtags)', size = 13)
ax = sns.barplot(x="num_of_tags", y="retweet_count",data=compact, palette="Set2", capsize=0.1)
# ylim requires some logic
# Summary rows matching the plotted subset (<= 4 hashtags).
tag_overview_comp = tag_overview[tag_overview['num_of_tags']<=4]
# Mean likes per hashtag count; a secondary y-axis overlays the number of
# tweets behind each average.
sns.set(style="whitegrid")
f, ax = plt.subplots(figsize=(7, 4))
# control x and y limits
# NOTE(review): the third positional arg to ylim lands in set_ylim's `emit`
# parameter — likely unintended, though harmless (truthy = default).
plt.ylim(0, 2, 1)
plt.xlim(0, None)
sns.set_style("ticks")
ax.set_title ('Average number of "'"Likes"'" per Tweet, including error-rate \n (only Tweets with no more than 4 hashtags)', size = 13)
ax = sns.barplot(x="num_of_tags", y="favorite_count",data=compact, palette="Set2", capsize=0.1)
# Twin axis shares x; line shows tweet counts from the summary table.
ax2 = ax.twinx()
sns.lineplot(x="num_of_tags", y='num_of_tweets',ax=ax2, data=tag_overview_comp, linewidth=3)
sns.set(style="whitegrid")
plt.show()
print ('There is a second y-axis to indicate the number of actual tweets behind the average. This line should be omitted if published.')
# Same plot as above, but for retweets instead of likes.
sns.set(style="whitegrid")
f, ax = plt.subplots(figsize=(7, 4))
# control x and y limits
# NOTE(review): third positional arg to ylim goes to `emit` — likely unintended.
plt.ylim(0, 2, 1)
plt.xlim(0, None)
sns.set_style("ticks")
ax = sns.barplot(x="num_of_tags", y="retweet_count",data=compact, palette="Set2", capsize=0.1)
ax.set_title ('Average number of "'"Retweets"'" per Tweet, including error-rate \n (only Tweets with no more than 4 hashtags)', size = 13)
# Secondary axis: number of tweets behind each average.
ax2 = ax.twinx()
sns.lineplot(x="num_of_tags", y='num_of_tweets',ax=ax2, data=tag_overview_comp, linewidth=3)
sns.set(style="whitegrid")
The Heatmap for 'retweets' visualizes the frequency of a given 'number of tags' x 'number of retweets'.
import numpy as np

# only about 9.9% are retweeted (chrvoigt)
# NOTE: pivot_heat_retweet_1 is built in an earlier (unseen) cell — presumably
# the retweet pivot with the zero-retweet row dropped; verify upstream.
total_tweets = len(tweets_df)
tagged_tweets = len(tweets_df_tagged)
print('tweets total:', total_tweets)
print('tweets with tags:', tagged_tweets)
print('tagged tweets as percent: {00:.00%}'.format(tagged_tweets / total_tweets))
#print('tweets with zero retweets:',pivot_heat_retweet.iloc[0,:].sum())
retweeted = np.sum(pivot_heat_retweet_1).sum()
print('tweets with one or more retweets:', retweeted)
print('retweets as percent: {00:.00%}'.format(retweeted / total_tweets))
# hight requires logic
# Heatmap: frequency of (number of retweets) x (number of tags), excluding
# the zero-retweet row (pivot_heat_retweet_1 comes from an earlier cell).
print()
fig, ax = plt.subplots(figsize=(5,5))
sns.heatmap(data=pivot_heat_retweet_1,
cmap=sns.color_palette("Blues"),
ax=ax)
plt.show()
The Heatmap for 'likes' visualizes the frequency of a given 'number of tags' x 'number of likes'.
# Pivot: rows = favorite_count, columns = num_of_tags, values = retweet_count
# (here the value column only supplies cell counts for the heatmap).
# Keyword arguments are required: DataFrame.pivot is keyword-only in pandas >= 2.0.
pivot_heat_fav = heat_fav.pivot(index='favorite_count', columns='num_of_tags', values='retweet_count')
#pivot_heat_fav.head(4)
pivot_heat_fav_1 = pivot_heat_fav.iloc[1:]  # drop the zero-favorites row
# only about 9.9% are retweeted
'''
print('tweets total: ',len(tweets_df))
print('tweets without tags: ',len(tweets_df_null))
print('tweets with zero favs: ',pivot_heat_fav.iloc[0,:].sum().astype(int))
'''
print('tweets with one or more likes :', np.sum(pivot_heat_fav_1).sum().astype(int))
print('liked tweets in percent: {00:.00%}'.format(np.sum(pivot_heat_fav_1).sum() / len(tweets_df)))
fig, ax = plt.subplots(figsize=(5, 8))
sns.heatmap(data=pivot_heat_fav_1,
            cmap=sns.color_palette("Blues"),
            ax=ax)
plt.show()
A word cloud is a list of words extracted from a larger text, where the importance of each word is shown by font size or color. This format is useful for quickly perceiving the most prominent terms and judging their relative prominence. Generating a word cloud first requires:
from wordcloud import WordCloud, STOPWORDS
# example of a retweet
# tweets_df.loc[[5],['full_text']]
#Preprocessing del RT @blablabla:
# 'tweetos' = first whitespace-delimited token of each tweet (the "RT @user:"
# prefix for retweets). Vectorized replacement for three per-row loops that
# used chained assignment (df['col'][i] = ...) — which raises
# SettingWithCopyWarning and breaks under pandas copy-on-write — and that
# re-split the entire column on every iteration (quadratic).
tweets_df['tweetos'] = tweets_df['full_text'].str.split(' ').str[0]
# Tokens without an '@' (and non-string/missing texts, which previously fell
# into the AttributeError branch) are bucketed as 'other'.
tweets_df.loc[~tweets_df['tweetos'].str.contains('@', na=False), 'tweetos'] = 'other'
# remove URLs, RTs, and twitter handles
tweets_df['full_text'] = tweets_df['full_text'].apply(
    lambda text: " ".join(word for word in text.split()
                          if 'http' not in word and '@' not in word and '<' not in word)
)
# test
# tweets_df.loc[[5],['full_text']]
# Base stopword list from the wordcloud package, extended with project- and
# language-specific noise words (a set silently dedupes the repeated 'new').
stopwords = set(STOPWORDS)
extra_noise = {'RT', 'via', 'amp', 'telroadmap', 'will', 'telroadmaps',
               'new', 'die', 'need', 'und'}
stopwords |= extra_noise
print(HEADER + 'The dataset uses ', len(stopwords), ' stopwords:' + END)
print(stopwords)
print(HEADER + 'The following wordcloud is based on the top 70 most frequently used words \n' + END)
#cloud mit Maske https://mubaris.com/posts/dataviz-wordcloud/ (&Color)
# requires a google font? DroidSansMono.ttf, https://github.com/amueller/word_cloud
import matplotlib
#colormap - inferno, PuOr, twilight_shifted, overview: https://matplotlib.org/3.1.1/tutorials/colors/colormaps.html
def wordcloud(tweets, col, max_words=70, max_font_size=60, colormap=None):
    """Render a word cloud from the text column *col* of *tweets*.

    The previously hard-coded knobs are now parameters whose defaults match
    the original values, so existing two-argument calls behave identically.

    tweets   -- DataFrame with a string column named *col*
    col      -- column to draw words from
    max_words / max_font_size -- passed through to WordCloud
    colormap -- matplotlib colormap (default: twilight_shifted)

    Uses the module-level ``stopwords`` set.
    """
    if colormap is None:
        colormap = matplotlib.cm.twilight_shifted
    # Local renamed to `cloud` so it no longer shadows this function's name.
    cloud = WordCloud(background_color="white", max_font_size=max_font_size,
                      stopwords=stopwords, max_words=max_words,
                      colormap=colormap).generate(" ".join(tweets[col]))
    plt.figure(figsize=(16, 10), facecolor='darkgrey')
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
wordcloud(tweets_df,'full_text')
Interpreting natural language, especially in the compact format of a tweet, is one of the hardest tasks for machines. It is often overlooked that there are not yet tools that work equally well for all topics, i.e. tweets in a politician's account can differ significantly from tweets sent by a research project. Hence, in the following, a tweet including 'it was hard work' or 'Science books are often boring. We did something different' can cause a tweet to be classified as 'negative'.
The following analysis is based on a model extracted from movie reviews, using naive Bayes as described here: https://textblob.readthedocs.io/en/dev/_modules/textblob/en/sentiments.html (python library)
A good overview about different approaches can be found here: https://www.ijcaonline.org/research/volume125/number3/dandrea-2015-ijca-905866.pdf
#tweets_df.loc[5:7,['full_text']].style
#for tweet in tweets_df['full_text']:
# print(tweet)
Some conceptual questions remain, e.g. what influences retweets, likes, and impressions?
# word clouds https://datascienceplus.com/twitter-analysis-with-python/
# Mapping via Folium: of lon & lat: https://towardsdatascience.com/analysis-of-car-accidents-in-barcelona-using-pandas-matplotlib-and-folium-73384240106b
# combine figures https://towardsdatascience.com/a-step-by-step-guide-for-creating-advanced-python-data-visualizations-with-seaborn-matplotlib-1579d6a1a7d0
# basic Pandas features https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/
# Table styling (e.g. heatmaps) https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html
# Sentiment - naive bayes : https://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python/