Exploratory data analysis

Let's look at the JSON acquired from the Twitter API endpoint. A single tweet looks something like this (from the Twitter API documentation example):

 {
   "coordinates": null,
   "truncated": false,
   "created_at": "Tue Aug 28 19:59:34 +0000 2012",
   "favorited": false,
   "id_str": "240539141056638977",
   "in_reply_to_user_id_str": null,
   "entities": {
     "urls": [],
     "hashtags": [],
     "user_mentions": []
   },
   "text": "You'd be right more often if you thought you were wrong.",
   "contributors": null,
   "id": 240539141056638977,
   "retweet_count": 1,
   "in_reply_to_status_id_str": null,
   "geo": null,
   "retweeted": false,
   "in_reply_to_user_id": null,
   "place": null,
   "source": "web",
   "user": {
     "name": "Taylor Singletary",
     "profile_sidebar_fill_color": "FBFBFB",
     "profile_background_tile": true,
     "profile_sidebar_border_color": "000000",
     "profile_image_url": "http://a0.twimg.com/profile_images/2546730059/f6a8zq58mg1hn0ha8vie_normal.jpeg",
     "created_at": "Wed Mar 07 22:23:19 +0000 2007",
     "location": "San Francisco, CA",
     "follow_request_sent": false,
     "id_str": "819797",
     "is_translator": false,
     "profile_link_color": "c71818",
     "entities": {
       "url": {
         "urls": [
           {
             "expanded_url": "http://www.rebelmouse.com/episod/",
             "url": "http://t.co/Lxw7upbN",
             "indices": [0, 20],
             "display_url": "rebelmouse.com/episod/"
           }
         ]
       },
       "description": {
         "urls": []
       }
     },
     "default_profile": false,
     "url": "http://t.co/Lxw7upbN",
     "contributors_enabled": false,
     "favourites_count": 15990,
     "utc_offset": -28800,
     "profile_image_url_https": "https://si0.twimg.com/profile_images/2546730059/f6a8zq58mg1hn0ha8vie_normal.jpeg",
     "id": 819797,
     "listed_count": 340,
     "profile_use_background_image": true,
     "profile_text_color": "D20909",
     "followers_count": 7126,
     "lang": "en",
     "protected": false,
     "geo_enabled": true,
     "notifications": false,
     "description": "Reality Technician, Twitter API team, synthesizer enthusiast; a most excellent adventure in timelines. I know it's hard to believe in something you can't see.",
     "profile_background_color": "000000",
     "verified": false,
     "time_zone": "Pacific Time (US & Canada)",
     "profile_background_image_url_https": "https://si0.twimg.com/profile_background_images/643655842/hzfv12wini4q60zzrthg.png",
     "statuses_count": 18076,
     "profile_background_image_url": "http://a0.twimg.com/profile_background_images/643655842/hzfv12wini4q60zzrthg.png",
     "default_profile_image": false,
     "friends_count": 5444,
     "following": true,
     "show_all_inline_media": true,
     "screen_name": "episod"
   },
   "in_reply_to_screen_name": null,
   "in_reply_to_status_id": null
 }

We'll represent each individual tweet with a data structure that looks like this:

 type processedTweet struct {
     anaconda.Tweet

     // post-processed fields
     ids         []int // to implement Document
     textVec     []float64
     normTextVec []float64
     location    []float64
     isRT        bool
 }

Note that we embed anaconda.Tweet, which the Anaconda package defines as follows:

 type Tweet struct {
     Contributors                []int64                `json:"contributors"`
     Coordinates                 *Coordinates           `json:"coordinates"`
     CreatedAt                   string                 `json:"created_at"`
     DisplayTextRange            []int                  `json:"display_text_range"`
     Entities                    Entities               `json:"entities"`
     ExtendedEntities            Entities               `json:"extended_entities"`
     ExtendedTweet               ExtendedTweet          `json:"extended_tweet"`
     FavoriteCount               int                    `json:"favorite_count"`
     Favorited                   bool                   `json:"favorited"`
     FilterLevel                 string                 `json:"filter_level"`
     FullText                    string                 `json:"full_text"`
     HasExtendedProfile          bool                   `json:"has_extended_profile"`
     Id                          int64                  `json:"id"`
     IdStr                       string                 `json:"id_str"`
     InReplyToScreenName         string                 `json:"in_reply_to_screen_name"`
     InReplyToStatusID           int64                  `json:"in_reply_to_status_id"`
     InReplyToStatusIdStr        string                 `json:"in_reply_to_status_id_str"`
     InReplyToUserID             int64                  `json:"in_reply_to_user_id"`
     InReplyToUserIdStr          string                 `json:"in_reply_to_user_id_str"`
     IsTranslationEnabled        bool                   `json:"is_translation_enabled"`
     Lang                        string                 `json:"lang"`
     Place                       Place                  `json:"place"`
     QuotedStatusID              int64                  `json:"quoted_status_id"`
     QuotedStatusIdStr           string                 `json:"quoted_status_id_str"`
     QuotedStatus                *Tweet                 `json:"quoted_status"`
     PossiblySensitive           bool                   `json:"possibly_sensitive"`
     PossiblySensitiveAppealable bool                   `json:"possibly_sensitive_appealable"`
     RetweetCount                int                    `json:"retweet_count"`
     Retweeted                   bool                   `json:"retweeted"`
     RetweetedStatus             *Tweet                 `json:"retweeted_status"`
     Source                      string                 `json:"source"`
     Scopes                      map[string]interface{} `json:"scopes"`
     Text                        string                 `json:"text"`
     User                        User                   `json:"user"`
     WithheldCopyright           bool                   `json:"withheld_copyright"`
     WithheldInCountries         []string               `json:"withheld_in_countries"`
     WithheldScope               string                 `json:"withheld_scope"`
 }
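Embedding means anaconda.Tweet's fields are promoted onto processedTweet: we can write tweet.FullText rather than tweet.Tweet.FullText. The sketch below illustrates this with a trimmed-down, two-field stand-in for anaconda.Tweet (so it runs without the Anaconda dependency); the promotion mechanics are identical for the real type:

```go
package main

import "fmt"

// Tweet is a trimmed-down stand-in for anaconda.Tweet,
// used here only to illustrate field promotion.
type Tweet struct {
	FullText  string `json:"full_text"`
	Retweeted bool   `json:"retweeted"`
}

type processedTweet struct {
	Tweet // embedded: its fields are promoted
	isRT  bool
}

func main() {
	pt := &processedTweet{Tweet: Tweet{FullText: "just another test"}}
	// FullText is accessible directly on processedTweet
	// thanks to embedding; no pt.Tweet.FullText needed.
	fmt.Println(pt.FullText)
	// → just another test
}
```

Promotion also means processedTweet satisfies anaconda.Tweet's JSON tags when decoding, which is what lets us unmarshal API responses straight into our own type.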

To get the program off the ground, we'll use the example tweets supplied by Twitter. I saved the example responses into a file called example.json, and wrote a mock function that stands in for calling the API:

 func mock() []*processedTweet {
     f, err := os.Open("example.json")
     dieIfErr(err)
     defer f.Close()
     return load(f)
 }

 func load(r io.Reader) (retVal []*processedTweet) {
     dec := json.NewDecoder(r)
     dieIfErr(dec.Decode(&retVal))
     return retVal
 }

The utility function dieIfErr is defined as usual:

 func dieIfErr(err error) {
     if err != nil {
         log.Fatal(err)
     }
 }

Note that mock makes no API calls to Twitter. Later, we'll create a function with the same signature, so the mock can simply be swapped out for the real version, which acquires the timeline from the API.
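The swap works because both functions share a signature. A minimal, self-contained sketch of the pattern (fromAPI here is a hypothetical placeholder for the eventual Twitter-backed version, not real Anaconda code):

```go
package main

import "fmt"

type processedTweet struct {
	text string
}

// mock stands in for the file-backed loader from this chapter.
func mock() []*processedTweet {
	return []*processedTweet{{text: "just another test"}}
}

// fromAPI is a hypothetical placeholder for the eventual
// Twitter-backed version; it only needs to share mock's
// signature to be swappable.
func fromAPI() []*processedTweet {
	return []*processedTweet{{text: "from the timeline"}}
}

func main() {
	// acquire points at mock for now; swapping in the real
	// implementation later is a one-line change.
	acquire := mock
	fmt.Println(acquire()[0].text)
	// → just another test
}
```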

For now, we can test that this works with the following program:

 func main() {
     tweets := mock()
     for _, tweet := range tweets {
         fmt.Printf("%q ", tweet.FullText)
     }
 }

This is the output I got:

 $ go run *.go
"just another test"
"lecturing at the "analyzing big data with twitter" class at @cal with @othman http://t.co/bfj7zkDJ"
"You'd be right more often if you thought you were wrong."