Twitter の TL を全部 MongoDB にぶち込んでニヤニヤする

はじめに

タイトルの通り、自分の Twitter のタイムラインを全部 DB にぶち込んで後でニヤニヤする方法を紹介します。データの取得と保存は Node.js + MongoDB で行います。
これで MongoDB シェル上で以下の様な形で過去のツイートを検索できるようになります。

> db.posts.find(function(){ return this.text.match(/hoge/); })

環境を整える

タイムラインの取得には以前のエントリ（20 行で作る node.js による Twitter bot 作成講座 - 凹みTips）と同じ方法を使用し、Node.js を用いてStreaming API 経由でリアルタイムに取得します。DB には NoSQL で JavaScript と親和性の高い MongoDB を使い、Node.js からは Mongoose を利用してアクセスします。MongoDB については以下の書籍（無料！）がとても参考になります。

MongoDBの薄い本(The Little MongoDB Book) - cuspy diary

MongoDB の導入の仕方もここに書いてあります。mongoose のインストールは npm 経由で行います（コード中で require している twitter モジュールが未導入の場合は、同様に npm install twitter で導入してください）。

$ mkdir tw2db
$ cd tw2db
$ npm install mongoose

これで環境が整いました。

コードを書く

tw2db.js

var twitter  = require('twitter')
  , mongoose = require('mongoose')
  , Schema   = mongoose.Schema
;

// Lookup table: string returned by `typeof` -> constructor used as the schema type
var typeMap = {
	'number':   Number,
	'string':   String,
	'boolean':  Boolean,
	'object':   Object,
	'function': Function
};

// Build a Mongoose schema definition that mirrors the shape of a sample object.
//
// data    : plain object (e.g. one decoded tweet) used as the sample
// returns : object with the same own keys, each mapped to a schema type:
//             null          -> Object (type cannot be inferred from this sample)
//             array         -> []     (Mongoose array of Mixed)
//             nested object -> nested definition, built recursively
//             anything else -> constructor looked up in typeMap via typeof
function makeSchema(data) {
	var schema = {};
	for (var x in data) {
		// Ignore enumerable properties inherited through the prototype chain
		if (!Object.prototype.hasOwnProperty.call(data, x)) {
			continue;
		}
		var value = data[x];
		if (value === null) {
			schema[x] = Object;
		} else if (Array.isArray(value)) {
			// Do not recurse into arrays: iterating their indices would yield
			// a nested object keyed '0', '1', ... instead of an array path.
			schema[x] = [];
		} else if (typeof value === 'object') {
			schema[x] = makeSchema(value);
		} else {
			schema[x] = typeMap[typeof value];
		}
	}
	return schema;
}

// Open the connection to the local MongoDB database named "Twitter"
mongoose.connect('mongodb://localhost/Twitter');

// Mongoose schema / model holders and a flag telling whether they are set up yet
var PostSchema;
var Post;
var isSchemaDefined = false;

// Connect to Twitter's user stream and store every incoming tweet in MongoDB.
// Replace the placeholder credentials with your own keys / tokens.
new twitter({
	consumer_key        : 'xxxxxxxxxxxxxxxxxxxx',
	consumer_secret     : 'xxxxxxxxxxxxxxxxxxxx',
	access_token_key    : 'xxxxxxxxxxxxxxxxxxxx',
	access_token_secret : 'xxxxxxxxxxxxxxxxxxxx'
}).stream('user', function(stream) {
	stream.on('data', function(data) {
		// Skip messages that carry no `id` (e.g. the initial friends list)
		if ( !('id' in data) ) {
			return;
		}
		console.log(data.user.screen_name, data.text);

		// Build the schema once, from the first tweet received.
		// `strict: false` keeps fields that this first tweet happened not to
		// contain; with the default strict mode Mongoose would drop them
		// from every later tweet.
		if (!isSchemaDefined) {
			PostSchema = new Schema( makeSchema(data), { strict: false } );
			Post       = mongoose.model('Post', PostSchema);
			isSchemaDefined = true;
		}

		// Wrap the raw tweet in a Post document and save it
		var post = new Post(data);
		post.save( function(err) {
			if (err) console.error(err);
		});
	});
});

// Last-resort handler: print any uncaught exception instead of letting the
// process die, so the collector keeps running.
function reportUncaughtException(err) {
	console.log('uncaughtException => ' + err);
}
process.on('uncaughtException', reportUncaughtException);

Consumer Key や Access Token には各自取得したものを入力して下さい。

実行する

$ node tw2db
hecomi ドトる
...（以下略）

こんな感じでつぶやきがどんどん突っ込まれていきます。

ニヤニヤする

MongoDB シェルから突っ込んだデータを確認してみます。まずは DB ができていることを確認して全データ表示してみます。

$ mongo
> show dbs
Twitter	0.203125GB
local	(empty)
test	0.203125GB
> use Twitter
> show collections
posts
system.indexes
> db.posts.find()
{ "text" : "通ってたドトール最近混み始めたしそろそろ別の場所も開拓しないと…。", "in_reply_to_user_id" : null, "truncated" : false, "id_str" : "244337533918523392", "retweeted" : false, "in_reply_to_status_id_str" : null, "source" : "<a href=\"http://sites.google.com/site/yorufukurou/\" rel=\"nofollow\">YoruFukurou</a>", "in_reply_to_screen_name" : null, "in_reply_to_status_id" : null, "in_reply_to_user_id_str" : null, "place" : null, "contributors" : null, "coordinates" : null, "created_at" : "Sat Sep 08 07:33:02 +0000 2012", "favorited" : false, "retweet_count" : 0, "id" : 244337533918523400, "geo" : null, "_id" : ObjectId("504af4bbc8acf9a2ee000003"), "user" : { "follow_request_sent" : null, "contributors_enabled" : false, "location" : "class unreal { hecomi h_; };", "profile_sidebar_border_color" : "97C23A", "profile_image_url_https" : "https://si0.twimg.com/profile_images/1718681986/zzz_normal.png", "id_str" : "15599128", "listed_count" : 42, "verified" : false, "show_all_inline_media" : false, "profile_use_background_image" : true, "profile_image_url" : "http://a0.twimg.com/profile_images/1718681986/zzz_normal.png", "description" : "都内某所に生息 / 音声認識とかアレとか使って部屋をアレにしようという計画を実行中（C++, JavaScript） / 某弾幕ゲー風STG作成中…だった(C++，完全放置） / ゲームしたい（CoD:MW3 など） / ...", "default_profile" : false, "statuses_count" : 12857, "profile_text_color" : "0E4D00", "followers_count" : 437, "lang" : "ja", "profile_background_image_url" : "http://a0.twimg.com/profile_background_images/157175569/x774eac4372c7a8f5a4a8180aab4d296.png", "time_zone" : "Tokyo", "screen_name" : "hecomi", "geo_enabled" : true, "profile_link_color" : "083845", "profile_background_image_url_https" : "https://si0.twimg.com/profile_background_images/157175569/x774eac4372c7a8f5a4a8180aab4d296.png", "created_at" : "Fri Jul 25 15:18:25 +0000 2008", "protected" : false, "default_profile_image" : false, "following" : null, "notifications" : null, "friends_count" : 412, "profile_background_color" : "D4EB51", "url" : "http://d.hatena.ne.jp/hecomi/", "name" : "凹", 
"is_translator" : false, "profile_background_tile" : true, "favourites_count" : 1682, "id" : 15599128, "utc_offset" : 32400, "profile_sidebar_fill_color" : "728661" }, "entities" : { "hashtags" : [ ], "urls" : [ ] }, "__v" : 0 }
（以下略）
has more

こんな感じで保存したデータがズラーと降ってきます。
MongoDB は検索クエリにドット表記をしたり、更には JavaScript を書くこともできるので、こういったことを利用しながら柔軟にデータを取り出すことができます。

特定の人物のつぶやきを検索する

user.screen_name とドット表記を利用して検索します。

> db.posts.find({'user.screen_name': 'hecomi'})

特定のつぶやきを検索する

クエリとして function を指定します。this に各ドキュメントが入っているので true を返したものを表示するように指定してあげればおｋです。

> db.posts.find(function(){ return this.text.match(/hoge/); })

特定の期間のつぶやきを検索する

created_at に呟いた日時が含まれているのですが、「Sat Sep 08 06:40:38 +0000 2012」というように文字列になってしまっています。が、Date に食わせてあげればよしなに変換してくれます。

> db.posts.count( function(){ return new Date(this.created_at) > new Date("2012/09/08 17:00:00"); })
78

上記の例のようにデータの検索だけでなくカウントも簡単です。

検索結果を走査する

forEach を使ってあげれば簡単に検索結果を走査できます。

db.posts.find(function(){ return this.text.match(/hoge/); }).
... forEach( function(doc) {
...   print(doc.user.screen_name + ': ' + doc.text);
... })

おまけのコード解説

Mongoose では、こういう型で構成された情報を扱いますよーという Schema を通じてモデルを作成し、データを扱います。

// Example: a hand-written schema listing each field and its type,
// including a nested `family` object.
var PersonSchema = new Schema({
	name     : String,
	birthday : Date,
	family   : {
		mother  : String,
		father  : String,
		brother : String,
		sister  : String
	}
});
// Declared with `var` so the model does not become an implicit global
var Person = mongoose.model('Person', PersonSchema);

例えばこんな感じです。
Twitter の１つ１つのツイートは上記で見たように、呟いた日時、ジオタグ、ユーザ情報など非常に多くの情報を含んだ形式で降ってきます。いちいち手で Schema を書くのは面倒なので、やってきたオブジェクトを元に自動で作ることにします。

// Build a Mongoose schema definition that mirrors the shape of a sample object.
//
// data    : plain object (e.g. one decoded tweet) used as the sample
// returns : object with the same own keys, each mapped to a schema type:
//             null          -> Object (type cannot be inferred from this sample)
//             array         -> []     (Mongoose array of Mixed)
//             nested object -> nested definition, built recursively
//             anything else -> constructor looked up in typeMap via typeof
function makeSchema(data) {
	var schema = {};
	for (var x in data) {
		// Ignore enumerable properties inherited through the prototype chain
		if (!Object.prototype.hasOwnProperty.call(data, x)) {
			continue;
		}
		var value = data[x];
		if (value === null) {
			schema[x] = Object;
		} else if (Array.isArray(value)) {
			// Do not recurse into arrays: iterating their indices would yield
			// a nested object keyed '0', '1', ... instead of an array path.
			schema[x] = [];
		} else if (typeof value === 'object') {
			schema[x] = makeSchema(value);
		} else {
			schema[x] = typeMap[typeof value];
		}
	}
	return schema;
}

再帰的にやってます。Object でない要素を参照したときは、 typeof で得られる文字列を型に変換します。変換には typeMap 的なものを定義してます。

// Lookup table: string returned by `typeof` -> constructor used as the schema type
var typeMap = {
	'number':   Number,
	'string':   String,
	'boolean':  Boolean,
	'object':   Object,
	'function': Function
};

これらを利用して最初のツイートを元に Schema を生成し、あとは

var post = new Post(data);

とモデルに降ってきたオブジェクトをそのまま食わせてあげればおｋです。

おわりに

これをずっと回しておけばリアルタイムに更新される DB の完成です。次はこの DB を利用して、TL を RSS 化してみたいと思います。