RでLDA – IT研究所

自分用メモ

こちらに R で LDA を実行するサンプルがありますが、そのままではうまく動かなかったので、修正内容をメモしておきます。

環境

MacOS10.11
RStudio 0.99.484
R version 3.3.0

エラー

こちらのスクリプトを順番に実行していくと

# ggplotで可視化
ggplot(topic.proportions.df, aes(x=topic, y=value, fill=document)) + geom_bar() + facet_wrap(~ document, ncol=N) + coord_flip()

1 2	# ggplotで可視化 ggplot(topic.proportions.df, aes(x=topic, y=value, fill=document)) + geom_bar() + facet_wrap(~ document, ncol=N) + coord_flip()

この部分で以下のエラーが出ます

 eval(expr, envir, enclos) でエラー:  オブジェクト 'topic' がありません

1	eval(expr, envir, enclos) でエラー: オブジェクト 'topic' がありません

melt() で作成したデータフレームでは、トピックの列名が topic ではなく variable になっているため、x=variable を指定するように以下のとおり修正します

# ggplotで可視化
ggplot(topic.proportions.df, aes(x=variable, y=value, fill=document)) + geom_bar() + facet_wrap(~ document, ncol=N) + coord_flip()

1 2	# ggplotで可視化 ggplot(topic.proportions.df, aes(x=variable, y=value, fill=document)) + geom_bar() + facet_wrap(~ document, ncol=N) + coord_flip()

次はこのエラーです

 エラー: stat_count() must not be used with a y aesthetic.

1	エラー: stat_count() must not be used with a y aesthetic.

geom_bar() は既定で stat_count()（件数の集計）を使うため、y を指定するとこのエラーになります。ここを参考に、geom_bar(stat="identity") を指定するように修正します

修正スクリプト

# Load the cora corpus (per the original notes: a data set of 2410 scientific
# articles, stored as a list with one element per article).
data(cora.documents)
head(cora.documents, n = 2)

# Vector of the unique words used in the articles (2910 per the original notes).
data(cora.vocab)
head(cora.vocab)

# Vector of the article titles (2410 per the original notes).
data(cora.titles)
head(cora.titles)

# Build a word/count table for the first article.
# Row 1 of a document matrix is used as the word index and row 2 as the
# number of occurrences; the index is shifted by +1 before the lookup in
# cora.vocab (presumably because the stored indices are 0-based -- confirm
# against the lda documentation).
# data.frame() is used instead of as.data.frame(cbind(...)): cbind() of a
# character vector and an integer vector yields a character matrix, which
# silently turned the counts into strings.
first.doc <- cora.documents[[1]]
data_cora <- data.frame(word = cora.vocab[first.doc[1, ] + 1],
                        count = first.doc[2, ],
                        stringsAsFactors = FALSE)
# Shows which words (and how many of each) make up the first cora article.
head(data_cora)

### LDA
# Number of topics to estimate.
k <- 10

# The lda package fits the model with a Gibbs sampler; per the original notes
# the package offers about three Gibbs-sampler-based methods.
result <- lda.collapsed.gibbs.sampler(
  documents = cora.documents,
  K = k,
  vocab = cora.vocab,
  num.iterations = 25, # number of sampling iterations
  alpha = 0.1,         # Dirichlet hyperparameter alpha
  eta = 0.1,           # Dirichlet hyperparameter eta
  compute.log.likelihood = TRUE
)

# Per the original notes, the result is a list of 10 components, including:
#   assignments:   list with the same length as the documents; the values are
#                  the topic number each word was assigned to.
#   topics:        k x V matrix; how many times each word occurs in a topic.
#   topic_sums:    total number of words assigned to each topic.
#   document_sums: k x D matrix; number of words of each document assigned
#                  to each topic.
summary(result)

# Extract the top keywords of each topic; here a matrix holding the three
# highest-ranked words per topic.
top.words <- top.topic.words(result$topics, num.words = 3, by.score = TRUE)
top.words

# Extract the topic proportions of the first N articles only.
N <- 3
# Divide each document's topic counts (one row per document after t()) by
# that document's total word count.
topic.proportions <- sweep(t(result$document_sums), 1,
                           colSums(result$document_sums), "/")
topic.proportions <- topic.proportions[seq_len(N), ]
# Documents without any assigned words give 0 / 0; fall back to a uniform
# proportion over the k topics.
topic.proportions[is.na(topic.proportions)] <- 1 / k

# Label every topic column with its top three words so the topics can be
# interpreted.
colnames(topic.proportions) <- apply(
  top.words, 2,
  function(words) paste(words, collapse = " ")
)
# Wide left margin so the long topic labels fit.
par(mar = c(5, 14, 2, 2))
barplot(
  topic.proportions,
  beside = TRUE,
  horiz = TRUE,
  las = 1,
  xlab = "proportion"
)


###
# Reshape the proportions to long format for ggplot: one row per
# (document, topic) pair, with `document` kept as the id column.
# The melted topic column is deliberately left at melt()'s default name
# `variable`, which is what aes() below refers to. Passing
# variable.name = "topic" renamed the column to `topic` under reshape2 (so
# aes(x = variable) failed) while reshape::melt ignored the argument; the
# default name is the same in both packages.
topic.proportions.df <- melt(
  cbind(data.frame(topic.proportions), document = factor(seq_len(N))),
  id.vars = "document"
)

# Visualise with ggplot. `value` already holds the bar heights, so
# geom_bar() needs stat = "identity"; its default stat_count() rejects a
# y aesthetic.
# http://tutorials.iq.harvard.edu/R/Rgraphics/Rgraphics.html
ggplot(topic.proportions.df, aes(x = variable, y = value, fill = document)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ document, ncol = N) +
  coord_flip()

# Prediction example: predictive distribution computed from the first two
# columns of document_sums, followed by the five top words for each.
predictions <- predictive.distribution(
  document_sums = result$document_sums[, 1:2],
  topics = result$topics,
  alpha = 0.1,
  eta = 0.1
)
top.topic.words(t(predictions), num.words = 5)

# Load the cora corpus (per the original notes: a data set of 2410 scientific
# articles, stored as a list with one element per article).
data(cora.documents)
head(cora.documents, n = 2)

# Vector of the unique words used in the articles (2910 per the original notes).
data(cora.vocab)
head(cora.vocab)

# Vector of the article titles (2410 per the original notes).
data(cora.titles)
head(cora.titles)

# Build a word/count table for the first article.
# Row 1 of a document matrix is used as the word index and row 2 as the
# number of occurrences; the index is shifted by +1 before the lookup in
# cora.vocab (presumably because the stored indices are 0-based -- confirm
# against the lda documentation).
# data.frame() is used instead of as.data.frame(cbind(...)): cbind() of a
# character vector and an integer vector yields a character matrix, which
# silently turned the counts into strings.
first.doc <- cora.documents[[1]]
data_cora <- data.frame(word = cora.vocab[first.doc[1, ] + 1],
                        count = first.doc[2, ],
                        stringsAsFactors = FALSE)
# Shows which words (and how many of each) make up the first cora article.
head(data_cora)

### LDA
# Number of topics to estimate.
k <- 10

# The lda package fits the model with a Gibbs sampler; per the original notes
# the package offers about three Gibbs-sampler-based methods.
# K must be passed explicitly: without it every following positional
# argument shifts by one (the vocabulary would be taken as the topic count)
# and the call fails. Arguments are named so that cannot happen silently.
result <- lda.collapsed.gibbs.sampler(
  documents = cora.documents,
  K = k,
  vocab = cora.vocab,
  num.iterations = 25, # number of sampling iterations
  alpha = 0.1,         # Dirichlet hyperparameter alpha
  eta = 0.1,           # Dirichlet hyperparameter eta
  compute.log.likelihood = TRUE
)

# Per the original notes, the result is a list of 10 components, including:
#   assignments:   list with the same length as the documents; the values are
#                  the topic number each word was assigned to.
#   topics:        k x V matrix; how many times each word occurs in a topic.
#   topic_sums:    total number of words assigned to each topic.
#   document_sums: k x D matrix; number of words of each document assigned
#                  to each topic.
summary(result)

# Extract the top keywords of each topic; here a matrix holding the three
# highest-ranked words per topic.
top.words <- top.topic.words(result$topics, num.words = 3, by.score = TRUE)
top.words

# Extract the topic proportions of the first N articles only.
N <- 3
# Divide each document's topic counts (one row per document after t()) by
# that document's total word count.
topic.proportions <- sweep(t(result$document_sums), 1,
                           colSums(result$document_sums), "/")
topic.proportions <- topic.proportions[seq_len(N), ]
# Documents without any assigned words give 0 / 0; fall back to a uniform
# proportion over the k topics.
topic.proportions[is.na(topic.proportions)] <- 1 / k

# Label every topic column with its top three words so the topics can be
# interpreted.
colnames(topic.proportions) <- apply(
  top.words, 2,
  function(words) paste(words, collapse = " ")
)
# Wide left margin so the long topic labels fit.
par(mar = c(5, 14, 2, 2))
barplot(
  topic.proportions,
  beside = TRUE,
  horiz = TRUE,
  las = 1,
  xlab = "proportion"
)

###
# Reshape the proportions to long format for ggplot: one row per
# (document, topic) pair, with `document` kept as the id column.
# The melted topic column is deliberately left at melt()'s default name
# `variable`, which is what aes() below refers to. Passing
# variable.name = "topic" renamed the column to `topic` under reshape2 (so
# aes(x = variable) failed) while reshape::melt ignored the argument; the
# default name is the same in both packages.
topic.proportions.df <- melt(
  cbind(data.frame(topic.proportions), document = factor(seq_len(N))),
  id.vars = "document"
)

# Visualise with ggplot. `value` already holds the bar heights, so
# geom_bar() needs stat = "identity"; its default stat_count() rejects a
# y aesthetic.
# http://tutorials.iq.harvard.edu/R/Rgraphics/Rgraphics.html
ggplot(topic.proportions.df, aes(x = variable, y = value, fill = document)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ document, ncol = N) +
  coord_flip()

# Prediction example: predictive distribution computed from the first two
# columns of document_sums, followed by the five top words for each.
predictions <- predictive.distribution(
  document_sums = result$document_sums[, 1:2],
  topics = result$topics,
  alpha = 0.1,
  eta = 0.1
)
top.topic.words(t(predictions), num.words = 5)

まとめ

Rのパッケージは時々仕様が変わってしまいますので自分でコードを修正する必要があります。