R – ページ 2 – IT研究所

ubuntu14.04へRの最新版をapt-getでインストール

投稿者: utsubo 投稿日: 2016-07-12 in R

ubuntu14.04にａｐｔ−ｇｅｔでインストールできるRのバージョンは３．０．２とちょっと古いです。
最新版をインストールするには以下のようにします

$ sudo echo "deb http://cran.rstudio.com/bin/linux/ubuntu trusty/" >> /etc/apt/sources.list
$ sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E084DAB9
$ sudo add-apt-repository ppa:marutter/rdev
$ sudo apt-get update
$ sudo apt-get upgrade
$ sudo apt-get install r-base

$ sudo echo "deb http://cran.rstudio.com/bin/linux/ubuntu trusty/" >> /etc/apt/sources.list

$ sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E084DAB9

$ sudo add-apt-repository ppa:marutter/rdev

$ sudo apt-get update

$ sudo apt-get upgrade

$ sudo apt-get install r-base

PROXY環境下の場合には

export http_proxy="http://server:port/"

1	export http_proxy="http://server:port/"

の環境変数を入れる必要があります

ここを参考にしました

RSRubyの使い方

投稿者: utsubo 投稿日: 2016-07-11 in R、ruby

よく忘れるのでメモ

#!/bin/env ruby
# coding:utf-8

require  "rsruby"
require "rsruby/dataframe"

r=RSRuby.instance
r.class_table["data.frame"]=lambda{|x| DataFrame.new(x)}
RSRuby.set_default_mode(RSRuby::CLASS_CONVERSION)

x=[1,2,3]
y=[2,4,6]
df=r.as_data_frame(:x=>{'x'=>x,'y'=>y})
r.assign('df',df)
res=r.eval_R(<<R_COMMAND)
  summary(lm(y ~ x ,df))
R_COMMAND
p res["r.squared"]

#!/bin/env ruby

# coding:utf-8

require "rsruby"

require "rsruby/dataframe"

r=RSRuby.instance

r.class_table["data.frame"]=lambda{|x| DataFrame.new(x)}

RSRuby.set_default_mode(RSRuby::CLASS_CONVERSION)

x=[1,2,3]

y=[2,4,6]

df=r.as_data_frame(:x=>{'x'=>x,'y'=>y})

r.assign('df',df)

res=r.eval_R(<<R_COMMAND)

summary(lm(y ~ x ,df))

R_COMMAND

p res["r.squared"]

RでLDA

投稿者: utsubo 投稿日: 2016-07-04 in R

自分用メモ

こちらのRでLDAのサンプルがありますが、うまく動かなかったので修正メモ

環境

MacOS10.11
RStudio 0.99.484
R version 3.3.0

エラー

こちらのスクリプトを順番に実行していくと

# ggplotで可視化
ggplot(topic.proportions.df, aes(x=topic, y=value, fill=document)) + geom_bar() + facet_wrap(~ document, ncol=N) + coord_flip()

1 2	# ggplotで可視化 ggplot(topic.proportions.df, aes(x=topic, y=value, fill=document)) + geom_bar() + facet_wrap(~ document, ncol=N) + coord_flip()

この部分で以下のエラーが出ます

 eval(expr, envir, enclos) でエラー:  オブジェクト 'topic' がありません

1	eval(expr, envir, enclos) でエラー: オブジェクト 'topic' がありません

以下のように修正します

# ggplotで可視化
ggplot(topic.proportions.df, aes(x=variable, y=value, fill=document)) + geom_bar() + facet_wrap(~ document, ncol=N) + coord_flip()

1 2	# ggplotで可視化 ggplot(topic.proportions.df, aes(x=variable, y=value, fill=document)) + geom_bar() + facet_wrap(~ document, ncol=N) + coord_flip()

次はこのエラーです

 エラー: stat_count() must not be used with a y aesthetic.

1	エラー: stat_count() must not be used with a y aesthetic.

ここを参考に修正します

修正スクリプト

# coraデータの読み込み（2410の科学記事のデータセット。LISTで2410成分ある）
data(cora.documents)
head(cora.documents, n = 2)

# 科学記事で使われているユニーク単語(2910個)のベクトル
data(cora.vocab)
head(cora.vocab)

# 科学記事で使われているタイトル(2410個)のベクトル
data(cora.titles)
head(cora.titles)

# 分析データの作成（トリッキーな参照をしているので注意）
# 1列目がcora.documentsの第一成分で使われる単語のリスト、2列目がその出現回数
data_cora <- as.data.frame(cbind(cora.vocab[cora.documents[[1]][1, ] + 1], cora.documents[[1]][2,]))
# coreの1番目の記事はこれらの単語とその出現回数で構成されていることが分かる。
head(data_cora)

### LDA
# 推定するトピック数の設定
k <- 10

# ldaパッケージはギブスサンプラーでやるようです。
# ギブスサンプラーの中でも3つくらいmethodがあるようです。
result <- lda.collapsed.gibbs.sampler(cora.documents, 
                                      k,
                                      cora.vocab,
                                      25,  # 繰り返し数
                                      0.1, # ディリクレ過程のハイパーパラメータα
                                      0.1, # ディリクレ過程のハイパーパラメータη
                                      compute.log.likelihood=TRUE)

# サマリを見ると、10成分のリストで構成されている。
# assignments：文書Dと同じ長さのリスト。値は単語が割り当てられたトピックNoを示す。
# topic：k × vの行列。値はそのトピックに出現する単語数を表す。
# topic_sums：それぞれのトピックに割り当てられた単語の合計数
# document_sums：k × Dの行列。割り振られたトピックにおける一文章内の単語数を示す。
summary(result)

# 各クラスターでの上位キーワードを抽出する
# 例は各トピックにおける上位3位の単語の行列。
top.words <- top.topic.words(result$topics, 3, by.score=TRUE)
top.words

# 最初の3記事だけトピック割合を抽出してみる
N <- 3
topic.proportions <- t(result$document_sums) / colSums(result$document_sums)
topic.proportions <- topic.proportions[1:N, ]
topic.proportions[is.na(topic.proportions)] <-  1 / k

# 上位3番までのトップワードを用いて列名をつけて、意味付けを行う。
colnames(topic.proportions) <- apply(top.words, 2, paste, collapse=" ")
par(mar=c(5, 14, 2, 2))
barplot(topic.proportions, beside=TRUE, horiz=TRUE, las=1, xlab="proportion")


###
# ggplotで可視化するために、meltを駆使してデータを作成（トリッキーなので注意）
topic.proportions.df <- melt(cbind(data.frame(topic.proportions), document=factor(1:N)), variable.name="topic", id.vars = "document")

# ggplotで可視化  この部分がうまく動かないので修正
#http://tutorials.iq.harvard.edu/R/Rgraphics/Rgraphics.html
ggplot(topic.proportions.df, aes(x=variable, y=value, fill=document)) + geom_bar(stat="identity") + facet_wrap(~ document, ncol=N) + coord_flip()

# 予測はこんな感じ
predictions <- predictive.distribution(result$document_sums[,1:2], result$topics, 0.1, 0.1)
top.topic.words(t(predictions), 5)

# coraデータの読み込み（2410の科学記事のデータセット。LISTで2410成分ある）

data(cora.documents)

head(cora.documents, n = 2)

# 科学記事で使われているユニーク単語(2910個)のベクトル

data(cora.vocab)

head(cora.vocab)

# 科学記事で使われているタイトル(2410個)のベクトル

data(cora.titles)

head(cora.titles)

# 分析データの作成（トリッキーな参照をしているので注意）

# 1列目がcora.documentsの第一成分で使われる単語のリスト、2列目がその出現回数

data_cora <- as.data.frame(cbind(cora.vocab[cora.documents[[1]][1, ] + 1], cora.documents[[1]][2,]))

# coreの1番目の記事はこれらの単語とその出現回数で構成されていることが分かる。

head(data_cora)

### LDA

# 推定するトピック数の設定

k <- 10

# ldaパッケージはギブスサンプラーでやるようです。

# ギブスサンプラーの中でも3つくらいmethodがあるようです。

result <- lda.collapsed.gibbs.sampler(cora.documents,

cora.vocab,

25, # 繰り返し数

0.1, # ディリクレ過程のハイパーパラメータα

0.1, # ディリクレ過程のハイパーパラメータη

compute.log.likelihood=TRUE)

# サマリを見ると、10成分のリストで構成されている。

# assignments：文書Dと同じ長さのリスト。値は単語が割り当てられたトピックNoを示す。

# topic：k × vの行列。値はそのトピックに出現する単語数を表す。

# topic_sums：それぞれのトピックに割り当てられた単語の合計数

# document_sums：k × Dの行列。割り振られたトピックにおける一文章内の単語数を示す。

summary(result)

# 各クラスターでの上位キーワードを抽出する

# 例は各トピックにおける上位3位の単語の行列。

top.words <- top.topic.words(result$topics, 3, by.score=TRUE)

top.words

# 最初の3記事だけトピック割合を抽出してみる

N <- 3

topic.proportions <- t(result$document_sums) / colSums(result$document_sums)

topic.proportions <- topic.proportions[1:N, ]

topic.proportions[is.na(topic.proportions)] <- 1 / k

# 上位3番までのトップワードを用いて列名をつけて、意味付けを行う。

colnames(topic.proportions) <- apply(top.words, 2, paste, collapse=" ")

par(mar=c(5, 14, 2, 2))

barplot(topic.proportions, beside=TRUE, horiz=TRUE, las=1, xlab="proportion")

###

# ggplotで可視化するために、meltを駆使してデータを作成（トリッキーなので注意）

topic.proportions.df <- melt(cbind(data.frame(topic.proportions), document=factor(1:N)), variable.name="topic", id.vars = "document")

# ggplotで可視化この部分がうまく動かないので修正

#http://tutorials.iq.harvard.edu/R/Rgraphics/Rgraphics.html

ggplot(topic.proportions.df, aes(x=variable, y=value, fill=document)) + geom_bar(stat="identity") + facet_wrap(~ document, ncol=N) + coord_flip()

# 予測はこんな感じ

predictions <- predictive.distribution(result$document_sums[,1:2], result$topics, 0.1, 0.1)

top.topic.words(t(predictions), 5)

まとめ

Rのパッケージは時々仕様が変わってしまいますので自分でコードを修正する必要があります。

RでSHA

投稿者: utsubo 投稿日: 2016-02-16 in R

RでSHA1を作成する。serialize=Fを入れるのがポイント

&gt; install.packages("digest")
&gt; library(digest)
&gt; digest("abcdefg",algo="sha1",serialize=F)
[1] "2fb5e13419fc89246865e7a324f476ec624e8740"

> install.packages("digest")

> library(digest)

> digest("abcdefg",algo="sha1",serialize=F)

[1] "2fb5e13419fc89246865e7a324f476ec624e8740"

rubyの出力との比較

$ irb
irb(main):001:0&gt; require 'digest/sha1'
=&gt; true
irb(main):002:0&gt; Digest::SHA1.hexdigest("abcdefg")
=&gt; "2fb5e13419fc89246865e7a324f476ec624e8740"

$ irb

irb(main):001:0> require 'digest/sha1'

=> true

irb(main):002:0> Digest::SHA1.hexdigest("abcdefg")

=> "2fb5e13419fc89246865e7a324f476ec624e8740"

AWS EMR でSparkRを使って見る

投稿者: utsubo 投稿日: 2015-12-03 in AWS、R

AWSEMRとは、SparkやらHiveやらそれら一式を簡単に使える様にしてくれている仕組みです。

ぽちぽちっとEMRでサーバを作成。

この間１０分程度

SparkRでサンプルデータを解析してみます

こちらの内容をアレンジしてみました

http://engineer.recruit-lifestyle.co.jp/techblog/2015-08-19-sparkr/

データ取得

http://stat-computing.org/dataexpo/2009/the-data.html

こちらから２００１、２、３のデータをダウンロード

$ wget http://stat-computing.org/dataexpo/2009/2001.csv.bz2

1	$ wget http://stat-computing.org/dataexpo/2009/2001.csv.bz2

unzip

$ bunzip2 2001.csv.bz2

1	$ bunzip2 2001.csv.bz2

s3にアップロード

$ aws s3 cp 2001.csv s3://samplebucket/airline/

1	$ aws s3 cp 2001.csv s3://samplebucket/airline/

同様に2002,2003も繰り返す

Hive

$ hive
hive> add jar /usr/lib/hive/lib/hive-contrib.jar;
Added [/usr/lib/hive/lib/hive-contrib.jar] to class path
Added resources: [/usr/lib/hive/lib/hive-contrib.jar]
hive> create table airline(
		> Year STRING,
		> Month STRING,
		> DayofMonth STRING,
		> DayOfWeek STRING,
		> DepTime STRING,
		> CRSDepTime STRING,
		> ArrTime STRING,
		> CRSArrTime STRING,
		> UniqueCarrier STRING,
		> FlightNum STRING,
		> TailNum STRING,
		> ActualElapsedTime STRING,
		> CRSElapsedTime STRING,
		> AirTime STRING,
		> ArrDelay STRING,
		> DepDelay STRING,
		> Origin STRING,
		> Dest STRING,
		> Distance STRING,
		> TaxiIn STRING,
		> TaxiOut STRING,
		> Cancelled STRING,
		> CancellationCode STRING,
		> Diverted STRING,
		> CarrierDelay STRING,
		> WeatherDelay STRING,
		> NASDelay STRING,
		> SecurityDelay STRING,
		> LateAircraftDelay STRING
		> )
		> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
		> LOCATION 's3://samplebucket/airline/' tblproperties ("skip.header.line.count"="1");
hive> select * from airline limit 1;
OK
2001	1	17	3	1806	1810	1931	1934	US	375	N700&#65533;&#65533;	85	84	60	-3	-4	BWI	CLT	361	5	20	0	NA	0	NA	NA	NA	NA	NA

$ hive

hive> add jar /usr/lib/hive/lib/hive-contrib.jar;

Added [/usr/lib/hive/lib/hive-contrib.jar] to class path

Added resources: [/usr/lib/hive/lib/hive-contrib.jar]

hive> create table airline(

> Year STRING,

> Month STRING,

> DayofMonth STRING,

> DayOfWeek STRING,

> DepTime STRING,

> CRSDepTime STRING,

> ArrTime STRING,

> CRSArrTime STRING,

> UniqueCarrier STRING,

> FlightNum STRING,

> TailNum STRING,

> ActualElapsedTime STRING,

> CRSElapsedTime STRING,

> AirTime STRING,

> ArrDelay STRING,

> DepDelay STRING,

> Origin STRING,

> Dest STRING,

> Distance STRING,

> TaxiIn STRING,

> TaxiOut STRING,

> Cancelled STRING,

> CancellationCode STRING,

> Diverted STRING,

> CarrierDelay STRING,

> WeatherDelay STRING,

> NASDelay STRING,

> SecurityDelay STRING,

> LateAircraftDelay STRING

> )

> ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'

> LOCATION 's3://samplebucket/airline/' tblproperties ("skip.header.line.count"="1");

hive> select * from airline limit 1;

2001 1 17 3 1806 1810 1931 1934 US 375 N700�� 85 84 60 -3 -4 BWI CLT 361 5 20 0 NA 0 NA NA NA NA NA

SparkR

$ sparkR
> install.packages("magrittr")
> library(magrittr)
> hiveContext <- sparkRHive.init(sc)
> airline<-sql(hiveContext,"select * from airline")
> class(airline)
[1] "DataFrame"
attr(,"package")
[1] "SparkR"
> airline %>%
+	 filter(airline$Origin == "JFK") %>%
+	 group_by(airline$Dest) %>%
+	 agg(count=n(airline$Dest)) %>%
+	 head
	Dest count																																		
1	IAH	1214
2	STL	2922
3	SNA	 805
4	MSP	1580
5	STT	1085
6	SAN	2723

$ sparkR

> install.packages("magrittr")

> library(magrittr)

> hiveContext <- sparkRHive.init(sc)

> airline<-sql(hiveContext,"select * from airline")

> class(airline)

[1] "DataFrame"

attr(,"package")

[1] "SparkR"

> airline %>%

+ filter(airline$Origin == "JFK") %>%

+ group_by(airline$Dest) %>%

+ agg(count=n(airline$Dest)) %>%

+ head

Dest count

1 IAH 1214

2 STL 2922

3 SNA 805

4 MSP 1580

5 STT 1085

6 SAN 2723

こんな感じで簡単にできました

RでDeeplearning

投稿者: utsubo 投稿日: 2015-10-30 in R

最近、Deeplearning、いいキーワードになっていますね。

これで解析さえすればバンバン売れる！なんて事は無いと思いますが、

Rで使い方を調べてみました。

h2oパッケージというものをRから使うことになります。

環境

R version 3.2.2
MacOS 10.11.1
jdk 1.8.0_40

h2oインストール

こちらを参考にインストール

http://d.hatena.ne.jp/dichika/20140503/p1

install.packages("h2o", repos=(c("http://s3.amazonaws.com/h2o-release/h2o/rel-kahan/5/R", getOption("repos"))))

1	install.packages("h2o", repos=(c("http://s3.amazonaws.com/h2o-release/h2o/rel-kahan/5/R", getOption("repos"))))

ubuntuなどでこのようなエラーが出た時には以下のように足りないものをインストールします

ERROR: configuration failed for package 'RCurl'

1	ERROR: configuration failed for package 'RCurl'

$ sudo apt-get install libcurl4-openssl-dev

1	$ sudo apt-get install libcurl4-openssl-dev

install.packages("RCurl")

install.packages("h2o", repos=(c("http://s3.amazonaws.com/h2o-release/h2o/rel-kahan/5/R", getOption("repos"))))

install.packages("RCurl")

install.packages("h2o", repos=(c("http://s3.amazonaws.com/h2o-release/h2o/rel-kahan/5/R", getOption("repos"))))

Deeplearning

こちらのコードにh2oのDeeplearningを追加します

http://yut.hatenablog.com/entry/20120827/1346024147

library( kernlab )
data(spam)
rowdata<-nrow(spam)
random_ids<-sample(rowdata,rowdata*0.5)
spam_training<-spam[random_ids,]
spam_predicting<-spam[-random_ids,]

#svm
library( kernlab )
spam_svm<-ksvm(type ~., data=spam_training )
spam_predict<-predict(spam_svm,spam_predicting[,-58])
table(spam_predict, spam_predicting[,58])

# nnet
library( nnet )
spam_nn<-nnet(type ~., data=spam_training,size = 2, rang = .1, decay = 5e-4, maxit = 200 )
spam_predict<-predict(spam_nn,spam_predicting[,-58],type="class")
table(spam_predict, spam_predicting[,58])

# naivebayes
library( e1071 )
spam_nn<-naiveBayes(type ~., data=spam_training)
spam_predict<-predict(spam_nn,spam_predicting[,-58],type="class")
table(spam_predict, spam_predicting[,58])


# deeplearning
library(h2o)
localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE)
spam_h2o<-h2o.deeplearning(x=1:57,y=58,training_frame=as.h2o(spam_training))
spam_predict<-h2o.predict(spam_h2o,as.h2o(spam_predicting[,-58]))
table(as.data.frame(spam_predict)[,1],spam_predicting[,58])
h2o.shutdown(localH2O)

library( kernlab )

data(spam)

rowdata<-nrow(spam)

random_ids<-sample(rowdata,rowdata*0.5)

spam_training<-spam[random_ids,]

spam_predicting<-spam[-random_ids,]

#svm

library( kernlab )

spam_svm<-ksvm(type ~., data=spam_training )

spam_predict<-predict(spam_svm,spam_predicting[,-58])

table(spam_predict, spam_predicting[,58])

# nnet

library( nnet )

spam_nn<-nnet(type ~., data=spam_training,size = 2, rang = .1, decay = 5e-4, maxit = 200 )

spam_predict<-predict(spam_nn,spam_predicting[,-58],type="class")

table(spam_predict, spam_predicting[,58])

# naivebayes

library( e1071 )

spam_nn<-naiveBayes(type ~., data=spam_training)

spam_predict<-predict(spam_nn,spam_predicting[,-58],type="class")

table(spam_predict, spam_predicting[,58])

# deeplearning

library(h2o)

localH2O = h2o.init(ip = "localhost", port = 54321, startH2O = TRUE)

spam_h2o<-h2o.deeplearning(x=1:57,y=58,training_frame=as.h2o(spam_training))

spam_predict<-h2o.predict(spam_h2o,as.h2o(spam_predicting[,-58]))

table(as.data.frame(spam_predict)[,1],spam_predicting[,58])

h2o.shutdown(localH2O)

結果

SVM

spam_predict nonspam spam
		 nonspam		1338	117
		 spam				 62	784

spam_predict nonspam spam

nonspam 1338 117

spam 62 784

(1338+784)/(1338+117+62+784)=0.9222077

nnet

spam_predict nonspam spam
		 nonspam		1313	 86
		 spam				 87	815

spam_predict nonspam spam

nonspam 1313 86

spam 87 815

(1313+815)/(1313+86+87+815)=0.9248153

naivebayes

spam_predict nonspam spam
		 nonspam		 752	 51
		 spam				648	850

spam_predict nonspam spam

nonspam 752 51

spam 648 850

(752+850)/(752+51+648+850)=0.696219

h2o

					nonspam spam
	nonspam		1321	 83
	spam				 74	823

nonspam spam

nonspam 1321 83

spam 74 823

(1321+823)/(1321+83+74+823)=0.9317688

Deeplearningが一番正解率高いですね

Rでチャートを書いてみる(9)

投稿者: utsubo 投稿日: 2015-05-13 in R

Rで作成したチャートをファイル保存する際にちょっとハマってしまったのでメモ

チャートを作成する際に、銘柄コードでグルグル回して作成したい場合があります。その際に、ロウソク足だけ、とかなら大丈夫なのですが、その上に重ね合わせたりする場合にファイル作成時にはうまくいかないことがあります。

ロウソク足

これはＯＫです

png("file.png")
candleChart(ohlc,theme="white")
dev.off()

png("file.png")

candleChart(ohlc,theme="white")

dev.off()

ロウソク足＋α

これだとpointsが描かれない

png("file.png")
candleChart(ohlc,theme="white")
addTA(points,on=1,col="red",type="b")
dev.off()

png("file.png")

candleChart(ohlc,theme="white")

addTA(points,on=1,col="red",type="b")

dev.off()

こうすればうまくファイルに出力されます

candleChart(ohlc,theme="white")
plot(addTA(points,on=1,col="red",type="b")
dev.copy(png,"file.png")
dev.off()

candleChart(ohlc,theme="white")

plot(addTA(points,on=1,col="red",type="b")

dev.copy(png,"file.png")

dev.off()

この辺りを参考にしました

http://stackoverflow.com/questions/18556548/is-it-possible-to-build-a-quantmod-chart-incrementally-and-export-the-final-resu

http://stackoverflow.com/questions/18342703/r-appears-to-fail-to-execute-a-line-in-a-function/18342756#18342756

Rでチャートを書いてみる(9)

投稿者: utsubo 投稿日: 2015-01-20 in R 1件のコメント

どうやらまたYahooの時系列データの仕様が変わったようです。

http://d.hatena.ne.jp/anagotan/20140816/1408276789

こちらで書いたものが使えなくなっていました。具体的には先頭（直近）のページのみのデータしか拾えていません。

そこでちょっとまたまた、いじってみました。

RFinanceYJを作った方の真意わかりませんがquoteTsData関数の

while( result.num >= 51 ){
..
result.num <- xmlSize(quote.table)

while( result.num >= 51 ){

result.num <- xmlSize(quote.table)

この部分で次ページがあるかどうか判定していたのがうまく作用しなくなっています。

そこで下記のように修正してみました。

library(RFinanceYJ)
#API
quoteStockTsData <- function(x, since=NULL,start.num=0,date.end=NULL,time.interval='daily')
{
	time.interval <- substr(time.interval,1,1)
	function.stock <- function(quote.table.item){
		if( xmlSize(quote.table.item) < 5) return(NULL) 
		d <- convertToDate(xmlValue(quote.table.item[[1]]),time.interval)
		o <- as.number(xmlValue(quote.table.item[[2]]))
		h <- as.number(xmlValue(quote.table.item[[3]]))
		l <- as.number(xmlValue(quote.table.item[[4]]))
		c <- as.number(xmlValue(quote.table.item[[5]]))
		v <- ifelse(xmlSize(quote.table.item) >= 6,as.number(xmlValue(quote.table.item[[6]])),0)
		a <- ifelse(xmlSize(quote.table.item) >= 7,as.number(xmlValue(quote.table.item[[7]])),0)
		return(data.frame(date=d,open=o,high=h,low=l,close=c,volume=v, adj_close=a))
	}
	return(quoteTsData(x,function.stock,since,start.num,date.end,time.interval,type="stock"))
}
quoteFundTsData <- function(x, since=NULL,start.num=0,date.end=NULL,time.interval='daily')
{
	time.interval <- substr(time.interval,1,1)
	function.fund <- function(quote.table.item){
		d <- convertToDate(xmlValue(quote.table.item[[1]]),time.interval)
		if(time.interval=='monthly'){
			d <- endOfMonth(d)
		}
		c <- as.number(xmlValue(quote.table.item[[2]]))
		v <- as.number(xmlValue(quote.table.item[[3]]))
		return(data.frame(date=d,constant.value=c,NAV=v))
	}
	return(quoteTsData(x,function.fund,since,start.num,date.end,time.interval,type="fund"))
}
quoteFXTsData <- function(x, since=NULL,start.num=0,date.end=NULL,time.interval='daily')
{
	time.interval <- substr(time.interval,1,1)
	function.fx <- function(quote.table.item){
		d <- convertToDate(xmlValue(quote.table.item[[1]]),time.interval)
		o <- as.number(xmlValue(quote.table.item[[2]]))
		h <- as.number(xmlValue(quote.table.item[[3]]))
		l <- as.number(xmlValue(quote.table.item[[4]]))
		c <- as.number(xmlValue(quote.table.item[[5]]))
		return(data.frame(date=d,open=o,high=h,low=l,close=c))
	}
	return(quoteTsData(x,function.fx,since,start.num,date.end,time.interval,type="fx"))
}
######	private functions	#####
#get time series data from Yahoo! Finance.
quoteTsData <- function(x,function.financialproduct,since,start.num,date.end,time.interval,type="stock"){
	r <- NULL
	result.num <- 51
	financial.data <- data.frame(NULL)
	#start <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&c=\\1&a=\\2&b=\\3",since))
	#end	 <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&f=\\1&d=\\2&e=\\3",date.end))
	start <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&sy=\\1&sm=\\2&sd=\\3",since))
	end	 <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&ey=\\1&em=\\2&ed=\\3",date.end))
	
	if(!any(time.interval==c('d','w','m'))) stop("Invalid time.interval value")
	
	extractQuoteTable <- function(r,type){
		if(type %in% c("fund","fx")){
			tbl <- r[[2]][[2]][[7]][[3]][[3]][[9]][[2]]
		}
		else{
			tbl <- r[[2]][[2]][[7]][[3]][[3]][[10]][[2]]
		}
		return(tbl)
	}
	
	#while( result.num >= 51 ){
	while(1){
		start.num <- start.num + 1
		quote.table <- NULL
		quote.url <- paste('http://info.finance.yahoo.co.jp/history/?code=',x,start,end,'&p=',start.num,'&tm=',substr(time.interval,1,1),sep="")
		#cat(quote.url)
		#try( r <- xmlRoot(htmlTreeParse(quote.url,error=xmlErrorCumulator(immediate=F))), TRUE)	# これだと取得時にエラーが出た。。
		try(r<-htmlParse(quote.url))
		if( is.null(r) ) stop(paste("Can not access :", quote.url))
		
		#try( quote.table <- r[[2]][[1]][[1]][[16]][[1]][[1]][[1]][[4]][[1]][[1]][[1]], TRUE )
		#try( quote.table <- extractQuoteTable(r,type), TRUE )
		try( quote.table <- xpathApply(r,"//table")[[2]], TRUE )
 
		quote.size<-xmlSize(quote.table)
		#cat(paste("size:",quote.size))
		if(xmlSize(quote.table)<=1){
			return (financial.data)
		}
		if( is.null(quote.table) ){
			if( is.null(financial.data) ){
				stop(paste("Can not quote :", x))
			}else{
				financial.data <- financial.data[order(financial.data$date),]
				return(financial.data)
			}
		}
		
		size <- xmlSize(quote.table)
		for(i in 2:size){
			financial.data <- rbind(financial.data,function.financialproduct(quote.table[[i]]))
		}
		
		#result.num <- xmlSize(quote.table)
		Sys.sleep(1)
	}
	financial.data <- financial.data[order(financial.data$date),]
	return(financial.data)	
}
#convert string formart date to POSIXct object
convertToDate <- function(date.string,time.interval)
{
	#data format is different between monthly and dialy or weekly
	if(any(time.interval==c('d','w'))){
		result <- gsub("^([0-9]{4})([^0-9]+)([0-9]{1,2})([^0-9]+)([0-9]{1,2})([^0-9]+)","\\1-\\3-\\5",date.string)
	}else if(time.interval=='m'){
		result <- gsub("^([0-9]{4})([^0-9]+)([0-9]{1,2})([^0-9]+)","\\1-\\3-01",date.string)
	}
	return(as.POSIXct(result))
}
#convert string to number.
as.number <- function(string)
{
	return(as.double(as.character(gsub("[^0-9.]", "",string))))
}
#return end of month date.
endOfMonth <- function(date.obj)
{
	startOfMonth		 <- as.Date(format(date.obj,"%Y%m01"),"%Y%m%d")
	startOfNextMonth <- as.Date(format(startOfMonth+31,"%Y%m01"),"%Y%m%d")
	return(startOfNextMonth-1)
}

quoteStockTsData("6758.t",since="2014-01-01")

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

library(RFinanceYJ)

#API

quoteStockTsData <- function(x, since=NULL,start.num=0,date.end=NULL,time.interval='daily')

{

time.interval <- substr(time.interval,1,1)

function.stock <- function(quote.table.item){

if( xmlSize(quote.table.item) < 5) return(NULL)

d <- convertToDate(xmlValue(quote.table.item[[1]]),time.interval)

o <- as.number(xmlValue(quote.table.item[[2]]))

h <- as.number(xmlValue(quote.table.item[[3]]))

l <- as.number(xmlValue(quote.table.item[[4]]))

c <- as.number(xmlValue(quote.table.item[[5]]))

v <- ifelse(xmlSize(quote.table.item) >= 6,as.number(xmlValue(quote.table.item[[6]])),0)

a <- ifelse(xmlSize(quote.table.item) >= 7,as.number(xmlValue(quote.table.item[[7]])),0)

return(data.frame(date=d,open=o,high=h,low=l,close=c,volume=v, adj_close=a))

}

return(quoteTsData(x,function.stock,since,start.num,date.end,time.interval,type="stock"))

}

quoteFundTsData <- function(x, since=NULL,start.num=0,date.end=NULL,time.interval='daily')

{

time.interval <- substr(time.interval,1,1)

function.fund <- function(quote.table.item){

d <- convertToDate(xmlValue(quote.table.item[[1]]),time.interval)

if(time.interval=='monthly'){

d <- endOfMonth(d)

}

c <- as.number(xmlValue(quote.table.item[[2]]))

v <- as.number(xmlValue(quote.table.item[[3]]))

return(data.frame(date=d,constant.value=c,NAV=v))

}

return(quoteTsData(x,function.fund,since,start.num,date.end,time.interval,type="fund"))

}

quoteFXTsData <- function(x, since=NULL,start.num=0,date.end=NULL,time.interval='daily')

{

time.interval <- substr(time.interval,1,1)

function.fx <- function(quote.table.item){

d <- convertToDate(xmlValue(quote.table.item[[1]]),time.interval)

o <- as.number(xmlValue(quote.table.item[[2]]))

h <- as.number(xmlValue(quote.table.item[[3]]))

l <- as.number(xmlValue(quote.table.item[[4]]))

c <- as.number(xmlValue(quote.table.item[[5]]))

return(data.frame(date=d,open=o,high=h,low=l,close=c))

}

return(quoteTsData(x,function.fx,since,start.num,date.end,time.interval,type="fx"))

}

###### private functions #####

#get time series data from Yahoo! Finance.

quoteTsData <- function(x,function.financialproduct,since,start.num,date.end,time.interval,type="stock"){

r <- NULL

result.num <- 51

financial.data <- data.frame(NULL)

#start <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&c=\\1&a=\\2&b=\\3",since))

#end <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&f=\\1&d=\\2&e=\\3",date.end))

start <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&sy=\\1&sm=\\2&sd=\\3",since))

end <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&ey=\\1&em=\\2&ed=\\3",date.end))

if(!any(time.interval==c('d','w','m'))) stop("Invalid time.interval value")

extractQuoteTable <- function(r,type){

if(type %in% c("fund","fx")){

tbl <- r[[2]][[2]][[7]][[3]][[3]][[9]][[2]]

}

else{

tbl <- r[[2]][[2]][[7]][[3]][[3]][[10]][[2]]

}

return(tbl)

}

#while( result.num >= 51 ){

while(1){

start.num <- start.num + 1

quote.table <- NULL

quote.url <- paste('http://info.finance.yahoo.co.jp/history/?code=',x,start,end,'&p=',start.num,'&tm=',substr(time.interval,1,1),sep="")

#cat(quote.url)

#try( r <- xmlRoot(htmlTreeParse(quote.url,error=xmlErrorCumulator(immediate=F))), TRUE) # これだと取得時にエラーが出た。。

try(r<-htmlParse(quote.url))

if( is.null(r) ) stop(paste("Can not access :", quote.url))

#try( quote.table <- r[[2]][[1]][[1]][[16]][[1]][[1]][[1]][[4]][[1]][[1]][[1]], TRUE )

#try( quote.table <- extractQuoteTable(r,type), TRUE )

try( quote.table <- xpathApply(r,"//table")[[2]], TRUE )

quote.size<-xmlSize(quote.table)

#cat(paste("size:",quote.size))

if(xmlSize(quote.table)<=1){

return (financial.data)

}

if( is.null(quote.table) ){

if( is.null(financial.data) ){

stop(paste("Can not quote :", x))

}else{

financial.data <- financial.data[order(financial.data$date),]

return(financial.data)

}

size <- xmlSize(quote.table)

for(i in 2:size){

financial.data <- rbind(financial.data,function.financialproduct(quote.table[[i]]))

}

#result.num <- xmlSize(quote.table)

Sys.sleep(1)

}

financial.data <- financial.data[order(financial.data$date),]

return(financial.data)

}

#convert string formart date to POSIXct object

convertToDate <- function(date.string,time.interval)

{

#data format is different between monthly and dialy or weekly

if(any(time.interval==c('d','w'))){

result <- gsub("^([0-9]{4})([^0-9]+)([0-9]{1,2})([^0-9]+)([0-9]{1,2})([^0-9]+)","\\1-\\3-\\5",date.string)

}else if(time.interval=='m'){

result <- gsub("^([0-9]{4})([^0-9]+)([0-9]{1,2})([^0-9]+)","\\1-\\3-01",date.string)

}

return(as.POSIXct(result))

}

#convert string to number.

as.number <- function(string)

{

return(as.double(as.character(gsub("[^0-9.]", "",string))))

}

#return end of month date.

endOfMonth <- function(date.obj)

{

startOfMonth <- as.Date(format(date.obj,"%Y%m01"),"%Y%m%d")

startOfNextMonth <- as.Date(format(startOfMonth+31,"%Y%m01"),"%Y%m%d")

return(startOfNextMonth-1)

}

quoteStockTsData("6758.t",since="2014-01-01")

以前と同じようにRのコンソールにコピーアンドペーストで実行

> quoteStockTsData("6758.t",since="2014-01-01")
					date	 open	 high		low	close	 volume adj_close
1	 2015-01-20 2430.0 2466.5 2397.5 2462.5	8926300		2462.5
2	 2015-01-19 2425.0 2448.5 2402.0 2443.5	7436000		2443.5
3	 2015-01-16 2426.0 2437.5 2351.5 2384.0 15055500		2384.0
4	 2015-01-15 2453.5 2511.0 2452.5 2500.0	7843800		2500.0
..
252 2014-01-09 1898.0 1920.0 1870.0 1894.0 37843200		1894.0
253 2014-01-08 1799.0 1827.0 1788.0 1825.0 10182500		1825.0
254 2014-01-07 1820.0 1820.0 1792.0 1800.0	7516600		1800.0
255 2014-01-06 1815.0 1830.0 1787.0 1802.0 10114200		1802.0
>

> quoteStockTsData("6758.t",since="2014-01-01")

date open high low close volume adj_close

1 2015-01-20 2430.0 2466.5 2397.5 2462.5 8926300 2462.5

2 2015-01-19 2425.0 2448.5 2402.0 2443.5 7436000 2443.5

3 2015-01-16 2426.0 2437.5 2351.5 2384.0 15055500 2384.0

4 2015-01-15 2453.5 2511.0 2452.5 2500.0 7843800 2500.0

252 2014-01-09 1898.0 1920.0 1870.0 1894.0 37843200 1894.0

253 2014-01-08 1799.0 1827.0 1788.0 1825.0 10182500 1825.0

254 2014-01-07 1820.0 1820.0 1792.0 1800.0 7516600 1800.0

255 2014-01-06 1815.0 1830.0 1787.0 1802.0 10114200 1802.0

無事取れるようになりました

「構文解析中に不正なマルチバイト文字列がありました」のエラー

投稿者: utsubo 投稿日: 2014-10-07 in R

ちょっとはまってしまったのでメモ

Rの関数にsource関数というものがあります。予め共通処理を書いておいたファイルを読み込む機能です。サブルーチンみたいなものでしょうか？

そこでsourceする際に、エラーが出る場合があります

&gt; source("http://localhost/test.R")
以下にエラー source("http://localhost/test.R") :
構文解析中に不正なマルチバイト文字列がありました (102 行)
追加情報:	警告メッセージ:
In grepl("\n", lines, fixed = TRUE) :
入力文字列 102 はこのロケールでは不適切です

> source("http://localhost/test.R")

以下にエラー source("http://localhost/test.R") :

構文解析中に不正なマルチバイト文字列がありました (102 行)

追加情報: 警告メッセージ:

In grepl("\n", lines, fixed = TRUE) :

入力文字列 102 はこのロケールでは不適切です

どうやら、エンコーディングの問題みたいです

使っているファイルのエンコーディングに合わせて明示的に示してやる必要があります

&gt; source("http://localhost/test.R",encoding="utf-8")

1	> source("http://localhost/test.R",encoding="utf-8")

Rでチャートを書いてみる(8)

投稿者: utsubo 投稿日: 2014-08-23 in R

前回までのスクリプトをぐるぐる回すようにしてみました。

一銘柄ずつとると時間がかかるので並列で動くようにしています

RFinanceYJPatch.R

quoteStockMasterTsData <- function(){

	financial.data <- data.frame(NULL)

	function.stockMasterData<-function(hira){
		r <- NULL
		result.num <- 20
		master.data <- data.frame(NULL)
		start.num<-0
		while( result.num >= 20 ){
			start.num <- start.num + 1
			quote.table <- NULL
			quote.url <- paste('http://stocks.finance.yahoo.co.jp/stocks/qi/?js=',hira,'&p=',start.num,sep="")
		
			try( r <- xmlRoot(htmlTreeParse(quote.url,error=xmlErrorCumulator(immediate=F))), TRUE)
			if( is.null(r) ) stop(paste("Can not access :", quote.url))

			try( quote.table <- xpathApply(r,"//a[contains(@href,'/stocks/detail')]"), TRUE )
			
			if( is.null(quote.table) ){
				if( is.null(master.data) ){
					stop(paste("Can not quote :", x))
				}else{
					return(master.data)
				}
			}

			size <- xmlSize(quote.table)/3
			if(size==0){
					return(master.data)
			}
			for(i in 1:size){
				mtmp<-data.frame(code=xmlValue(quote.table[[i*3-2]]),name=xmlValue(quote.table[[i*3-1]]))
				mtmp$code<-as.character(mtmp$code)
				mtmp$name<-as.character(mtmp$name)
				master.data <- rbind(master.data,mtmp)
			}
			
			result.num <- xmlSize(quote.table)/3
			Sys.sleep(1)
		}
		return(master.data)
	}

	hiraList<-"あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよわ"
	for(i in 1:nchar(hiraList)){
		hira<-substring(hiraList,i,i)
		master.data<-function.stockMasterData(hira)
		financial.data<-rbind(financial.data,master.data)

	}
	financial.data <- financial.data[order(financial.data$code),]
	return(financial.data)	

}


#API
quoteStockTsData <- function(x, since=NULL,start.num=0,date.end=NULL,time.interval='daily')
{
	time.interval <- substr(time.interval,1,1)
	function.stock <- function(quote.table.item){
		if( xmlSize(quote.table.item) < 5) return(NULL) 
		d <- convertToDate(xmlValue(quote.table.item[[1]]),time.interval)
		o <- as.number(xmlValue(quote.table.item[[2]]))
		h <- as.number(xmlValue(quote.table.item[[3]]))
		l <- as.number(xmlValue(quote.table.item[[4]]))
		c <- as.number(xmlValue(quote.table.item[[5]]))
		v <- ifelse(xmlSize(quote.table.item) >= 6,as.number(xmlValue(quote.table.item[[6]])),0)
		a <- ifelse(xmlSize(quote.table.item) >= 7,as.number(xmlValue(quote.table.item[[7]])),0)
		return(data.frame(date=d,open=o,high=h,low=l,close=c,volume=v, adj_close=a))
	}
	return(quoteTsData(x,function.stock,since,start.num,date.end,time.interval,type="stock"))
}
quoteFundTsData <- function(x, since=NULL,start.num=0,date.end=NULL,time.interval='daily')
{
	time.interval <- substr(time.interval,1,1)
	function.fund <- function(quote.table.item){
		d <- convertToDate(xmlValue(quote.table.item[[1]]),time.interval)
		if(time.interval=='monthly'){
			d <- endOfMonth(d)
		}
		c <- as.number(xmlValue(quote.table.item[[2]]))
		v <- as.number(xmlValue(quote.table.item[[3]]))
		return(data.frame(date=d,constant.value=c,NAV=v))
	}
	return(quoteTsData(x,function.fund,since,start.num,date.end,time.interval,type="fund"))
}
quoteFXTsData <- function(x, since=NULL,start.num=0,date.end=NULL,time.interval='daily')
{
	time.interval <- substr(time.interval,1,1)
	function.fx <- function(quote.table.item){
		d <- convertToDate(xmlValue(quote.table.item[[1]]),time.interval)
		o <- as.number(xmlValue(quote.table.item[[2]]))
		h <- as.number(xmlValue(quote.table.item[[3]]))
		l <- as.number(xmlValue(quote.table.item[[4]]))
		c <- as.number(xmlValue(quote.table.item[[5]]))
		return(data.frame(date=d,open=o,high=h,low=l,close=c))
	}
	return(quoteTsData(x,function.fx,since,start.num,date.end,time.interval,type="fx"))
}
######	private functions	#####
#get time series data from Yahoo! Finance.
quoteTsData <- function(x,function.financialproduct,since,start.num,date.end,time.interval,type="stock"){
	r <- NULL
	result.num <- 51
	financial.data <- data.frame(NULL)
	#start <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&c=\\1&a=\\2&b=\\3",since))
	#end	 <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&f=\\1&d=\\2&e=\\3",date.end))
	start <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&sy=\\1&sm=\\2&sd=\\3",since))
	end	 <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&ey=\\1&em=\\2&ed=\\3",date.end))

	if(!any(time.interval==c('d','w','m'))) stop("Invalid time.interval value")
	
	extractQuoteTable <- function(r,type){
		if(type %in% c("fund","fx")){
			tbl <- r[[2]][[2]][[7]][[3]][[3]][[9]][[2]]
		}
		else{
			tbl <- r[[2]][[2]][[7]][[3]][[3]][[10]][[2]]
		}
		return(tbl)
	}
	
	while( result.num >= 51 ){
		start.num <- start.num + 1
		quote.table <- NULL
		quote.url <- paste('http://info.finance.yahoo.co.jp/history/?code=',x,start,end,'&p=',start.num,'&tm=',substr(time.interval,1,1),sep="")
	
		try( r <- xmlRoot(htmlTreeParse(quote.url,error=xmlErrorCumulator(immediate=F))), TRUE)
		if( is.null(r) ) stop(paste("Can not access :", quote.url))

		#try( quote.table <- r[[2]][[1]][[1]][[16]][[1]][[1]][[1]][[4]][[1]][[1]][[1]], TRUE )
		#try( quote.table <- extractQuoteTable(r,type), TRUE )
		try( quote.table <- xpathApply(r,"//table")[[2]], TRUE )
		
		if( is.null(quote.table) ){
			if( is.null(financial.data) ){
				stop(paste("Can not quote :", x))
			}else{
				 financial.data <- financial.data[order(financial.data$date),]
				 return(financial.data)
			}
		}

		size <- xmlSize(quote.table)
		for(i in 2:size){
			financial.data <- rbind(financial.data,function.financialproduct(quote.table[[i]]))
		}
		
		result.num <- xmlSize(quote.table)
		Sys.sleep(1)
	}
	financial.data <- financial.data[order(financial.data$date),]
	return(financial.data)	
}
#convert string formart date to POSIXct object
convertToDate <- function(date.string,time.interval)
{
	#data format is different between monthly and dialy or weekly
	if(any(time.interval==c('d','w'))){
		result <- gsub("^([0-9]{4})([^0-9]+)([0-9]{1,2})([^0-9]+)([0-9]{1,2})([^0-9]+)","\\1-\\3-\\5",date.string)
	}else if(time.interval=='m'){
		result <- gsub("^([0-9]{4})([^0-9]+)([0-9]{1,2})([^0-9]+)","\\1-\\3-01",date.string)
	}
	return(as.POSIXct(result))
}
#convert string to number.
as.number <- function(string)
{
	return(as.double(as.character(gsub("[^0-9.]", "",string))))
}
#return end of month date.
endOfMonth <- function(date.obj)
{
	startOfMonth		 <- as.Date(format(date.obj,"%Y%m01"),"%Y%m%d")
	startOfNextMonth <- as.Date(format(startOfMonth+31,"%Y%m01"),"%Y%m%d")
	return(startOfNextMonth-1)
}


library(RSQLite)

# fromDate: yyyy-MM-dd
getStockHistoricalData<-function(sqliteFile,codes,fromDate){
	drv<-dbDriver("SQLite")
	con<-dbConnect(drv,dbname=sqliteFile)
	try(rs<-dbSendQuery(con,"drop table historicalData"))
	rs<-dbSendQuery(con,"create table historicalData (date text,open real,high real,
		low real,close real,volume real,adj_close real,code text)")
	rs<-dbSendQuery(con,"create unique index idx_historicalData on historicalData(code,date)")

	#stockMaster<-quoteStockMasterTsData()

	for(code in codes){
		print(paste("code=",code,sep=""))
		tryCatch({
			data<-quoteStockTsData(code,fromDate)
			data<-transform(data,code=code)
			data$code<-as.character(data$code)
			data$date<-as.character(data$date)
			dbBeginTransaction(con)
			dbSendQuery(con,paste("delete from historicalData where code='",code,"'",sep=""))
			dbSendPreparedQuery(con,"insert into historicalData
				(date,open,high,low,close,volume,adj_close,code)
				values(:date, :open, :high, :low, :close, :volume, :adj_close, :code)",bind.data=data)
			dbCommit(con)
		},
		error = function(e){
			message(paste("ERROR:",code))
			message(e)
		
		})
	}
}

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

quoteStockMasterTsData <- function(){

financial.data <- data.frame(NULL)

function.stockMasterData<-function(hira){

r <- NULL

result.num <- 20

master.data <- data.frame(NULL)

start.num<-0

while( result.num >= 20 ){

start.num <- start.num + 1

quote.table <- NULL

quote.url <- paste('http://stocks.finance.yahoo.co.jp/stocks/qi/?js=',hira,'&p=',start.num,sep="")

try( r <- xmlRoot(htmlTreeParse(quote.url,error=xmlErrorCumulator(immediate=F))), TRUE)

if( is.null(r) ) stop(paste("Can not access :", quote.url))

try( quote.table <- xpathApply(r,"//a[contains(@href,'/stocks/detail')]"), TRUE )

if( is.null(quote.table) ){

if( is.null(master.data) ){

stop(paste("Can not quote :", x))

}else{

return(master.data)

}

size <- xmlSize(quote.table)/3

if(size==0){

return(master.data)

}

for(i in 1:size){

mtmp<-data.frame(code=xmlValue(quote.table[[i*3-2]]),name=xmlValue(quote.table[[i*3-1]]))

mtmp$code<-as.character(mtmp$code)

mtmp$name<-as.character(mtmp$name)

master.data <- rbind(master.data,mtmp)

}

result.num <- xmlSize(quote.table)/3

Sys.sleep(1)

}

return(master.data)

}

hiraList<-"あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよわ"

for(i in 1:nchar(hiraList)){

hira<-substring(hiraList,i,i)

master.data<-function.stockMasterData(hira)

financial.data<-rbind(financial.data,master.data)

}

financial.data <- financial.data[order(financial.data$code),]

return(financial.data)

}

#API

quoteStockTsData <- function(x, since=NULL,start.num=0,date.end=NULL,time.interval='daily')

{

time.interval <- substr(time.interval,1,1)

function.stock <- function(quote.table.item){

if( xmlSize(quote.table.item) < 5) return(NULL)

d <- convertToDate(xmlValue(quote.table.item[[1]]),time.interval)

o <- as.number(xmlValue(quote.table.item[[2]]))

h <- as.number(xmlValue(quote.table.item[[3]]))

l <- as.number(xmlValue(quote.table.item[[4]]))

c <- as.number(xmlValue(quote.table.item[[5]]))

v <- ifelse(xmlSize(quote.table.item) >= 6,as.number(xmlValue(quote.table.item[[6]])),0)

a <- ifelse(xmlSize(quote.table.item) >= 7,as.number(xmlValue(quote.table.item[[7]])),0)

return(data.frame(date=d,open=o,high=h,low=l,close=c,volume=v, adj_close=a))

}

return(quoteTsData(x,function.stock,since,start.num,date.end,time.interval,type="stock"))

}

quoteFundTsData <- function(x, since=NULL,start.num=0,date.end=NULL,time.interval='daily')

{

time.interval <- substr(time.interval,1,1)

function.fund <- function(quote.table.item){

d <- convertToDate(xmlValue(quote.table.item[[1]]),time.interval)

if(time.interval=='monthly'){

d <- endOfMonth(d)

}

c <- as.number(xmlValue(quote.table.item[[2]]))

v <- as.number(xmlValue(quote.table.item[[3]]))

return(data.frame(date=d,constant.value=c,NAV=v))

}

return(quoteTsData(x,function.fund,since,start.num,date.end,time.interval,type="fund"))

}

quoteFXTsData <- function(x, since=NULL,start.num=0,date.end=NULL,time.interval='daily')

{

time.interval <- substr(time.interval,1,1)

function.fx <- function(quote.table.item){

d <- convertToDate(xmlValue(quote.table.item[[1]]),time.interval)

o <- as.number(xmlValue(quote.table.item[[2]]))

h <- as.number(xmlValue(quote.table.item[[3]]))

l <- as.number(xmlValue(quote.table.item[[4]]))

c <- as.number(xmlValue(quote.table.item[[5]]))

return(data.frame(date=d,open=o,high=h,low=l,close=c))

}

return(quoteTsData(x,function.fx,since,start.num,date.end,time.interval,type="fx"))

}

###### private functions #####

#get time series data from Yahoo! Finance.

quoteTsData <- function(x,function.financialproduct,since,start.num,date.end,time.interval,type="stock"){

r <- NULL

result.num <- 51

financial.data <- data.frame(NULL)

#start <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&c=\\1&a=\\2&b=\\3",since))

#end <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&f=\\1&d=\\2&e=\\3",date.end))

start <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&sy=\\1&sm=\\2&sd=\\3",since))

end <- (gsub("([0-9]{4,4})-([0-9]{2,2})-([0-9]{2,2})","&ey=\\1&em=\\2&ed=\\3",date.end))

if(!any(time.interval==c('d','w','m'))) stop("Invalid time.interval value")

extractQuoteTable <- function(r,type){

if(type %in% c("fund","fx")){

tbl <- r[[2]][[2]][[7]][[3]][[3]][[9]][[2]]

}

else{

tbl <- r[[2]][[2]][[7]][[3]][[3]][[10]][[2]]

}

return(tbl)

}

while( result.num >= 51 ){

start.num <- start.num + 1

quote.table <- NULL

quote.url <- paste('http://info.finance.yahoo.co.jp/history/?code=',x,start,end,'&p=',start.num,'&tm=',substr(time.interval,1,1),sep="")

try( r <- xmlRoot(htmlTreeParse(quote.url,error=xmlErrorCumulator(immediate=F))), TRUE)

if( is.null(r) ) stop(paste("Can not access :", quote.url))

#try( quote.table <- r[[2]][[1]][[1]][[16]][[1]][[1]][[1]][[4]][[1]][[1]][[1]], TRUE )

#try( quote.table <- extractQuoteTable(r,type), TRUE )

try( quote.table <- xpathApply(r,"//table")[[2]], TRUE )

if( is.null(quote.table) ){

if( is.null(financial.data) ){

stop(paste("Can not quote :", x))

}else{

financial.data <- financial.data[order(financial.data$date),]

return(financial.data)

}

size <- xmlSize(quote.table)

for(i in 2:size){

financial.data <- rbind(financial.data,function.financialproduct(quote.table[[i]]))

}

result.num <- xmlSize(quote.table)

Sys.sleep(1)

}

financial.data <- financial.data[order(financial.data$date),]

return(financial.data)

}

#convert string formart date to POSIXct object

convertToDate <- function(date.string,time.interval)

{

#data format is different between monthly and dialy or weekly

if(any(time.interval==c('d','w'))){

result <- gsub("^([0-9]{4})([^0-9]+)([0-9]{1,2})([^0-9]+)([0-9]{1,2})([^0-9]+)","\\1-\\3-\\5",date.string)

}else if(time.interval=='m'){

result <- gsub("^([0-9]{4})([^0-9]+)([0-9]{1,2})([^0-9]+)","\\1-\\3-01",date.string)

}

return(as.POSIXct(result))

}

#convert string to number.

as.number <- function(string)

{

return(as.double(as.character(gsub("[^0-9.]", "",string))))

}

#return end of month date.

endOfMonth <- function(date.obj)

{

startOfMonth <- as.Date(format(date.obj,"%Y%m01"),"%Y%m%d")

startOfNextMonth <- as.Date(format(startOfMonth+31,"%Y%m01"),"%Y%m%d")

return(startOfNextMonth-1)

}

library(RSQLite)

# fromDate: yyyy-MM-dd

getStockHistoricalData<-function(sqliteFile,codes,fromDate){

drv<-dbDriver("SQLite")

con<-dbConnect(drv,dbname=sqliteFile)

try(rs<-dbSendQuery(con,"drop table historicalData"))

rs<-dbSendQuery(con,"create table historicalData (date text,open real,high real,

low real,close real,volume real,adj_close real,code text)")

rs<-dbSendQuery(con,"create unique index idx_historicalData on historicalData(code,date)")

#stockMaster<-quoteStockMasterTsData()

for(code in codes){

print(paste("code=",code,sep=""))

tryCatch({

data<-quoteStockTsData(code,fromDate)

data<-transform(data,code=code)

data$code<-as.character(data$code)

data$date<-as.character(data$date)

dbBeginTransaction(con)

dbSendQuery(con,paste("delete from historicalData where code='",code,"'",sep=""))

dbSendPreparedQuery(con,"insert into historicalData

(date,open,high,low,close,volume,adj_close,code)

values(:date, :open, :high, :low, :close, :volume, :adj_close, :code)",bind.data=data)

dbCommit(con)

error = function(e){

message(paste("ERROR:",code))

message(e)

})

}

getall.sh

#!/bin/sh

getStockData(){
fromNo=$1
toNo=$2
fileNo=$3

/usr/bin/R --vanilla << Eof
library(RFinanceYJ)
source("/Users/utsuboka/Documents/prog/RFinanceYJPatch.R")
stockMaster<-quoteStockMasterTsData()

getStockHistoricalData("stock$fileNo.db",stockMaster\$code[$fromNo:$toNo],"2000-01-01") 
Eof
}

getStockData 1 500 1 &
getStockData 501 1000 2 & 
getStockData 1001 1500 3 &
getStockData 1501 2000 4 & 
getStockData 2001 2500 5 &
getStockData 2501 3000 6 & 
getStockData 3001 3500 7 &
getStockData 3501 4000 8 &

#!/bin/sh

getStockData(){

fromNo=$1

toNo=$2

fileNo=$3

/usr/bin/R --vanilla << Eof

library(RFinanceYJ)

source("/Users/utsuboka/Documents/prog/RFinanceYJPatch.R")

stockMaster<-quoteStockMasterTsData()

getStockHistoricalData("stock$fileNo.db",stockMaster\$code[$fromNo:$toNo],"2000-01-01")

Eof

}

getStockData 1 500 1 &

getStockData 501 1000 2 &

getStockData 1001 1500 3 &

getStockData 1501 2000 4 &

getStockData 2001 2500 5 &

getStockData 2501 3000 6 &

getStockData 3001 3500 7 &

getStockData 3501 4000 8 &

現在４０００銘柄弱ですのでこれでOKだと思います。ただこれでもまだまだ時間がかかりますが全部取得できるかと思います