python – ページ 3

Tensorflowのエラー

投稿者: utsubo 投稿日: 2017-04-05 in python

Tensorflowをバージョンアップすると、昔動いていたスクリプトが動かなくなったりします

    cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
AttributeError: 'module' object has no attribute 'rnn_cell'

1 2	cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0) AttributeError: 'module' object has no attribute 'rnn_cell'

こんなエラーとか。

こちらを参照し修正します。

これを

tf.nn.rnn_cell.BasicLSTMCell

1	tf.nn.rnn_cell.BasicLSTMCell

これに変更

tf.contrib.rnn.BasicLSTMCell

1	tf.contrib.rnn.BasicLSTMCell

SQLiteをPythonからインメモリで使う

投稿者: utsubo 投稿日: 2017-02-23 in python

SQLite、手軽で便利なデータベースですが激しく使っていると速度が気になる時もあります。

SQLiteはインメモリデータベースもサポートしているので、既存のSQLiteのデータベースからインメモリ化して読み取り専用にすると速くなります。

データベース準備

適当に大きなデータベースを用意します。


#!/bin/env python
# coding:utf-8

import sqlite3

# Build test.db: a `test` table holding (key, key+1) for key = 1..ROW_COUNT.
ROW_COUNT = 10000000

con = sqlite3.connect("test.db")
try:
    con.execute("CREATE TABLE test( key integer, val integer , primary key(key))")
    con.commit()

    # A single executemany() fed by a generator replaces ten million separate
    # cursor()/execute() calls; the (key, val) tuples are produced lazily.
    con.executemany(
        "insert into test values(?,?)",
        ((key, key + 1) for key in range(1, ROW_COUNT + 1))
    )
    con.commit()
finally:
    # Close the database file even if table creation or an insert fails.
    con.close()

#!/bin/env python

# coding:utf-8

import sqlite3

con=sqlite3.connect("test.db")

con.cursor().execute("CREATE TABLE test( key integer, val integer , primary key(key))")

con.commit()

for key in range(1,10000000+1):

con.cursor().execute("insert into test values(?,?)",(key,key+1,))

con.commit()

1000万レコードのデータベースを作成しました。


$ sqlite3 test.db 
SQLite version 3.7.6.3
Enter ".help" for instructions
Enter SQL statements terminated with a ";"
sqlite> select count(*) from test;
10000000

$ sqlite3 test.db

SQLite version 3.7.6.3

Enter ".help" for instructions

Enter SQL statements terminated with a ";"

sqlite> select count(*) from test;

10000000

プログラム

こちらを参考にプログラムを作成します。


#!/bin/env python
# coding:utf-8

import sqlite3
from StringIO import StringIO
import time
import random

# Benchmark: random single-key SELECTs against test.db versus an in-memory copy.
ROW_COUNT = 10000000  # keys stored in test.db run from 1 to ROW_COUNT
N = 10000             # number of random SELECTs per measurement


def current_milli_times():
    """Return the current wall-clock time in whole milliseconds."""
    return int(round(time.time() * 1000))


def time_selects(connection, count):
    """Run `count` random single-key SELECTs on `connection`; return elapsed milliseconds."""
    started = current_milli_times()
    for _ in range(count):
        # Draw from exactly the stored keys 1..ROW_COUNT; randrange(ROW_COUNT)
        # would include the absent key 0 and never produce ROW_COUNT.
        key = random.randrange(1, ROW_COUNT + 1)
        connection.cursor().execute("select * from test where key=?", (key,))
    return current_milli_times() - started


# Copy the on-disk database into memory by replaying its SQL dump.
con = sqlite3.connect("test.db")
dump = StringIO()
for line in con.iterdump():
    dump.write("%s\n" % line)
dump.seek(0)
mcon = sqlite3.connect(":memory:")
mcon.cursor().executescript(dump.read())
mcon.commit()
mcon.row_factory = sqlite3.Row

print("ready..")
# normal (file-backed) connection
print("normal:" + str(time_selects(con, N)))
# in-memory connection
print("inmemory:" + str(time_selects(mcon, N)))

con.close()
mcon.close()

#!/bin/env python

# coding:utf-8

import sqlite3

from StringIO import StringIO

import time

import random

# in memory化

con=sqlite3.connect("test.db")

tempfile=StringIO()

for line in con.iterdump():

tempfile.write("%s\n" % line)

tempfile.seek(0)

mcon=sqlite3.connect(":memory:")

mcon.cursor().executescript(tempfile.read())

mcon.commit()

mcon.row_factory=sqlite3.Row

current_milli_times = lambda: int(round(time.time() * 1000))

print "ready.."

# normal

# select 10000 times

N=10000

nstart=current_milli_times()

for i in range(N):

key=random.randrange(10000000)

res=con.cursor().execute("select * from test where key=?",(key,))

nend=current_milli_times()

print "normal:"+str(nend-nstart)

# inmemory

mstart=current_milli_times()

for i in range(N):

key=random.randrange(10000000)

res=mcon.cursor().execute("select * from test where key=?",(key,))

mend=current_milli_times()

print "inmemory:"+str(mend-mstart)

conのコネクションが通常のデータベースアクセス、mconがデータベースファイルをインメモリ化したものになります。10000回ランダムにSELECTしてみます。

結果

ready..
normal:414
inmemory:171

ready..

normal:414

inmemory:171

大学のスパコンで計算したのですが２倍以上の差が出ました。

bottleでMVC

投稿者: utsubo 投稿日: 2016-08-22 in python

pythonでデータ解析をしていると、その結果をヴィジュアル的に見せたくなってくる時があります。
PythonのWebフレームワークは様々ありますが、最もシンプルなbottleで作成するのが一番簡単です。

こちらのサイトで、BottleをMVCフレームワーク的に作成されているサンプルがありましたのでちょっといじってみました。その際、ちょっとハマったのでメモです。

環境

OS: ubuntu14.04
python: 2.7.6
MySQL version: 5.1.63
MySQL encode: shift_jis
nginx 1.4.6

設定

nginx

http://server/pythonでアクセスできるようにnginxの設定ファイルを修正します

/etc/nginx/sites-available

server {
..

        # Strip the /python prefix and forward the request to the app on localhost:8081.
        location /python {
                rewrite ^/python/(.*)$ /$1 break;
                proxy_pass http://localhost:8081;
                # Map redirects issued for the backend address back under /python/ on the public host.
                proxy_redirect http://localhost:8081/ $scheme://$host/python/;
                proxy_http_version 1.1;
                proxy_set_header Upgrade $http_upgrade;
                # NOTE(review): $connection_upgrade is not defined in this snippet; presumably a
                # `map $http_upgrade $connection_upgrade` exists elsewhere in the config. Confirm.
                proxy_set_header Connection $connection_upgrade;
                proxy_read_timeout 20d;
        }
..

server {

location /python {

rewrite ^/python/(.*)$ /$1 break;

proxy_pass http://localhost:8081;

proxy_redirect http://localhost:8081/ $scheme://$host/python/;

proxy_http_version 1.1;

proxy_set_header Upgrade $http_upgrade;

proxy_set_header Connection $connection_upgrade;

proxy_read_timeout 20d;

}

スクリプト

起動スクリプトを作成します

start.sh

gunicorn -b 127.0.0.1:8081 -c gunicorn.conf.py -w 1 index:app -D --reload

1	gunicorn -b 127.0.0.1:8081 -c gunicorn.conf.py -w 1 index:app -D --reload

gunicorn.conf.py

# Gunicorn configuration module.

# Process name; also substituted into the socket path below.
proc_name = "gunicorn"

# Evaluates to 'unix:/tmp/gunicorn.sock'.
# NOTE(review): start.sh also passes `-b 127.0.0.1:8081` (the port nginx proxies to);
# presumably the command-line bind wins over this value. Confirm.
bind = 'unix:/tmp/{0}.sock'.format(proc_name)
backlog = 2048


# Worker settings: a single synchronous worker.
workers = 1
worker_class = 'sync'
worker_connections = 1000
timeout = 30
keepalive = 2


# Debug switches, both off.
debug = False
spew = False

# Process management: daemonise and write the pid to /tmp/gunicorn.pid.
daemon = True
pidfile = "/tmp/gunicorn.pid"
umask = 0
user = None
group = None
tmp_upload_dir = None

# Logging: error and access logs under /var/log/gunicorn, at debug verbosity.
errorlog = '/var/log/gunicorn/error.log'
loglevel = 'debug'
accesslog = '/var/log/gunicorn/access.log'


def post_fork(server, worker):
    """Log the pid of `worker` through the server's logger."""
    spawned_pid = worker.pid
    server.log.info("Worker spawned (pid: %s)", spawned_pid)

def pre_fork(server, worker):
    """Hook placeholder: intentionally does nothing."""
    return None

def pre_exec(server):
    """Log a notice that the forked child is re-executing."""
    message = "Forked child, re-executing."
    server.log.info(message)

def when_ready(server):
    """Log a notice that the server is ready and about to spawn workers."""
    message = "Server is ready. Spawning workers"
    server.log.info(message)

def worker_int(worker):
    """Log that INT/QUIT arrived, then dump every thread's current stack at debug level."""
    worker.log.info("worker received INT or QUIT signal")

    ## collect traceback info
    import threading, sys, traceback

    # Thread ident -> thread name, so each stack can be labelled.
    thread_names = {th.ident: th.name for th in threading.enumerate()}

    report = []
    for thread_id, frame in sys._current_frames().items():
        label = thread_names.get(thread_id, "")
        report.append("\n# Thread: %s(%d)" % (label, thread_id))
        for filename, lineno, name, line in traceback.extract_stack(frame):
            report.append('File: "%s", line %d, in %s' % (filename, lineno, name))
            if line:
                report.append("  %s" % (line.strip()))
    worker.log.debug("\n".join(report))

def worker_abort(worker):
    """Log a notice that the worker received SIGABRT."""
    message = "worker received SIGABRT signal"
    worker.log.info(message)

proc_name = "gunicorn"

bind = 'unix:/tmp/{0}.sock'.format(proc_name)

backlog = 2048

workers = 1

worker_class = 'sync'

worker_connections = 1000

timeout = 30

keepalive = 2

debug = False

spew = False

daemon = True

pidfile = "/tmp/gunicorn.pid"

umask = 0

user = None

group = None

tmp_upload_dir = None

errorlog = '/var/log/gunicorn/error.log'

loglevel = 'debug'

accesslog = '/var/log/gunicorn/access.log'

def post_fork(server, worker):

server.log.info("Worker spawned (pid: %s)", worker.pid)

def pre_fork(server, worker):

pass

def pre_exec(server):

server.log.info("Forked child, re-executing.")

def when_ready(server):

server.log.info("Server is ready. Spawning workers")

def worker_int(worker):

worker.log.info("worker received INT or QUIT signal")

## get traceback info

import threading, sys, traceback

id2name = dict([(th.ident, th.name) for th in threading.enumerate()])

code = []

for threadId, stack in sys._current_frames().items():

code.append("\n# Thread: %s(%d)" % (id2name.get(threadId,""),

threadId))

for filename, lineno, name, line in traceback.extract_stack(stack):

code.append('File: "%s", line %d, in %s' % (filename,

lineno, name))

if line:

code.append(" %s" % (line.strip()))

worker.log.debug("\n".join(code))

def worker_abort(worker):

worker.log.info("worker received SIGABRT signal")

文字化け

上記環境で参考サイトを元に作成すると、DBの文字列を表示する際に文字化けしてしまいます。この対処法は散々悩んだ挙句この修正で行けました

app/models/db.py

# Open the MySQL connection using the values in the [live_db] config section.
dbhandle = MySQLdb.connect(
  host = config.get('live_db', 'host'),
  port = config.getint("live_db","port"), 
  user = config.get('live_db', 'user'),
  passwd = config.get('live_db', 'password'),
  db = config.get('live_db', 'database'),
  charset = "sjis",  # added: the article reports this fixed garbled text from the shift_jis database
  use_unicode=1
)

dbhandle = MySQLdb.connect(

host = config.get('live_db', 'host'),

port = config.getint("live_db","port"),

user = config.get('live_db', 'user'),

passwd = config.get('live_db', 'password'),

db = config.get('live_db', 'database'),

charset = "sjis", # これを追加

use_unicode=1

)

pythonのmatplotlibでcandlestickチャート

投稿者: utsubo 投稿日: 2016-08-01 in python

MySQLからデータを取得し、Pythonのmatplotlibでローソク足を描画します。

環境

OS:MacOS10.11
python:2.7.12
MySQL 5.6

テーブル

MySQLのテーブル形式は以下のとおり。日足でも週足でもなんでも構いません。
データベース名はdbnameとしています

-- Price history: one row per (date, code); the bar size (daily, weekly, ...) is up to the data.
create table price_table(
  date datetime not null,
  code varchar(8) not null,
  open double precision null,
  high double precision null,
  low  double precision null,
  close double precision null,
  volume double precision null
)
;
-- The index must target price_table, the table created above; `price` does not exist.
create unique index idx_price on price_table(date,code)
;

create table price_table(

date datetime not null,

code varchar(8) not null,

open double precision null,

high double precision null,

low double precision null,

close double precision null,

volume double precision null

)

;

create unique index idx_price on price(date,code)

;

休日考慮

シンプルにローソク足だけ表示します
休日がある場合には間を空けます

#!/bin/env python
# coding:utf-8

import matplotlib.pyplot as plt
from matplotlib.finance import candlestick_ohlc
import time
import MySQLdb

# Fetch the OHLCV rows for one ticker from MySQL, starting at a given date.
connection = MySQLdb.connect(host="localhost",db="dbname",user="root",passwd="")
cursor = connection.cursor()

code = "6758"
date='2016-04-01'
query = "select date,open,high,low,close,volume from price_table where code=%s and date>=%s"
cursor.execute(query,[code,date])
result = cursor.fetchall()

# x positions are Unix timestamps, so days without rows stay visible as gaps.
ddate = [row[0] for row in result]                       # datetime
fdate = [time.mktime(day.timetuple()) for day in ddate]  # float (unix time)
ohlc = [(stamp,) + tuple(row[1:6]) for stamp, row in zip(fdate, result)]
cursor.close()
connection.close()


# Tie the float dates on the graph to their display strings (every 5th row).
tick_positions = fdate[::5]
tick_labels = [day.strftime('%Y-%m-%d') for day in ddate[::5]]
plt.xticks(tick_positions, tick_labels)


ax = plt.subplot()

candlestick_ohlc(ax,ohlc)

plt.xlabel('Date')
plt.ylabel('Price')
plt.title("title")
plt.legend()
plt.show()

#!/bin/env python

# coding:utf-8

import matplotlib.pyplot as plt

from matplotlib.finance import candlestick_ohlc

import time

import MySQLdb

connection = MySQLdb.connect(host="localhost",db="dbname",user="root",passwd="")

cursor = connection.cursor()

code = "6758"

date='2016-04-01'

cursor.execute("select date,open,high,low,close,volume from price_table where code=%s and date>=%s",[code,date])

result = cursor.fetchall()

ohlc=[]

fdate=[] # float

ddate=[] # datetime

for row in result:

tmp=time.mktime(row[0].timetuple())

ohlc.append((tmp,row[1],row[2],row[3],row[4],row[5])) # unix time

ddate.append(row[0])

fdate.append(tmp)

cursor.close()

connection.close()

# graph上のfloat型の日付と、表示文字列を紐付けている

plt.xticks(

fdate[::5],

[x.strftime('%Y-%m-%d') for x in ddate][::5]

)

ax = plt.subplot()

candlestick_ohlc(ax,ohlc)

plt.xlabel('Date')

plt.ylabel('Price')

plt.title("title")

plt.legend()

plt.show()

休日考慮しない

シンプルにローソク足だけ表示します
休日を無視して詰めて描画します。テクニカルを重ね合わせる場合はこちらのほうが都合がいいです

#!/bin/env python
# coding:utf-8

import matplotlib.pyplot as plt
from matplotlib.finance import candlestick_ohlc
import time
import MySQLdb

# Fetch the OHLCV rows for one ticker from MySQL, starting at a given date.
connection = MySQLdb.connect(host="localhost",db="dbname",user="root",passwd="")
cursor = connection.cursor()

code = "6758"
date='2016-04-01'
cursor.execute("select date,open,high,low,close,volume from price_table where code=%s and date>=%s",[code,date])
result = cursor.fetchall()

ohlc=[]
fdate=[]  # x position of each candle: 1, 2, 3, ...
ddate=[]  # datetime of each candle, used only for the tick labels
# Rows are numbered consecutively instead of being placed at their timestamp,
# so days without data (holidays) leave no gap on the x axis.
for adr, row in enumerate(result, 1):
  ohlc.append((adr,row[1],row[2],row[3],row[4],row[5]))
  ddate.append(row[0])
  fdate.append(adr)
cursor.close()
connection.close()


# Tie the numeric x positions on the graph to their date strings (every 5th candle).
plt.xticks(
    fdate[::5],
    [x.strftime('%Y-%m-%d') for x in ddate][::5]
)


ax = plt.subplot()

candlestick_ohlc(ax,ohlc)

plt.xlabel('Date')
plt.ylabel('Price')
plt.title("title")
plt.legend()
plt.show()

#!/bin/env python

# coding:utf-8

import matplotlib.pyplot as plt

from matplotlib.finance import candlestick_ohlc

import time

import MySQLdb

connection = MySQLdb.connect(host="localhost",db="dbname",user="root",passwd="")

cursor = connection.cursor()

code = "6758"

date='2016-04-01'

cursor.execute("select date,open,high,low,close,volume from price_table where code=%s and date>=%s",[code,date])

result = cursor.fetchall()

ohlc=[]

fdate=[] # float

ddate=[] # datetime

adr=1

for row in result:

tmp=adr

ohlc.append((adr,row[1],row[2],row[3],row[4],row[5]))

ddate.append(row[0])

fdate.append(adr)

adr=adr+1

cursor.close()

connection.close()

# graph上のfloat型の日付と、表示文字列を紐付けている

plt.xticks(

fdate[::5],

[x.strftime('%Y-%m-%d') for x in ddate][::5]

)

ax = plt.subplot()

candlestick_ohlc(ax,ohlc)

plt.xlabel('Date')

plt.ylabel('Price')

plt.title("title")

plt.legend()

plt.show()

これでとりあえずチャートが表示されます

Word Cloudを使って見る

投稿者: utsubo 投稿日: 2016-07-27 in python

WordCloudなるライブラリがあるので使ってみました。
自分の環境ではそのままではちょっとうまく動かなかったのでメモです。

こちらを参考にしました。

環境

– MacOS10.11
– python 2.7.12
– mecab 0.996

インストール

brew install python
brew install mecab
brew install mecab-ipadic


git clone https://github.com/amueller/word_cloud
cd word_cloud 
pip install -r requirements.txt
python setup.py install
pip install beautifulsoup4
pip install requests

brew install python

brew install mecab

brew install mecab-ipadic

git clone https://github.com/amueller/word_cloud

cd word_cloud

pip install -r requirements.txt

python setup.py install

pip install beautifulsoup4

pip install requests

エラー

こちらのサンプルをそのまま実行するとエラーが出ます

/usr/local/lib/python2.7/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("html.parser"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.

The code that caused this warning is on line 53 of the file word_cloud.py. To get rid of this warning, change code that looks like this:

 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html.parser")

  markup_type=markup_type))
Traceback (most recent call last):
  File "word_cloud.py", line 53, in <module>
    wordlist = get_wordlist_from_QiitaURL(url)
  File "word_cloud.py", line 30, in get_wordlist_from_QiitaURL
    return mecab_analysis(text)
  File "word_cloud.py", line 10, in mecab_analysis
    t = mc.Tagger('-Ochasen -d /usr/local/Cellar/mecab/0.996/lib/mecab/dic/mecab-ipadic-neologd/')
  File "/usr/local/lib/python2.7/site-packages/MeCab.py", line 307, in __init__
    this = _MeCab.new_Tagger(*args)
RuntimeError

/usr/local/lib/python2.7/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("html.parser"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.

The code that caused this warning is on line 53 of the file word_cloud.py. To get rid of this warning, change code that looks like this:

BeautifulSoup([your markup])

to this:

BeautifulSoup([your markup], "html.parser")

markup_type=markup_type))

Traceback (most recent call last):

File "word_cloud.py", line 53, in <module>

wordlist = get_wordlist_from_QiitaURL(url)

File "word_cloud.py", line 30, in get_wordlist_from_QiitaURL

return mecab_analysis(text)

File "word_cloud.py", line 10, in mecab_analysis

t = mc.Tagger('-Ochasen -d /usr/local/Cellar/mecab/0.996/lib/mecab/dic/mecab-ipadic-neologd/')

File "/usr/local/lib/python2.7/site-packages/MeCab.py", line 307, in __init__

this = _MeCab.new_Tagger(*args)

RuntimeError

HTMLパーサーを明示的に入れます

    soup = BeautifulSoup(res.text,"html.parser")

1	soup = BeautifulSoup(res.text,"html.parser")

そのまま実行するとまたまたエラー

Traceback (most recent call last):
  File "word_cloud.py", line 59, in <module>
    create_wordcloud(" ".join(wordlist).decode('utf-8'))
  File "word_cloud.py", line 50, in create_wordcloud
    stopwords=set(stop_words)).generate(text)
  File "/usr/local/lib/python2.7/site-packages/wordcloud-1.2.1-py2.7-macosx-10.11-x86_64.egg/wordcloud/wordcloud.py", line 463, in generate
    return self.generate_from_text(text)
  File "/usr/local/lib/python2.7/site-packages/wordcloud-1.2.1-py2.7-macosx-10.11-x86_64.egg/wordcloud/wordcloud.py", line 448, in generate_from_text
    words = self.process_text(text)
  File "/usr/local/lib/python2.7/site-packages/wordcloud-1.2.1-py2.7-macosx-10.11-x86_64.egg/wordcloud/wordcloud.py", line 391, in process_text
    self.stopwords_lower_ = set(map(str.lower, self.stopwords))
TypeError: descriptor 'lower' requires a 'str' object but received a 'unicode'

Traceback (most recent call last):

File "word_cloud.py", line 59, in <module>

create_wordcloud(" ".join(wordlist).decode('utf-8'))

File "word_cloud.py", line 50, in create_wordcloud

stopwords=set(stop_words)).generate(text)

File "/usr/local/lib/python2.7/site-packages/wordcloud-1.2.1-py2.7-macosx-10.11-x86_64.egg/wordcloud/wordcloud.py", line 463, in generate

return self.generate_from_text(text)

File "/usr/local/lib/python2.7/site-packages/wordcloud-1.2.1-py2.7-macosx-10.11-x86_64.egg/wordcloud/wordcloud.py", line 448, in generate_from_text

words = self.process_text(text)

File "/usr/local/lib/python2.7/site-packages/wordcloud-1.2.1-py2.7-macosx-10.11-x86_64.egg/wordcloud/wordcloud.py", line 391, in process_text

self.stopwords_lower_ = set(map(str.lower, self.stopwords))

TypeError: descriptor 'lower' requires a 'str' object but received a 'unicode'

どうもUnicodeがらみのエラーです。stop_wordsのUnicode変換がうまくいっていないようなので普通の文字列にします

コード修正

自分の環境に合わせていじります

#!/bin/env python
# coding:utf-8
#%matplotlib inline
import urllib2
from bs4 import BeautifulSoup

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from bs4 import BeautifulSoup
import requests
import MeCab as mc



def mecab_analysis(text):
    """Return the surface forms of the adjectives, verbs, nouns and adverbs MeCab finds in `text`.

    `text` is encoded to UTF-8 before parsing; the returned list holds the
    `surface` values of the matching nodes, in order.
    """
    tagger = mc.Tagger('-Ochasen -d /usr/local/Cellar/mecab/0.996/lib/mecab/dic/ipadic/')
    # Kept in a named variable for the whole walk.
    # NOTE(review): presumably the nodes refer back to this buffer; confirm before inlining.
    enc_text = text.encode('utf-8')
    wanted_types = ["形容詞", "動詞","名詞", "副詞"]
    words = []
    node = tagger.parseToNode(enc_text)
    while node:
        # Nodes with an empty surface (header and footer) are skipped.
        if node.surface != "" and node.feature.split(",")[0] in wanted_types:
            words.append(node.surface)
        node = node.next
    return words


def get_wordlist_from_QiitaURL(url):
    """Download `url` and return the word list mecab_analysis extracts from its body section text."""
    response = requests.get(url)
    document = BeautifulSoup(response.text,"html.parser")

    # Newlines and tabs are dropped before the text is handed to MeCab.
    section_text = document.body.section.get_text()
    flattened = section_text.replace('\n','').replace('\t','')
    return mecab_analysis(flattened)

def create_wordcloud(text):
    """Render `text` as a word cloud and show it in a matplotlib window.

    `text` is the space-separated word string to visualise.
    """
    # Set the font path to suit the local environment.
    #fpath = "/System/Library/Fonts/HelveticaNeue-UltraLight.otf"
    #fpath = "/Library/Fonts/ヒラギノ角ゴ Pro W3.otf"
    fpath = "/Library/Fonts/Osaka.ttf"

    # Stop words.
    # These are plain str literals rather than u'' literals: wordcloud 1.2.1
    # lower-cases them with str.lower, which raises
    # "TypeError: descriptor 'lower' requires a 'str' object but received a 'unicode'"
    # when given unicode objects.
    # Each literal sits on one physical line; a string split across lines is a SyntaxError.
    stop_words = [
        'てる', 'いる', 'なる', 'れる', 'する', 'ある', 'こと', 'これ',
        'さん', 'して', 'くれる', 'やる', 'くださる', 'そう', 'せる', 'した',
        '思う', 'それ', 'ここ', 'ちゃん', 'くん', '', 'て', 'に', 'を', 'は', 'の',
        'が', 'と', 'た', 'し', 'で', 'ない', 'も', 'な', 'い', 'か', 'ので', 'よう', '',
    ]

    wordcloud = WordCloud(background_color="white", font_path=fpath, width=900, height=500,
                          stopwords=set(stop_words)).generate(text)

    plt.figure(figsize=(15,12))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

# Entry point: fetch the article, extract its words, and draw the cloud.
url = "http://qiita.com/t_saeko/items/2b475b8657c826abc114"
wordlist = get_wordlist_from_QiitaURL(url)
# The joined words are decoded from UTF-8 before being handed to create_wordcloud.
create_wordcloud(" ".join(wordlist).decode('utf-8'))

#!/bin/env python

# coding:utf-8

#%matplotlib inline

import urllib2

from bs4 import BeautifulSoup

import matplotlib.pyplot as plt

from wordcloud import WordCloud

from bs4 import BeautifulSoup

import requests

import MeCab as mc

def mecab_analysis(text):

t = mc.Tagger('-Ochasen -d /usr/local/Cellar/mecab/0.996/lib/mecab/dic/ipadic/')

enc_text = text.encode('utf-8')

node = t.parseToNode(enc_text)

output = []

while(node):

if node.surface != "": # ヘッダとフッタを除外

word_type = node.feature.split(",")[0]

if word_type in ["形容詞", "動詞","名詞", "副詞"]:

output.append(node.surface)

node = node.next

if node is None:

break

return output

def get_wordlist_from_QiitaURL(url):

res = requests.get(url)

soup = BeautifulSoup(res.text,"html.parser")

text = soup.body.section.get_text().replace('\n','').replace('\t','')

return mecab_analysis(text)

def create_wordcloud(text):

# 環境に合わせてフォントのパスを指定する。

#fpath = "/System/Library/Fonts/HelveticaNeue-UltraLight.otf"

#fpath = "/Library/Fonts/ヒラギノ角ゴ Pro W3.otf"

fpath = "/Library/Fonts/Osaka.ttf"

# ストップワードの設定

#stop_words = [ u'てる', u'いる', u'なる', u'れる', u'する', u'ある', u'こと

', u'これ', u'さん', u'して', u'くれる', u'やる', u'くださる', u'そう', u'せる', u'した', u'思う', u'それ', u'ここ', u'ちゃん', u'くん', u'', u'て',u'に',u'を

',u'は',u'の', u'が', u'と', u'た', u'し', u'で', u'ない', u'も', u'な', u'い', u'か', u'ので', u'よう', u'']

stop_words = [ 'てる', 'いる', 'なる', 'れる', 'する', 'ある', 'こと', 'これ

', 'さん', 'して', 'くれる', 'やる', 'くださる', 'そう', 'せる', 'した', '思う', 'それ', 'ここ', 'ちゃん', 'くん', '', 'て','に','を','は','の', 'が', 'と', 'た', 'し', 'で', 'ない', 'も', 'な', 'い', 'か', 'ので', 'よう', '']

wordcloud = WordCloud(background_color="white",font_path=fpath, width=900, height=500, \

stopwords=set(stop_words)).generate(text)

plt.figure(figsize=(15,12))

plt.imshow(wordcloud)

plt.axis("off")

plt.show()

url = "http://qiita.com/t_saeko/items/2b475b8657c826abc114"

wordlist = get_wordlist_from_QiitaURL(url)

create_wordcloud(" ".join(wordlist).decode('utf-8'))

実行

python word_cloud.py

1	python word_cloud.py

これで画像が表示されます

TensorflowでMNIST（５）

投稿者: utsubo 投稿日: 2016-04-01 in ML、python

毎回学習させるのは効率が悪いので、一度学習させそれを保存しておき、判定時には判定のみをさせるように修正します

畳み込みニューラルネットワーク(CNN)のプログラムを修正していきます

学習用

tf_cnn3.py

#!/bin/env python
# -*- coding: utf-8 -*-
# http://qiita.com/ikki8412/items/95bc81a744dc377d9119
import tensorflow as tf
import numpy as np
import random
import time
import math

NUMCLASS=10
NUMPARAM=784
   
### データ処理用
def label_data(lines):
  """Return a NumPy array with one 1-of-k (one-hot) row of length NUMCLASS per entry of `lines`."""
  def one_hot(raw_label):
    # All zeros except a 1 at the index given by the label.
    row = np.zeros(NUMCLASS)
    row[int(raw_label)] = 1
    return row

  return np.asarray([one_hot(raw_label) for raw_label in lines])

def image_data(test):
  """Return columns 1..NUMPARAM of `test` divided by 255.0, as a float64 array.

  `test` is a 2-D NumPy array; column 0 is skipped here (the callers pass it
  to label_data instead).
  """
  # One vectorised division replaces the nested map(): it yields the same
  # float64 values without a Python call per pixel, and does not rely on
  # map() returning a list.
  return np.asarray(test[:,1:NUMPARAM+1], dtype=np.float64) / 255.0


# Start time
start_time = time.time()
print("開始時刻: " + str(start_time))


### Data loading --->
def _load_rows(path):
    """Read `path` and return one list of ints per line (values are space-separated)."""
    # `with` closes the file even if a line fails to parse.
    with open(path, 'r') as f:
        return [[int(n) for n in line.rstrip().split(" ")] for line in f]


# Converted to NumPy arrays; later code reads column 0 as the label and the
# remaining columns as pixel values.
train = np.asarray(_load_rows("train.txt"))
test = np.asarray(_load_rows("t10k.txt"))
### Data loading ---<


# Placeholder for the input images.
# Each image is 28x28 px, flattened into a row vector of 784 values.
# The first dimension is None so that any number of images can be fed.
x = tf.placeholder(tf.float32, [None, NUMPARAM], name="x-input")

# Placeholder for the correct labels (y_), which the cross-entropy loss
# defined further down compares against the network output.
y_ = tf.placeholder(tf.float32, [None, NUMCLASS], name="y-input")


# hidden1: NUMPARAM inputs -> 500 sigmoid units
with tf.name_scope("hidden_layer1") as scope:
  w_h1 = tf.Variable(tf.truncated_normal([NUMPARAM, 500],
                          stddev=1.0 / math.sqrt(float(NUMPARAM))),name='weights')
  b_h1 = tf.Variable(tf.zeros([500]),name='biases')

  h1 = tf.nn.sigmoid(tf.matmul(x, w_h1) + b_h1)
# hidden2: 500 -> 300 sigmoid units
with tf.name_scope("hidden_layer2") as scope:
  w_h2 = tf.Variable(tf.truncated_normal([500, 300],
                          stddev=1.0 / math.sqrt(float(500))),name='weights')
  b_h2 = tf.Variable(tf.zeros([300]),name='biases')

  h2 = tf.nn.sigmoid(tf.matmul(h1, w_h2) + b_h2)
# softmax layer: 300 -> NUMCLASS outputs
with tf.name_scope("softmax_layer") as scope:
  w_o = tf.Variable(tf.truncated_normal([300, NUMCLASS],
                          stddev=1.0 / math.sqrt(float(300))),name='weights')
  b_o = tf.Variable(tf.zeros([NUMCLASS]),name='biases')

  y = tf.nn.softmax((tf.matmul(h2, w_o) + b_o))


# Additional name scopes tidy up the graph visualisation.
with tf.name_scope("xent") as scope:
  # Cross entropy between the labels y_ and the softmax output y, summed over the batch.
  cross_entropy = -tf.reduce_sum(y_*tf.log(y))
  # Register cross_entropy for display in TensorBoard.
  tf.scalar_summary("cross_entropy", cross_entropy)

  # Minimise the cross entropy with gradient descent (learning rate 0.01).
  train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

# Build the op that initialises the Variables defined above.
init = tf.initialize_all_variables()

# Start a Session.
# Nothing executes until run() is called (init does not run without run(init)).
sess = tf.Session()
sess.run(init)
# Set up the values written for TensorBoard.
# NOTE(review): the summaries are merged here, before the "accuracy" summary is
# created further down; presumably that one is not part of summary_op. Confirm.
summary_op = tf.merge_all_summaries()
summary_writer = tf.train.SummaryWriter("/tmp/data", sess.graph_def)


# Run the training step 20000 times.
# Each iteration draws 100 random training rows (image plus label) with random.sample.
# Using the whole training set on every step would take too long, so a random
# batch of 100 is used instead; the author notes this gives comparable results.
# feed_dict supplies the values for the placeholders.
print "--- 訓練開始 ---"
for i in range(20000):
  train_sample=np.asarray(random.sample(train,100))
  batch_ys=label_data(train_sample[:,0])
  batch_xs=image_data(train_sample)
  # NOTE(review): despite its name, train_accuracy holds whatever running
  # train_step returns, and it is never read afterwards.
  train_accuracy=sess.run(train_step, feed_dict={x: batch_xs, y_:batch_ys})

  # After every step, add the values shown in TensorBoard.
  summary_str=sess.run(summary_op, feed_dict={x: batch_xs, y_:batch_ys})
  summary_writer.add_summary(summary_str, i)
print "--- 訓練終了 ---"

# Check whether the predictions are correct.
# Compare the predicted digit y with the correct label y_ for each image;
# the comparison yields True where they agree.
# argmax returns the index of the largest value in the array,
# i.e. the digit with the highest predicted probability.
# True therefore means the model's answer matches the label.
with tf.name_scope("test") as scope:
  correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))

# Accuracy:
# correct_prediction is boolean, so cast it to float and take the mean
# (True becomes 1, False becomes 0).
  accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

  tf.scalar_summary("accuracy", accuracy)

# Evaluate and print the accuracy on the test images and labels.
# The weights and biases have been trained above, so feeding x produces y.
test_label=label_data(test[:,0])
test_image=image_data(test)
# http://d.hatena.ne.jp/sugyan/20151124/1448292129
print "精度"
print(sess.run(accuracy, feed_dict={x: test_image, y_: test_label}))

## Save the six weight/bias Variables to a checkpoint file.
saver = tf.train.Saver([w_h1,b_h1,w_h2,b_h2,w_o,b_o])
saver.save(sess, "/tmp/tf_cnn.ckpt")

# End time and elapsed time
end_time = time.time()
print "終了時刻: " + str(end_time)
print "かかった時間: " + str(end_time - start_time)

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

#!/bin/env python

# -*- coding: utf-8 -*-

# http://qiita.com/ikki8412/items/95bc81a744dc377d9119

import tensorflow as tf

import numpy as np

import random

import time

import math

NUMCLASS=10

NUMPARAM=784

### データ処理用

def label_data(lines):

labels=[]

for line in lines:

# ラベルを1-of-k方式で用意する

tmp = np.zeros(NUMCLASS)

tmp[int(line)] = 1

labels.append(tmp)

return np.asarray(labels)

def image_data(test):

test_image=map(lambda n: map(lambda k: float(k)/255.0,n),test[:,1:NUMPARAM+1])

return np.asarray(test_image)

# 開始時刻

start_time = time.time()

print "開始時刻: " + str(start_time)

### データ取得 --->

# ファイルを開く

f = open("train.txt", 'r')

# データを入れる配列

train = []

for line in f:

# 改行を除いてスペース区切りにする

line = line.rstrip()

l = line.split(" ")

l = map(lambda n: int(n),l)

#l=map(lambda n: 0 if n=="0" else 1,l)

train.append(l)

# numpy形式に変換

train = np.asarray(train)

f.close()

f = open("t10k.txt", 'r')

test = []

for line in f:

line = line.rstrip()

l = line.split(" ")

l = map(lambda n: int(n),l)

#l=map(lambda n: 0 if n=="0" else 1,l)

test.append(l)

test = np.asarray(test)

f.close()

### データ取得 ---<

# 訓練画像を入れる変数

# 訓練画像は28x28pxであり、これらを1行784列のベクトルに並び替え格納する

# Noneとなっているのは訓練画像がいくつでも入れられるようにするため

x = tf.placeholder(tf.float32, [None, NUMPARAM], name="x-input")

# 交差エントロピー

# y_は正解データのラベル

# 損失とオプティマイザを定義します

y_ = tf.placeholder(tf.float32, [None, NUMCLASS], name="y-input")

# hidden1

with tf.name_scope("hidden_layer1") as scope:

w_h1 = tf.Variable(tf.truncated_normal([NUMPARAM, 500],

stddev=1.0 / math.sqrt(float(NUMPARAM))),name='weights')

b_h1 = tf.Variable(tf.zeros([500]),name='biases')

h1 = tf.nn.sigmoid(tf.matmul(x, w_h1) + b_h1)

# hidden2

with tf.name_scope("hidden_layer2") as scope:

w_h2 = tf.Variable(tf.truncated_normal([500, 300],

stddev=1.0 / math.sqrt(float(500))),name='weights')

b_h2 = tf.Variable(tf.zeros([300]),name='biases')

h2 = tf.nn.sigmoid(tf.matmul(h1, w_h2) + b_h2)

# softmax layer

with tf.name_scope("softmax_layer") as scope:

w_o = tf.Variable(tf.truncated_normal([300, NUMCLASS],

stddev=1.0 / math.sqrt(float(300))),name='weights')

b_o = tf.Variable(tf.zeros([NUMCLASS]),name='biases')

y = tf.nn.softmax((tf.matmul(h2, w_o) + b_o))

# 更なる name scopes はグラフ表現をクリーンアップしま

with tf.name_scope("xent") as scope:

cross_entropy = -tf.reduce_sum(y_*tf.log(y))

# TensorBoardで表示するよう指定

tf.scalar_summary("cross_entropy", cross_entropy)

# 勾配硬化法を用い交差エントロピーが最小となるようyを最適化する

train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

# 用意した変数Veriableの初期化を実行する

init = tf.initialize_all_variables()

# Sessionを開始する

# runすることで初めて実行開始される（run(init)しないとinitが実行されない）

sess = tf.Session()

sess.run(init)

# TensorBoardで表示する値の設定

summary_op = tf.merge_all_summaries()

summary_writer = tf.train.SummaryWriter("/tmp/data", sess.graph_def)

# 1000回の訓練（train_step）を実行する

# next_batch(100)で100つのランダムな訓練セット（画像と対応するラベル）を選択する

# 訓練データは60000点あるので全て使いたいところだが費用つまり時間がかかるのでランダムな100つを使う

# 100つでも同じような結果を得ることができる

# feed_dictでplaceholderに値を入力することができる

print "--- 訓練開始 ---"

for i in range(20000):

train_sample=np.asarray(random.sample(train,100))

batch_ys=label_data(train_sample[:,0])

batch_xs=image_data(train_sample)

train_accuracy=sess.run(train_step, feed_dict={x: batch_xs, y_:batch_ys})

# 1 step終わるたびにTensorBoardに表示する値を追加する

summary_str=sess.run(summary_op, feed_dict={x: batch_xs, y_:batch_ys})

summary_writer.add_summary(summary_str, i)

print "--- 訓練終了 ---"

# 正しいかの予測

# 計算された画像がどの数字であるかの予測yと正解ラベルy_を比較する

# 同じ値であればTrueが返される

# argmaxは配列の中で一番値の大きい箇所のindexが返される

# 一番値が大きいindexということは、それがその数字である確率が一番大きいということ

# Trueが返ってくるということは訓練した結果と回答が同じということ

with tf.name_scope("test") as scope:

correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))

# 精度の計算

# correct_predictionはbooleanなのでfloatにキャストし、平均値を計算する

# Trueならば1、Falseならば0に変換される

accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

tf.scalar_summary("accuracy", accuracy)

# 精度の実行と表示

# テストデータの画像とラベルで精度を確認する

# ソフトマックス回帰によってWとbの値が計算されているので、xを入力することでyが計算できる

test_label=label_data(test[:,0])

test_image=image_data(test)

# http://d.hatena.ne.jp/sugyan/20151124/1448292129

print "精度"

print(sess.run(accuracy, feed_dict={x: test_image, y_: test_label}))

## save

saver = tf.train.Saver([w_h1,b_h1,w_h2,b_h2,w_o,b_o])

saver.save(sess, "/tmp/tf_cnn.ckpt")

# 終了時刻

end_time = time.time()

print "終了時刻: " + str(end_time)

print "かかった時間: " + str(end_time - start_time)

判定用

tf_cnn3_exec.py

#!/bin/env python
# -*- coding: utf-8 -*-
# http://qiita.com/ikki8412/items/95bc81a744dc377d9119
import tensorflow as tf
import numpy as np
import random
import time
import math

NUMCLASS=10
NUMPARAM=784
   
### データ処理用
def label_data(lines):
  """Convert class labels into one-hot (1-of-k) rows.

  Args:
    lines: iterable of values accepted by int(); each is used as the
      index of the single 1 in a vector of length NUMCLASS.

  Returns:
    np.ndarray built from one length-NUMCLASS vector per input label.
  """
  def one_hot(value):
    # All zeros except for a 1 at the label's index.
    row = np.zeros(NUMCLASS)
    row[int(value)] = 1
    return row

  return np.asarray([one_hot(value) for value in lines])

def image_data(test):
  """Extract the pixel columns and scale them by 1/255.

  Args:
    test: 2-D array-like; column 0 is skipped and columns 1..NUMPARAM
      are taken as pixel values.

  Returns:
    2-D float np.ndarray holding the selected columns divided by 255.0.
  """
  # One vectorised division replaces the nested map(): the values are the
  # same, but there are no per-pixel Python calls and the result does not
  # depend on Python 2's list-returning map().
  pixels = np.asarray(test)[:, 1:NUMPARAM + 1]
  return pixels.astype(np.float64) / 255.0


# Record the start time so the elapsed time can be reported at the end.
start_time = time.time()
print("開始時刻: " + str(start_time))


### Load data --->
# Each line of t10k.txt is a space-separated row of integers.
test = []
# The with-statement closes the file even if a line fails to parse.
with open("t10k.txt", 'r') as f:
    for line in f:
        # Strip the newline, split on single spaces and convert to int.
        # A list comprehension yields a real list on Python 2 and 3 alike.
        test.append([int(n) for n in line.rstrip().split(" ")])

# Convert the list of rows to a NumPy array.
test = np.asarray(test)
### Load data ---<


# Placeholder for the input images.
# Each image is 28x28 px, flattened into a row vector of NUMPARAM (784) values.
# The first dimension is None so that any number of images can be fed.
x = tf.placeholder(tf.float32, [None, NUMPARAM], name="x-input")

# Placeholder for the ground-truth labels, one row of NUMCLASS values per
# image.
y_ = tf.placeholder(tf.float32, [None, NUMCLASS], name="y-input")


# Hidden layer 1: NUMPARAM -> 500 units, sigmoid activation.
with tf.name_scope("hidden_layer1") as scope:
  # Weights start from a truncated normal with stddev 1/sqrt(input size);
  # biases start at zero.
  w_h1 = tf.Variable(tf.truncated_normal([NUMPARAM, 500],
                          stddev=1.0 / math.sqrt(float(NUMPARAM))),name='weights')
  b_h1 = tf.Variable(tf.zeros([500]),name='biases')

  h1 = tf.nn.sigmoid(tf.matmul(x, w_h1) + b_h1)
# Hidden layer 2: 500 -> 300 units, sigmoid activation.
with tf.name_scope("hidden_layer2") as scope:
  w_h2 = tf.Variable(tf.truncated_normal([500, 300],
                          stddev=1.0 / math.sqrt(float(500))),name='weights')
  b_h2 = tf.Variable(tf.zeros([300]),name='biases')

  h2 = tf.nn.sigmoid(tf.matmul(h1, w_h2) + b_h2)
# Softmax output layer: 300 -> NUMCLASS units.
with tf.name_scope("softmax_layer") as scope:
  w_o = tf.Variable(tf.truncated_normal([300, NUMCLASS],
                          stddev=1.0 / math.sqrt(float(300))),name='weights')
  b_o = tf.Variable(tf.zeros([NUMCLASS]),name='biases')

  # y holds one softmax-normalised row of NUMCLASS values per input image.
  y = tf.nn.softmax((tf.matmul(h2, w_o) + b_o))



# Op that initialises every tf.Variable defined above.
init = tf.initialize_all_variables()

# Nothing is evaluated until it is run inside a session.
sess = tf.Session()
sess.run(init)

# Overwrite the freshly initialised weights and biases with the values
# stored in the checkpoint at /tmp/tf_cnn.ckpt.
saver = tf.train.Saver([w_h1,b_h1,w_h2,b_h2,w_o,b_o])
saver.restore(sess, "/tmp/tf_cnn.ckpt")


# Only the first test image is classified here, so the labels are not
# needed and are no longer converted.
test_image=image_data(test)


# http://d.hatena.ne.jp/sugyan/20151124/1448292129
# The header used to read "精度" (accuracy), but what follows is the input
# vector and the per-class softmax output, not an accuracy value.
print("結果")
print(test_image[0])
# Feed a batch of one image; res[0] is its row of NUMCLASS outputs.
res=sess.run(y, feed_dict={x: [test_image[0]]})
print(res[0])
# Release the session's resources now that inference is finished.
sess.close()


# Report the end time and the elapsed wall-clock time.
end_time = time.time()
print("終了時刻: " + str(end_time))
print("かかった時間: " + str(end_time - start_time))

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

#!/bin/env python

# -*- coding: utf-8 -*-

# http://qiita.com/ikki8412/items/95bc81a744dc377d9119

import tensorflow as tf

import numpy as np

import random

import time

import math

NUMCLASS=10

NUMPARAM=784

### データ処理用

def label_data(lines):

labels=[]

for line in lines:

# ラベルを1-of-k方式で用意する

tmp = np.zeros(NUMCLASS)

tmp[int(line)] = 1

labels.append(tmp)

return np.asarray(labels)

def image_data(test):

test_image=map(lambda n: map(lambda k: float(k)/255.0,n),test[:,1:NUMPARAM+1])

return np.asarray(test_image)

# 開始時刻

start_time = time.time()

print "開始時刻: " + str(start_time)

### データ取得 --->

# ファイルを開く

f = open("t10k.txt", 'r')

test = []

for line in f:

line = line.rstrip()

l = line.split(" ")

l = map(lambda n: int(n),l)

#l=map(lambda n: 0 if n=="0" else 1,l)

test.append(l)

test = np.asarray(test)

f.close()

### データ取得 ---<

# 訓練画像を入れる変数

# 訓練画像は28x28pxであり、これらを1行784列のベクトルに並び替え格納する

# Noneとなっているのは訓練画像がいくつでも入れられるようにするため

x = tf.placeholder(tf.float32, [None, NUMPARAM], name="x-input")

# 交差エントロピー

# y_は正解データのラベル

# 損失とオプティマイザを定義します

y_ = tf.placeholder(tf.float32, [None, NUMCLASS], name="y-input")

# hidden1

with tf.name_scope("hidden_layer1") as scope:

w_h1 = tf.Variable(tf.truncated_normal([NUMPARAM, 500],

stddev=1.0 / math.sqrt(float(NUMPARAM))),name='weights')

b_h1 = tf.Variable(tf.zeros([500]),name='biases')

h1 = tf.nn.sigmoid(tf.matmul(x, w_h1) + b_h1)

# hidden2

with tf.name_scope("hidden_layer2") as scope:

w_h2 = tf.Variable(tf.truncated_normal([500, 300],

stddev=1.0 / math.sqrt(float(500))),name='weights')

b_h2 = tf.Variable(tf.zeros([300]),name='biases')

h2 = tf.nn.sigmoid(tf.matmul(h1, w_h2) + b_h2)

# softmax layer

with tf.name_scope("softmax_layer") as scope:

w_o = tf.Variable(tf.truncated_normal([300, NUMCLASS],

stddev=1.0 / math.sqrt(float(300))),name='weights')

b_o = tf.Variable(tf.zeros([NUMCLASS]),name='biases')

y = tf.nn.softmax((tf.matmul(h2, w_o) + b_o))

# 用意した変数Veriableの初期化を実行する

init = tf.initialize_all_variables()

# Sessionを開始する

# runすることで初めて実行開始される（run(init)しないとinitが実行されない）

sess = tf.Session()

sess.run(init)

saver = tf.train.Saver([w_h1,b_h1,w_h2,b_h2,w_o,b_o])

saver.restore(sess, "/tmp/tf_cnn.ckpt")

# 精度の実行と表示

# テストデータの画像とラベルで精度を確認する

# ソフトマックス回帰によってWとbの値が計算されているので、xを入力することでyが計算できる

test_label=label_data(test[:,0])

test_image=image_data(test)

# http://d.hatena.ne.jp/sugyan/20151124/1448292129

print "精度"

print(test_image[0])

res=(sess.run(y, feed_dict={x: [test_image[0]]}))

print res[0]

# 終了時刻

end_time = time.time()

print "終了時刻: " + str(end_time)

print "かかった時間: " + str(end_time - start_time)

実行

学習させて、テストデータの先頭の１つを判定します。その際、どの数値をどの確率で判定しているかのリストを表示させています。その数値が最も高いものがこのエンジンでの判定結果となります。

$ python tf_cnn3.py
開始時刻: 1459469907.05
--- 訓練開始 ---
--- 訓練終了 ---
精度
0.9804
終了時刻: 1459470791.98
かかった時間: 884.936733961
ubuntu@ubuntu:~/mnist$ python tf_cnn3_exec.py
開始時刻: 1459471244.43
結果
[	1.49596369e-09	 4.64228158e-08	 5.76857360e-07	 1.40932752e-05
2.07982364e-11	 3.24105409e-10	 2.27739631e-15	 9.99985099e-01
5.93633684e-11	 2.51105405e-07]
終了時刻: 1459471249.84
かかった時間: 5.40709519386

$ python tf_cnn3.py

開始時刻: 1459469907.05

--- 訓練開始 ---

--- 訓練終了 ---

精度

0.9804

終了時刻: 1459470791.98

かかった時間: 884.936733961

ubuntu@ubuntu:~/mnist$ python tf_cnn3_exec.py

開始時刻: 1459471244.43

結果

[ 1.49596369e-09 4.64228158e-08 5.76857360e-07 1.40932752e-05

2.07982364e-11 3.24105409e-10 2.27739631e-15 9.99985099e-01

5.93633684e-11 2.51105405e-07]

終了時刻: 1459471249.84

かかった時間: 5.40709519386

この結果では７が9.99985099e-01で最も数値が高くなっています

TensorflowでMNIST（４）

投稿者: utsubo 投稿日: 2016-03-31 in ML、python

前回までで、MNISTをDeeplearningするにあたって、回帰分析、多層パーセプトロン、畳み込みニューラルネットワークとTensorflowで実装してみました。

入力データに関しての補足です

入力用のデータはMNISTのデータを使っているのですが、28×28のビットマップデータを0－255までの数値（白黒）で表したデータを入力とします。

これを用いてTensorflowの入力データ用に１次元配列に変換します。その際、ビットマップデータを左上から順に１次元の配列に格納しているので、結局784要素の配列となります。これを、ビットマップデータ数分用意(60000)するので結局、784×60000という巨大な行列が入力となります。

実際の入力利用したデータは、その列の先頭に正解を付与しているので、785×60000でできたファイルとなります

入力ファイル

これが１つのデータです。先頭の５が正解データ、それ以降０から続くデータがビットマップの数値表現です

train.txt

5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 18 18 18 126
136 175 26 166 255 247 127 0 0 0 0 0 0 0 0 0 0 0 0 30 36 94 154 170 253 253 253
253 253 225 172 253 242 195 64 0 0 0 0 0 0 0 0 0 0 0 49 238 253 253 253 253 253
253 253 253 251 93 82 82 56 39 0 0 0 0 0 0 0 0 0 0 0 0 18 219 253 253 253 253 2
53 198 182 247 241 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 80 156 107 253 253 205 11
0 43 154 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 1 154 253 90 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 139 253 190 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 11 190 253 70 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
35 241 225 160 108 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 81 240 253 25
3 119 25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 45 186 253 253 150 27 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 93 252 253 187 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 249 253 249 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 46 130 183 253 253 207 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 39 148 22
9 253 253 253 250 182 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 24 114 221 253 253 253
253 201 78 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 23 66 213 253 253 253 253 198 81 2
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 171 219 253 253 253 253 195 80 9 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 55 172 226 253 253 253 253 244 133 11 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 136 253 253 253 212 135 132 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0

5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 18 18 18 126

136 175 26 166 255 247 127 0 0 0 0 0 0 0 0 0 0 0 0 30 36 94 154 170 253 253 253

253 253 225 172 253 242 195 64 0 0 0 0 0 0 0 0 0 0 0 49 238 253 253 253 253 253

253 253 253 251 93 82 82 56 39 0 0 0 0 0 0 0 0 0 0 0 0 18 219 253 253 253 253 2

53 198 182 247 241 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 80 156 107 253 253 205 11

0 43 154 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 1 154 253 90 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 139 253 190 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 11 190 253 70 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

35 241 225 160 108 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 81 240 253 25

3 119 25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 45 186 253 253 150 27 0 0

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 93 252 253 187 0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 249 253 249 64 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

0 0 0 0 46 130 183 253 253 207 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 39 148 22

9 253 253 253 250 182 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 24 114 221 253 253 253

253 201 78 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 23 66 213 253 253 253 253 198 81 2

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 171 219 253 253 253 253 195 80 9 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0 55 172 226 253 253 253 253 244 133 11 0 0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 136 253 253 253 212 135 132 16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

0 0 0

Tensorflow入力

実際にTensorflowに計算させる際には0－255のビットマップデータを0－1の表現に変更します

[ 0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.
0.32941176	0.7254902	 0.62352941	0.59215686	0.23529412	0.14117647
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.87058824	0.99607843	0.99607843	0.99607843	0.99607843
0.94509804	0.77647059	0.77647059	0.77647059	0.77647059	0.77647059
0.77647059	0.77647059	0.77647059	0.66666667	0.20392157	0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.2627451	 0.44705882	0.28235294
0.44705882	0.63921569	0.89019608	0.99607843	0.88235294	0.99607843　
...
0.4745098	 0.99607843	0.81176471	0.07058824	0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.					0.					0.					0.
0.					0.					0.					0.				]

[ 0. 0. 0. 0. 0. 0. 0.

0. 0. 0. 0. 0. 0. 0.

0. 0. 0. 0. 0. 0.

0.32941176 0.7254902 0.62352941 0.59215686 0.23529412 0.14117647

0. 0. 0. 0. 0. 0. 0.

0. 0.87058824 0.99607843 0.99607843 0.99607843 0.99607843

0.94509804 0.77647059 0.77647059 0.77647059 0.77647059 0.77647059

0.77647059 0.77647059 0.77647059 0.66666667 0.20392157 0. 0.

0. 0. 0. 0. 0. 0. 0.

0. 0. 0. 0.2627451 0.44705882 0.28235294

0.44705882 0.63921569 0.89019608 0.99607843 0.88235294 0.99607843　

...

0.4745098 0.99607843 0.81176471 0.07058824 0. 0. 0.

0. 0. 0. 0. 0. 0. 0.

0. 0. 0. 0. ]

教師データ

正解データは1-of-k方式のデータに直します。下記ですと７になります

[ 0.	0.	0.	0.	0.	0.	0.	1.	0.	0.]

1	[ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]

回帰分析のダミー変数みたいな感じだと思えばいいかと思います

TensorflowでMNIST（３）

投稿者: utsubo 投稿日: 2016-03-30 in ML、python

最後は畳み込みニューラルネットワーク(CNN)を実装します。

TensorflowではExpertとしてTutorialに入っているものです。

前回と同様に、MNISTのデータを作成し、Deeplearningしてみます。

コードは前回、前々回のものの一部エンジン部分のみを変更していますので合わせて比較してみるとよくわかるかもしれません。

データ作成

まずデータを作成します。こちらの手順を実行します

http://d.hatena.ne.jp/anagotan/20160328/1459156607

train.txtとt10k.txtを作成しておきます

tf_cnn.py

こちらのコードは以下のものを流用させていただきました

http://qiita.com/ikki8412/items/95bc81a744dc377d9119

#!/bin/env python
# -*- coding: utf-8 -*-
# http://qiita.com/ikki8412/items/95bc81a744dc377d9119
import tensorflow as tf
import numpy as np
import random
import time
import math

NUMCLASS=10
NUMPARAM=784
   
### データ処理用
def label_data(lines):
  """Convert class labels into one-hot (1-of-k) rows.

  Args:
    lines: iterable of values accepted by int(); each is used as the
      index of the single 1 in a vector of length NUMCLASS.

  Returns:
    np.ndarray built from one length-NUMCLASS vector per input label.
  """
  def one_hot(value):
    # All zeros except for a 1 at the label's index.
    row = np.zeros(NUMCLASS)
    row[int(value)] = 1
    return row

  return np.asarray([one_hot(value) for value in lines])

def image_data(test):
  """Extract the pixel columns and scale them by 1/255.

  Args:
    test: 2-D array-like; column 0 is skipped and columns 1..NUMPARAM
      are taken as pixel values.

  Returns:
    2-D float np.ndarray holding the selected columns divided by 255.0.
  """
  # One vectorised division replaces the nested map(): the values are the
  # same, but there are no per-pixel Python calls (this runs once per
  # training step) and the result does not depend on Python 2's
  # list-returning map().
  pixels = np.asarray(test)[:, 1:NUMPARAM + 1]
  return pixels.astype(np.float64) / 255.0


# Record the start time so the elapsed time can be reported at the end.
start_time = time.time()
print("開始時刻: " + str(start_time))


### Load data --->
def _load_rows(path):
    """Read a file of space-separated integers into an array, one row per line."""
    rows = []
    # The with-statement closes the file even if a line fails to parse.
    with open(path, 'r') as f:
        for line in f:
            # Strip the newline, split on single spaces and convert to int.
            rows.append([int(n) for n in line.rstrip().split(" ")])
    # Convert the list of rows to a NumPy array.
    return np.asarray(rows)


train = _load_rows("train.txt")
test = _load_rows("t10k.txt")
### Load data ---<


# Placeholder for the input images.
# Each image is 28x28 px, flattened into a row vector of NUMPARAM (784) values.
# The first dimension is None so that any number of images can be fed.
x = tf.placeholder(tf.float32, [None, NUMPARAM], name="x-input")

# Placeholder for the ground-truth labels, one row of NUMCLASS values per
# image; the cross-entropy loss below is computed against it.
y_ = tf.placeholder(tf.float32, [None, NUMCLASS], name="y-input")


# Hidden layer 1: NUMPARAM -> 500 units, sigmoid activation.
# NOTE: the Python names weights/biases are rebound in every layer below;
# only the layer outputs (hidden1, hidden2, y) are referenced afterwards.
with tf.name_scope("hidden_layer1") as scope:
  # Weights start from a truncated normal with stddev 1/sqrt(input size);
  # biases start at zero.
  weights = tf.Variable(tf.truncated_normal([NUMPARAM, 500],
                          stddev=1.0 / math.sqrt(float(NUMPARAM))),name='weights')
  biases = tf.Variable(tf.zeros([500]),name='biases')

  hidden1 = tf.nn.sigmoid(tf.matmul(x, weights) + biases)
# Hidden layer 2: 500 -> 300 units, sigmoid activation.
with tf.name_scope("hidden_layer2") as scope:
  weights = tf.Variable(tf.truncated_normal([500, 300],
                          stddev=1.0 / math.sqrt(float(500))),name='weights')
  biases = tf.Variable(tf.zeros([300]),name='biases')

  hidden2 = tf.nn.sigmoid(tf.matmul(hidden1, weights) + biases)
# Softmax output layer: 300 -> NUMCLASS units.
with tf.name_scope("softmax_layer") as scope:
  weights = tf.Variable(tf.truncated_normal([300, NUMCLASS],
                          stddev=1.0 / math.sqrt(float(300))),name='weights')
  biases = tf.Variable(tf.zeros([NUMCLASS]),name='biases')

  # y holds one softmax-normalised row of NUMCLASS values per input image.
  y = tf.nn.softmax((tf.matmul(hidden2, weights) + biases))


# Group the loss ops under one name scope to keep the graph view tidy.
with tf.name_scope("xent") as scope:
  # Clip the softmax output away from 0 before taking the log: a saturated
  # output would otherwise give log(0) = -inf and 0 * -inf = NaN, which
  # poisons the loss and every later gradient step.
  cross_entropy = -tf.reduce_sum(y_*tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
  # Expose the loss to TensorBoard.
  tf.scalar_summary("cross_entropy", cross_entropy)

  # Gradient descent (learning rate 0.01) minimising the cross-entropy.
  train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

# Op that initialises every tf.Variable defined above.
init = tf.initialize_all_variables()

# Nothing is evaluated until it is run inside a session.
sess = tf.Session()
sess.run(init)
# Merge the summary ops and write them, together with the graph, under
# /tmp/data for TensorBoard.
summary_op = tf.merge_all_summaries()
summary_writer = tf.train.SummaryWriter("/tmp/data", sess.graph_def)


# Run 20000 training steps.  Each step uses 100 rows sampled at random
# (without replacement) from the training set; feeding the whole set on
# every step would take far longer.
print("--- 訓練開始 ---")
for i in range(20000):
  train_sample=np.asarray(random.sample(train,100))
  # feed_dict binds concrete arrays to the placeholders for this step.
  feed={x: image_data(train_sample), y_: label_data(train_sample[:,0])}
  # train_step is an op, so run() has no useful result to keep.
  sess.run(train_step, feed_dict=feed)

  # Record the summaries for this step.
  summary_writer.add_summary(sess.run(summary_op, feed_dict=feed), i)
# Flush buffered events to disk so the last steps are not lost.
summary_writer.close()
print("--- 訓練終了 ---")

# Build the evaluation ops.  argmax over axis 1 gives, per row, the index
# of the largest entry: the predicted class for y and the labelled class
# for y_.  tf.equal marks the rows where the two agree.
with tf.name_scope("test") as scope:
  predicted_class = tf.argmax(y,1)
  true_class = tf.argmax(y_,1)
  correct_prediction = tf.equal(predicted_class, true_class)

  # Cast the booleans to float (True -> 1, False -> 0) and average them to
  # get the fraction of correct predictions.
  accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

  tf.scalar_summary("accuracy", accuracy)

# Evaluate the accuracy on the test images and labels.
test_label=label_data(test[:,0])
test_image=image_data(test)
test_feed={x: test_image, y_: test_label}
print("精度")
print(sess.run(accuracy, feed_dict=test_feed))

# Report the end time and the elapsed wall-clock time.
end_time = time.time()
print("終了時刻: " + str(end_time))
elapsed = end_time - start_time
print("かかった時間: " + str(elapsed))

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

#!/bin/env python

# -*- coding: utf-8 -*-

# http://qiita.com/ikki8412/items/95bc81a744dc377d9119

import tensorflow as tf

import numpy as np

import random

import time

import math

NUMCLASS=10

NUMPARAM=784

### データ処理用

def label_data(lines):

labels=[]

for line in lines:

# ラベルを1-of-k方式で用意する

tmp = np.zeros(NUMCLASS)

tmp[int(line)] = 1

labels.append(tmp)

return np.asarray(labels)

def image_data(test):

test_image=map(lambda n: map(lambda k: float(k)/255.0,n),test[:,1:NUMPARAM+1])

return np.asarray(test_image)

# 開始時刻

start_time = time.time()

print "開始時刻: " + str(start_time)

### データ取得 --->

# ファイルを開く

f = open("train.txt", 'r')

# データを入れる配列

train = []

for line in f:

# 改行を除いてスペース区切りにする

line = line.rstrip()

l = line.split(" ")

l = map(lambda n: int(n),l)

#l=map(lambda n: 0 if n=="0" else 1,l)

train.append(l)

# numpy形式に変換

train = np.asarray(train)

f.close()

f = open("t10k.txt", 'r')

test = []

for line in f:

line = line.rstrip()

l = line.split(" ")

l = map(lambda n: int(n),l)

#l=map(lambda n: 0 if n=="0" else 1,l)

test.append(l)

test = np.asarray(test)

f.close()

### データ取得 ---<

# 訓練画像を入れる変数

# 訓練画像は28x28pxであり、これらを1行784列のベクトルに並び替え格納する

# Noneとなっているのは訓練画像がいくつでも入れられるようにするため

x = tf.placeholder(tf.float32, [None, NUMPARAM], name="x-input")

# 交差エントロピー

# y_は正解データのラベル

# 損失とオプティマイザを定義します

y_ = tf.placeholder(tf.float32, [None, NUMCLASS], name="y-input")

# hidden1

with tf.name_scope("hidden_layer1") as scope:

weights = tf.Variable(tf.truncated_normal([NUMPARAM, 500],

stddev=1.0 / math.sqrt(float(NUMPARAM))),name='weights')

biases = tf.Variable(tf.zeros([500]),name='biases')

hidden1 = tf.nn.sigmoid(tf.matmul(x, weights) + biases)

# hidden2

with tf.name_scope("hidden_layer2") as scope:

weights = tf.Variable(tf.truncated_normal([500, 300],

stddev=1.0 / math.sqrt(float(500))),name='weights')

biases = tf.Variable(tf.zeros([300]),name='biases')

hidden2 = tf.nn.sigmoid(tf.matmul(hidden1, weights) + biases)

# softmax layer

with tf.name_scope("softmax_layer") as scope:

weights = tf.Variable(tf.truncated_normal([300, NUMCLASS],

stddev=1.0 / math.sqrt(float(300))),name='weights')

biases = tf.Variable(tf.zeros([NUMCLASS]),name='biases')

y = tf.nn.softmax((tf.matmul(hidden2, weights) + biases))

# 更なる name scopes はグラフ表現をクリーンアップしま

with tf.name_scope("xent") as scope:

cross_entropy = -tf.reduce_sum(y_*tf.log(y))

# TensorBoardで表示するよう指定

tf.scalar_summary("cross_entropy", cross_entropy)

# 勾配硬化法を用い交差エントロピーが最小となるようyを最適化する

train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

# 用意した変数Veriableの初期化を実行する

init = tf.initialize_all_variables()

# Sessionを開始する

# runすることで初めて実行開始される（run(init)しないとinitが実行されない）

sess = tf.Session()

sess.run(init)

# TensorBoardで表示する値の設定

summary_op = tf.merge_all_summaries()

summary_writer = tf.train.SummaryWriter("/tmp/data", sess.graph_def)

# 1000回の訓練（train_step）を実行する

# next_batch(100)で100つのランダムな訓練セット（画像と対応するラベル）を選択する

# 訓練データは60000点あるので全て使いたいところだが費用つまり時間がかかるのでランダムな100つを使う

# 100つでも同じような結果を得ることができる

# feed_dictでplaceholderに値を入力することができる

print "--- 訓練開始 ---"

for i in range(20000):

train_sample=np.asarray(random.sample(train,100))

batch_ys=label_data(train_sample[:,0])

batch_xs=image_data(train_sample)

train_accuracy=sess.run(train_step, feed_dict={x: batch_xs, y_:batch_ys})

# 1 step終わるたびにTensorBoardに表示する値を追加する

summary_str=sess.run(summary_op, feed_dict={x: batch_xs, y_:batch_ys})

summary_writer.add_summary(summary_str, i)

print "--- 訓練終了 ---"

# 正しいかの予測

# 計算された画像がどの数字であるかの予測yと正解ラベルy_を比較する

# 同じ値であればTrueが返される

# argmaxは配列の中で一番値の大きい箇所のindexが返される

# 一番値が大きいindexということは、それがその数字である確率が一番大きいということ

# Trueが返ってくるということは訓練した結果と回答が同じということ

with tf.name_scope("test") as scope:

correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))

# 精度の計算

# correct_predictionはbooleanなのでfloatにキャストし、平均値を計算する

# Trueならば1、Falseならば0に変換される

accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

tf.scalar_summary("accuracy", accuracy)

# 精度の実行と表示

# テストデータの画像とラベルで精度を確認する

# ソフトマックス回帰によってWとbの値が計算されているので、xを入力することでyが計算できる

test_label=label_data(test[:,0])

test_image=image_data(test)

print "精度"

print(sess.run(accuracy, feed_dict={x: test_image, y_: test_label}))

# 終了時刻

end_time = time.time()

print "終了時刻: " + str(end_time)

print "かかった時間: " + str(end_time - start_time)

実行

$ python tf_cnn.py
開始時刻: 1459233406.76
--- 訓練開始 ---
--- 訓練終了 ---
精度
0.9806
終了時刻: 1459234272.93
かかった時間: 866.176848888

$ python tf_cnn.py

開始時刻: 1459233406.76

--- 訓練開始 ---

--- 訓練終了 ---

精度

0.9806

終了時刻: 1459234272.93

かかった時間: 866.176848888

回帰(0.9227)、多層パーセプトロン(0.9562)に比較し0.9806とかなり精度が上がりました

TensorflowでMNIST（２）

投稿者: utsubo 投稿日: 2016-03-29 in ML、python

今回は中級？者向けの多層パーセプトロン(multilayer perceptron)を実装します

こちらのコードを流用させていただきました

http://qiita.com/TomokIshii/items/92a266b805d7eee02b1d

前回と同様に、input_data.pyを使わずにデータを自前で作成します

http://d.hatena.ne.jp/anagotan/20160328/1459156607

train.txtとt10k.txtを作成しておきます

tf_mlp.py

#!/bin/env python
# -*- coding: utf-8 -*-
# http://qiita.com/ikki8412/items/95bc81a744dc377d9119
import tensorflow as tf
import numpy as np
import random
import time
import math

NUMCLASS=10
NUMPARAM=784
NUMHIDDEN=625
   
### データ処理用
def label_data(lines):
  """Convert class labels into one-hot (1-of-k) rows.

  Args:
    lines: iterable of values accepted by int(); each is used as the
      index of the single 1 in a vector of length NUMCLASS.

  Returns:
    np.ndarray built from one length-NUMCLASS vector per input label.
  """
  def one_hot(value):
    # All zeros except for a 1 at the label's index.
    row = np.zeros(NUMCLASS)
    row[int(value)] = 1
    return row

  return np.asarray([one_hot(value) for value in lines])

def image_data(test):
  """Extract the pixel columns and scale them by 1/255.

  Args:
    test: 2-D array-like; column 0 is skipped and columns 1..NUMPARAM
      are taken as pixel values.

  Returns:
    2-D float np.ndarray holding the selected columns divided by 255.0.
  """
  # One vectorised division replaces the nested map(): the values are the
  # same, but there are no per-pixel Python calls (this runs once per
  # training step) and the result does not depend on Python 2's
  # list-returning map().
  pixels = np.asarray(test)[:, 1:NUMPARAM + 1]
  return pixels.astype(np.float64) / 255.0


# Record the start time so the elapsed time can be reported at the end.
start_time = time.time()
print("開始時刻: " + str(start_time))


### Load data --->
def _load_rows(path):
    """Read a file of space-separated integers into an array, one row per line."""
    rows = []
    # The with-statement closes the file even if a line fails to parse.
    with open(path, 'r') as f:
        for line in f:
            # Strip the newline, split on single spaces and convert to int.
            rows.append([int(n) for n in line.rstrip().split(" ")])
    # Convert the list of rows to a NumPy array.
    return np.asarray(rows)


train = _load_rows("train.txt")
test = _load_rows("t10k.txt")
### Load data ---<


# Placeholder for the input images.
# Each image is 28x28 px, flattened into a row vector of NUMPARAM (784) values.
# The first dimension is None so that any number of images can be fed.
x = tf.placeholder(tf.float32, [None, NUMPARAM], name="x-input")

# Placeholder for the ground-truth labels, one row of NUMCLASS values per
# image; the cross-entropy loss below is computed against it.
y_ = tf.placeholder(tf.float32, [None, NUMCLASS], name="y-input")


# Hidden layer: NUMPARAM -> NUMHIDDEN units, sigmoid activation.
with tf.name_scope("hidden_layer1") as scope:
  # Weights start from a normal distribution (mean 0, stddev 0.05);
  # biases start at zero.
  w_h = tf.Variable(tf.random_normal([NUMPARAM, NUMHIDDEN],mean=0.0, stddev=0.05))
  b_h = tf.Variable(tf.zeros([NUMHIDDEN]),name='biases')

  h = tf.sigmoid(tf.matmul(x, w_h) + b_h)
# Output layer: NUMHIDDEN -> NUMCLASS units, softmax.
with tf.name_scope("output_layer") as scope:
  # Weights start from a truncated normal (mean 0, stddev 0.05).
  w_o = tf.Variable(tf.truncated_normal([NUMHIDDEN, NUMCLASS],mean=0.0, stddev=0.05))
  b_o = tf.Variable(tf.zeros([NUMCLASS]),name='biases')

  # y holds one softmax-normalised row of NUMCLASS values per input image.
  y = tf.nn.softmax((tf.matmul(h, w_o) + b_o))


# Group the loss ops under one name scope to keep the graph view tidy.
with tf.name_scope("xent") as scope:
  # Basic cost term.  The softmax output is clipped away from 0 before the
  # log: a saturated output would otherwise give log(0) = -inf and
  # 0 * -inf = NaN, which poisons the loss and every later gradient step.
  cross_entropy = -tf.reduce_sum(y_*tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

  # Regularization term (weight decay) over both weight matrices.
  L2_sqr = tf.nn.l2_loss(w_h) + tf.nn.l2_loss(w_o)
  lambda_2 = 0.01
  # Total loss: cross-entropy plus the weighted L2 penalty.
  loss = cross_entropy + lambda_2 * L2_sqr

  # Expose the cross-entropy to TensorBoard.
  tf.scalar_summary("cross_entropy", cross_entropy)

  # Gradient descent (learning rate 0.001) on the regularised loss.  The
  # optimiser was previously handed cross_entropy, which left L2_sqr,
  # lambda_2 and loss computed but never used.
  train_step = tf.train.GradientDescentOptimizer(0.001).minimize(loss)

# Op that initialises every tf.Variable defined above.
init = tf.initialize_all_variables()

# Nothing is evaluated until it is run inside a session.
sess = tf.Session()
sess.run(init)
# Merge the summary ops and write them, together with the graph, under
# /tmp/data for TensorBoard.
summary_op = tf.merge_all_summaries()
summary_writer = tf.train.SummaryWriter("/tmp/data", sess.graph_def)


# Run 20000 training steps.  Each step uses 100 rows sampled at random
# (without replacement) from the training set; feeding the whole set on
# every step would take far longer.
print("--- 訓練開始 ---")
for i in range(20000):
  train_sample=np.asarray(random.sample(train,100))
  # feed_dict binds concrete arrays to the placeholders for this step.
  feed={x: image_data(train_sample), y_: label_data(train_sample[:,0])}
  # train_step is an op, so run() has no useful result to keep.
  sess.run(train_step, feed_dict=feed)

  # Record the summaries for this step.
  summary_writer.add_summary(sess.run(summary_op, feed_dict=feed), i)
# Flush buffered events to disk so the last steps are not lost.
summary_writer.close()
print("--- 訓練終了 ---")

# Build the evaluation ops.  argmax over axis 1 gives, per row, the index
# of the largest entry: the predicted class for y and the labelled class
# for y_.  tf.equal marks the rows where the two agree.
with tf.name_scope("test") as scope:
  predicted_class = tf.argmax(y,1)
  true_class = tf.argmax(y_,1)
  correct_prediction = tf.equal(predicted_class, true_class)

  # Cast the booleans to float (True -> 1, False -> 0) and average them to
  # get the fraction of correct predictions.
  accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

  tf.scalar_summary("accuracy", accuracy)

# Evaluate the accuracy on the test images and labels.
test_label=label_data(test[:,0])
test_image=image_data(test)
test_feed={x: test_image, y_: test_label}
print("精度")
print(sess.run(accuracy, feed_dict=test_feed))

# Report the end time and the elapsed wall-clock time.
end_time = time.time()
print("終了時刻: " + str(end_time))
elapsed = end_time - start_time
print("かかった時間: " + str(elapsed))

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

#!/bin/env python

# -*- coding: utf-8 -*-

# http://qiita.com/ikki8412/items/95bc81a744dc377d9119

import tensorflow as tf

import numpy as np

import random

import time

import math

NUMCLASS=10

NUMPARAM=784

NUMHIDDEN=625

### データ処理用

def label_data(lines):

labels=[]

for line in lines:

# ラベルを1-of-k方式で用意する

tmp = np.zeros(NUMCLASS)

tmp[int(line)] = 1

labels.append(tmp)

return np.asarray(labels)

def image_data(test):

test_image=map(lambda n: map(lambda k: float(k)/255.0,n),test[:,1:NUMPARAM+1])

return np.asarray(test_image)

# 開始時刻

start_time = time.time()

print "開始時刻: " + str(start_time)

### データ取得 --->

# ファイルを開く

f = open("train.txt", 'r')

# データを入れる配列

train = []

for line in f:

# 改行を除いてスペース区切りにする

line = line.rstrip()

l = line.split(" ")

l = map(lambda n: int(n),l)

#l=map(lambda n: 0 if n=="0" else 1,l)

train.append(l)

# numpy形式に変換

train = np.asarray(train)

f.close()

f = open("t10k.txt", 'r')

test = []

for line in f:

line = line.rstrip()

l = line.split(" ")

l = map(lambda n: int(n),l)

#l=map(lambda n: 0 if n=="0" else 1,l)

test.append(l)

test = np.asarray(test)

f.close()

### データ取得 ---<

# 訓練画像を入れる変数

# 訓練画像は28x28pxであり、これらを1行784列のベクトルに並び替え格納する

# Noneとなっているのは訓練画像がいくつでも入れられるようにするため

x = tf.placeholder(tf.float32, [None, NUMPARAM], name="x-input")

# 交差エントロピー

# y_は正解データのラベル

# 損失とオプティマイザを定義します

y_ = tf.placeholder(tf.float32, [None, NUMCLASS], name="y-input")

# hidden1

with tf.name_scope("hidden_layer1") as scope:

w_h = tf.Variable(tf.random_normal([NUMPARAM, NUMHIDDEN],mean=0.0, stddev=0.05))

b_h = tf.Variable(tf.zeros([NUMHIDDEN]),name='biases')

h = tf.sigmoid(tf.matmul(x, w_h) + b_h)

# output layer

with tf.name_scope("output_layer") as scope:

w_o = tf.Variable(tf.truncated_normal([NUMHIDDEN, NUMCLASS],mean=0.0, stddev=0.05))

b_o = tf.Variable(tf.zeros([NUMCLASS]),name='biases')

y = tf.nn.softmax((tf.matmul(h, w_o) + b_o))

# 更なる name scopes はグラフ表現をクリーンアップしま

with tf.name_scope("xent") as scope:

# Cost Function basic term

cross_entropy = -tf.reduce_sum(y_*tf.log(y))

# Regularization terms (weight decay)

L2_sqr = tf.nn.l2_loss(w_h) + tf.nn.l2_loss(w_o)

lambda_2 = 0.01

# the loss and accuracy

loss = cross_entropy + lambda_2 * L2_sqr

# TensorBoardで表示するよう指定

tf.scalar_summary("cross_entropy", cross_entropy)

# 勾配硬化法を用い交差エントロピーが最小となるようyを最適化する

train_step = tf.train.GradientDescentOptimizer(0.001).minimize(cross_entropy)

# 用意した変数Veriableの初期化を実行する

init = tf.initialize_all_variables()

# Sessionを開始する

# runすることで初めて実行開始される（run(init)しないとinitが実行されない）

sess = tf.Session()

sess.run(init)

# TensorBoardで表示する値の設定

summary_op = tf.merge_all_summaries()

summary_writer = tf.train.SummaryWriter("/tmp/data", sess.graph_def)

# 1000回の訓練（train_step）を実行する

# next_batch(100)で100つのランダムな訓練セット（画像と対応するラベル）を選択する

# 訓練データは60000点あるので全て使いたいところだが費用つまり時間がかかるのでランダムな100つを使う

# 100つでも同じような結果を得ることができる

# feed_dictでplaceholderに値を入力することができる

print "--- 訓練開始 ---"

for i in range(20000):

train_sample=np.asarray(random.sample(train,100))

batch_ys=label_data(train_sample[:,0])

batch_xs=image_data(train_sample)

train_accuracy=sess.run(train_step, feed_dict={x: batch_xs, y_:batch_ys})

# 1 step終わるたびにTensorBoardに表示する値を追加する

summary_str=sess.run(summary_op, feed_dict={x: batch_xs, y_:batch_ys})

summary_writer.add_summary(summary_str, i)

print "--- 訓練終了 ---"

# 正しいかの予測

# 計算された画像がどの数字であるかの予測yと正解ラベルy_を比較する

# 同じ値であればTrueが返される

# argmaxは配列の中で一番値の大きい箇所のindexが返される

# 一番値が大きいindexということは、それがその数字である確率が一番大きいということ

# Trueが返ってくるということは訓練した結果と回答が同じということ

with tf.name_scope("test") as scope:

correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))

# 精度の計算

# correct_predictionはbooleanなのでfloatにキャストし、平均値を計算する

# Trueならば1、Falseならば0に変換される

accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

tf.scalar_summary("accuracy", accuracy)

# 精度の実行と表示

# テストデータの画像とラベルで精度を確認する

# ソフトマックス回帰によってWとbの値が計算されているので、xを入力することでyが計算できる

test_label=label_data(test[:,0])

test_image=image_data(test)

print "精度"

print(sess.run(accuracy, feed_dict={x: test_image, y_: test_label}))

# 終了時刻

end_time = time.time()

print "終了時刻: " + str(end_time)

print "かかった時間: " + str(end_time - start_time)

実行

$ python tf_mlp.py
開始時刻: 1459237690.97
--- 訓練開始 ---
--- 訓練終了 ---
精度
0.9562
終了時刻: 1459238511.06
かかった時間: 820.087219

$ python tf_mlp.py

開始時刻: 1459237690.97

--- 訓練開始 ---

--- 訓練終了 ---

精度

0.9562

終了時刻: 1459238511.06

かかった時間: 820.087219

回帰の場合には0.9227だったのですが、0.9562まで精度が上がりました

TensorflowでMNIST（１）

投稿者: utsubo 投稿日: 2016-03-28 in ML、python

GoogleのDeeplearningプラットフォームであるtensorflowを触ってみました。

https://www.tensorflow.org/

世の中にはMNISTのサンプルを実行したブログが多いのですが、tutorialを解説しているだけのものが多く、ちょっとよく理解できていませんでした

自分なりに色々と調べてMNISTを理解していきます

まずはBeginnerということのサンプルです。

Beginnerというか、Deeplearningというよりは回帰分析をTensorflowで行っているというサンプルです

データダウンロード

input_data.pyを使うとよくわからないので自分でデータ取得からハンドリングします。

まず、データをダウンロードします。

https://www.tensorflow.org/versions/master/tutorials/mnist/download/index.html

こちらの真ん中ほどにあるリンクから下記４つをダウンロードし解凍しておきます。

train-images-idx3-ubyte.gz

train-labels-idx1-ubyte.gz

t10k-images-idx3-ubyte.gz

t10k-labels-idx1-ubyte.gz

$ gunzip train-images-idx3-ubyte.gz
$ gunzip train-labels-idx1-ubyte.gz
$ gunzip t10k-images-idx3-ubyte.gz
$ gunzip t10k-labels-idx1-ubyte.gz

$ gunzip train-images-idx3-ubyte.gz

$ gunzip train-labels-idx1-ubyte.gz

$ gunzip t10k-images-idx3-ubyte.gz

$ gunzip t10k-labels-idx1-ubyte.gz

データの整形

そのままでは使いづらいので整形します

# Dump the binary files as text: skip the first 16 bytes (images) or 8 bytes
# (labels), then print unsigned bytes, 784 per line for images and 1 per
# line for labels.
od -An -v -tu1 -j16 -w784 train-images-idx3-ubyte | sed 's/^ *//' | tr -s ' ' >train-images.txt
od -An -v -tu1 -j8 -w1 train-labels-idx1-ubyte | tr -d ' ' >train-labels.txt
od -An -v -tu1 -j16 -w784 t10k-images-idx3-ubyte | sed 's/^ *//' | tr -s ' ' >t10k-images.txt
od -An -v -tu1 -j8 -w1 t10k-labels-idx1-ubyte | tr -d ' ' >t10k-labels.txt
# Join an image file and a label file line by line, label first, separated
# by a single space.
# NOTE(review): the original function body (an inline Ruby script) and the
# first file_join call were lost when this listing was rendered as HTML;
# paste(1) stands in for it -- confirm the output against the original files.
file_join(){
image=$1
label=$2
paste -d' ' "$label" "$image"
}
file_join train-images.txt train-labels.txt >train.txt
file_join t10k-images.txt t10k-labels.txt >t10k.txt

od -An -v -tu1 -j16 -w784 train-images-idx3-ubyte | sed 's/^ *//' | tr -s ' ' >train-images.txt

od -An -v -tu1 -j8 -w1 train-labels-idx1-ubyte | tr -d ' ' >train-labels.txt

od -An -v -tu1 -j16 -w784 t10k-images-idx3-ubyte | sed 's/^ *//' | tr -s ' ' >t10k-images.txt

od -An -v -tu1 -j8 -w1 t10k-labels-idx1-ubyte | tr -d ' ' >t10k-labels.txt

file_join(){

image=$1

label=$2

ruby < train.txt

file_join t10k-images.txt t10k-labels.txt > t10k.txt

train.txtとt10k.txtというファイルが作成されます。このファイルは１行ごとにMNISTの画像データの数値データ、０－２５５までの値で構成されています。各行の先頭に正解の数字を入れたデータです。

Deeplearning

Tensorflowのプログラムはこちらの方のサンプルを流用させていただきました

http://tensorflow.classcat.com/2016/02/11/tensorflow-how-tos-visualizing-learning/

#!/bin/env python
# -*- coding: utf-8 -*-
# http://tensorflow.classcat.com/2016/02/11/tensorflow-how-tos-visualizing-learning/
import tensorflow as tf
import numpy as np
import random
import time

# Number of output classes used for the one-hot labels.
NUMCLASS = 10
# Number of feature columns that follow the label column in each row.
NUMPARAM = 784


### Data-processing helpers
def label_data(lines):
    """Convert a sequence of class labels into one-hot (1-of-k) rows.

    Args:
        lines: iterable of labels; each one is converted with ``int()`` and
            used as the column index to set to 1.

    Returns:
        A float array of shape ``(len(lines), NUMCLASS)`` with exactly one
        1 per row. Empty input yields an empty ``(0, NUMCLASS)`` array.

    Raises:
        IndexError: if a label falls outside the NUMCLASS columns.
    """
    # dtype=int keeps the index array usable even when ``lines`` is empty.
    indices = np.asarray([int(value) for value in lines], dtype=int)
    one_hot = np.zeros((len(indices), NUMCLASS))
    one_hot[np.arange(len(indices)), indices] = 1
    return one_hot


def image_data(test):
    """Extract the feature columns of ``test`` and scale them by 1/255.

    Args:
        test: 2-D array-like whose column 0 is skipped and whose next
            NUMPARAM columns hold the values to scale.

    Returns:
        A float64 array containing columns ``1 .. NUMPARAM`` of ``test``
        divided by 255.0.
    """
    # One vectorized slice/divide replaces the nested map() calls, which
    # return lazy iterators on Python 3 and made np.asarray() produce a
    # useless 0-d object array there.
    features = np.asarray(test)[:, 1:NUMPARAM + 1]
    return features.astype(np.float64) / 255.0


# Start time
start_time = time.time()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("開始時刻: " + str(start_time))


### Data loading --->
def _load_rows(path):
    """Read a text file of space-separated integers into a numpy array.

    Trailing whitespace is stripped from each line, the line is split on
    single spaces and every field is converted with ``int()``. The result
    is 2-D when every line has the same number of fields.
    """
    # The with-block closes the file even if a field fails to parse.
    with open(path, 'r') as f:
        # List comprehensions (not map) so each row is a real list on
        # Python 3 as well; np.asarray cannot consume lazy map objects.
        rows = [[int(field) for field in line.rstrip().split(" ")]
                for line in f]
    return np.asarray(rows)


train = _load_rows("train.txt")
test = _load_rows("t10k.txt")
### Data loading ---<

# NOTE(review): the excerpt repeated the identical load of train.txt and
# t10k.txt a second time here; it rebuilt the same arrays, so the files are
# now read only once.
# NOTE(review): sess, accuracy, x and y_ are not defined anywhere in this
# excerpt - the model definition and training loop appear to be missing and
# must be restored before this script can run.

test_label = label_data(test[:, 0])
test_image = image_data(test)
print("精度")
print(sess.run(accuracy, feed_dict={x: test_image, y_: test_label}))

# End time
end_time = time.time()
print("終了時刻: " + str(end_time))
print("かかった時間: " + str(end_time - start_time))

100

101

102

#!/bin/env python

# -*- coding: utf-8 -*-

# http://tensorflow.classcat.com/2016/02/11/tensorflow-how-tos-visualizing-learning/

import tensorflow as tf

import numpy as np

import random

import time

# Number of output classes used for the one-hot labels.
NUMCLASS = 10
# Number of feature columns that follow the label column in each row.
NUMPARAM = 784


### Data-processing helpers
def label_data(lines):
    """Convert a sequence of class labels into one-hot (1-of-k) rows.

    Args:
        lines: iterable of labels; each one is converted with ``int()`` and
            used as the column index to set to 1.

    Returns:
        A float array of shape ``(len(lines), NUMCLASS)`` with exactly one
        1 per row. Empty input yields an empty ``(0, NUMCLASS)`` array.

    Raises:
        IndexError: if a label falls outside the NUMCLASS columns.
    """
    # dtype=int keeps the index array usable even when ``lines`` is empty.
    indices = np.asarray([int(value) for value in lines], dtype=int)
    one_hot = np.zeros((len(indices), NUMCLASS))
    one_hot[np.arange(len(indices)), indices] = 1
    return one_hot


def image_data(test):
    """Extract the feature columns of ``test`` and scale them by 1/255.

    Args:
        test: 2-D array-like whose column 0 is skipped and whose next
            NUMPARAM columns hold the values to scale.

    Returns:
        A float64 array containing columns ``1 .. NUMPARAM`` of ``test``
        divided by 255.0.
    """
    # One vectorized slice/divide replaces the nested map() calls, which
    # return lazy iterators on Python 3 and made np.asarray() produce a
    # useless 0-d object array there.
    features = np.asarray(test)[:, 1:NUMPARAM + 1]
    return features.astype(np.float64) / 255.0

# Start time
start_time = time.time()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("開始時刻: " + str(start_time))


### Data loading --->
def _load_rows(path):
    """Read a text file of space-separated integers into a numpy array.

    Trailing whitespace is stripped from each line, the line is split on
    single spaces and every field is converted with ``int()``. The result
    is 2-D when every line has the same number of fields.
    """
    # The with-block closes the file even if a field fails to parse.
    with open(path, 'r') as f:
        # List comprehensions (not map) so each row is a real list on
        # Python 3 as well; np.asarray cannot consume lazy map objects.
        rows = [[int(field) for field in line.rstrip().split(" ")]
                for line in f]
    return np.asarray(rows)


train = _load_rows("train.txt")
test = _load_rows("t10k.txt")
### Data loading ---<

# NOTE(review): the excerpt repeated the identical load of train.txt and
# t10k.txt a second time here; it rebuilt the same arrays, so the files are
# now read only once.
# NOTE(review): sess, accuracy, x and y_ are not defined anywhere in this
# excerpt - the model definition and training loop appear to be missing and
# must be restored before this script can run.

test_label = label_data(test[:, 0])
test_image = image_data(test)
print("精度")
print(sess.run(accuracy, feed_dict={x: test_image, y_: test_label}))

# End time
end_time = time.time()
print("終了時刻: " + str(end_time))
print("かかった時間: " + str(end_time - start_time))

実行

これを実行します

$ python tf_regression.py
開始時刻: 1459234322.4
--- 訓練開始 ---
--- 訓練終了 ---
精度
0.9227
終了時刻: 1459234921.58
かかった時間: 599.178552866

$ python tf_regression.py

開始時刻: 1459234322.4

--- 訓練開始 ---

--- 訓練終了 ---

精度

0.9227

終了時刻: 1459234921.58

かかった時間: 599.178552866

精度はあまり良くありませんが、計算できました。