scikit-learnのTfidfVectorizerで単語ごとのスコアを取得する

2016-06-17
このエントリーをはてなブックマークに追加
from sklearn.feature_extraction.text import TfidfVectorizer


def main():
    _docs = [
        'A friend to all is a friend to none.',
        'One good turn deserves another.',
        'He who runs after two hares will catch neither.',
        'It’s no use crying over spilt milk.',
        'Time and tide wait for no man.',
    ]

    _v = TfidfVectorizer(analyzer=text_split)
    _score =_v.fit_transform(_docs)

    print(_score.toarray())
    print(_v.get_feature_names())

def text_split(val):
    return val.split(' ')

if __name__ == '__main__':
    main()

$ python tf.py
[[ 0.2773501   0.          0.          0.          0.          0.2773501
   0.          0.2773501   0.          0.          0.          0.          0.
   0.          0.5547002   0.          0.          0.2773501   0.          0.
   0.          0.          0.2773501   0.          0.          0.          0.
   0.5547002   0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.4472136   0.          0.          0.
   0.          0.          0.4472136   0.          0.          0.4472136
   0.          0.          0.4472136   0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.4472136   0.          0.          0.          0.          0.        ]
 [ 0.          0.33333333  0.          0.          0.          0.
   0.33333333  0.          0.          0.          0.33333333  0.          0.
   0.          0.          0.          0.33333333  0.          0.          0.
   0.33333333  0.          0.          0.          0.33333333  0.          0.
   0.          0.          0.33333333  0.          0.          0.33333333
   0.33333333]
 [ 0.          0.          0.38775666  0.          0.          0.          0.
   0.          0.          0.          0.          0.38775666  0.          0.
   0.          0.          0.          0.          0.          0.38775666
   0.          0.31283963  0.          0.38775666  0.          0.38775666
   0.          0.          0.          0.          0.38775666  0.          0.
   0.        ]
 [ 0.          0.          0.          0.          0.38775666  0.          0.
   0.          0.38775666  0.          0.          0.          0.
   0.38775666  0.          0.          0.          0.          0.38775666
   0.          0.          0.31283963  0.          0.          0.          0.
   0.38775666  0.          0.          0.          0.          0.38775666
   0.          0.        ]]

[‘A’, ‘He’, ‘It’s’, ‘One’, ‘Time’, ‘a’, ‘after’, ‘all’, ‘and’, ‘another.’, ‘catch’, ‘crying’, ‘deserves’, ‘for’, ‘friend’, ‘good’, ‘hares’, ‘is’, ‘man.’, ‘milk.’, ‘neither.’, ‘no’, ‘none.’, ‘over’, ‘runs’, ‘spilt’, ‘tide’, ‘to’, ‘turn’, ‘two’, ‘use’, ‘wait’, ‘who’, ‘will’]