Single-machine personalized-recommendation experiment fails with TypeError: 'generator' object is not callable #2455

Closed

xlhlhlx opened this issue Jun 13, 2017 · 6 comments

xlhlhlx commented Jun 13, 2017

I defined my own data reader, and training fails with an error, although running the data reader on its own generates samples correctly. The error log is shown below:
(screenshot of the error log)

The data reader I wrote is as follows:

#!/usr/bin/python
#encoding=utf8
import sys
import os
import random

class CategoryFeatureGenerator(object):
    def __init__(self):
        self.dic = dict()
        self.dic['unk'] = 0
        self.counter = 1

    def register(self, key):
        '''
        Register record.
        '''
        if key not in self.dic:
            self.dic[key] = self.counter
            self.counter += 1

    def size(self):
        return len(self.dic)


    def gen(self, key):
        '''
        Generate one-hot representation for a record.
        '''
        if key not in self.dic:
            res = self.dic['unk']
        else:
            res = self.dic[key]
        return res

    def __repr__(self):
        return '<CategoryFeatureGenerator %d>' % len(self.dic)

feature_fields = ['user_id','user_location','content_id','cate_id','word','check_in_period']
feature_dict = {}
for key in feature_fields:
    feature_dict[key] = CategoryFeatureGenerator()

def __init_dataset__(path):
    with open(path, "r") as f:
        for line in f:
            user_id, time_period, user_location, app_id, content_id, cate_id, title, brief, quality_level, check_in_period, read_time = line.strip().split('\t')
            feature_dict['user_id'].register(user_id)
            feature_dict['content_id'].register(int(content_id))
            feature_dict['cate_id'].register(cate_id)
            feature_dict['check_in_period'].register(int(check_in_period))
            for ul in user_location.split('|'):
                feature_dict['user_location'].register(ul)
            for w in title.split(' '):
                feature_dict['word'].register(w.lower())
            for w in brief.split(' '):
                feature_dict['word'].register(w.lower())

class ReaderData(object):
    def __init__(self, data_path, test_ratio, is_test):
        __init_dataset__(data_path)
        self.data_path = data_path
        self.test_ratio = test_ratio
        self.is_test = is_test
    
    def reader_creator(self):
        def reader():
            rand = random.Random()
            path = self.data_path
            test_ratio = self.test_ratio
            is_test = self.is_test
            with open(path, "r") as f:
                for line in f:
                    if (rand.random() < test_ratio) == is_test:
                        user_id, time_period, user_location, app_id, content_id, cate_id, title, brief, quality_level, check_in_period, read_time = line.strip().split('\t')
                        user_id_code = feature_dict['user_id'].gen(user_id)
                        user_location_code = [feature_dict['user_location'].gen(ul) for ul in user_location.split('|')]
                        content_id_code = feature_dict['content_id'].gen(int(content_id))
                        cate_id_code = feature_dict['cate_id'].gen(cate_id)
                        title_code = [feature_dict['word'].gen(w.lower()) for w in title.split(' ')]
                        brief_code = [feature_dict['word'].gen(w.lower()) for w in brief.split(' ')]
                        check_in_period_code = feature_dict['check_in_period'].gen(int(check_in_period))
                        record = [user_id_code, int(time_period), user_location_code, content_id_code, cate_id_code, title_code, brief_code, check_in_period_code]
                        yield record + [[float(read_time)]]
        return reader

    def get_content_word_dict(self):
        return feature_dict['word'].dic

    def user_id_len(self):
        return feature_dict['user_id'].size()

    def get_user_location_dict(self):
        return feature_dict['user_location'].dic

    def content_id_len(self):
        return feature_dict['content_id'].size()

    def category_id_len(self):
        return feature_dict['cate_id'].size()
   
    def check_in_period_len(self):
        return feature_dict['check_in_period'].size()

if __name__ == '__main__':
    path = "./videoSample"
    test_ratio = 0.1
    is_test = False
    trainer = ReaderData(path, test_ratio, is_test)
    print trainer.user_id_len()
    a = trainer.get_user_location_dict()
    for no, rcd in enumerate(trainer.reader_creator()()):
        print no, rcd
        if no > 10: break

The code that trains the model is as follows:

#!/usr/bin/python
#encoding=utf8

import paddle.v2 as paddle
import cPickle
import copy
from paddle.v2.dataset.video import feature_dict, ReaderData 

dataset_train = ReaderData("./videoSample", 0.1, False)

def get_usr_combined_features():
    uid = paddle.layer.data(
        name='user_id',
        type=paddle.data_type.integer_value(
            dataset_train.user_id_len()))
    usr_emb = paddle.layer.embedding(input=uid, size=32)
    usr_fc = paddle.layer.fc(input=usr_emb, size=32)

    time_period = paddle.layer.data(
        name='time_period',
        type=paddle.data_type.integer_value(24))
    time_period_emb = paddle.layer.embedding(input=time_period, size=16)
    time_period_fc = paddle.layer.fc(input=time_period_emb, size=16)

    usr_location = paddle.layer.data(
        name='user_location',
        type=paddle.data_type.sparse_binary_vector(
            len(dataset_train.get_user_location_dict())))
    usr_location_fc = paddle.layer.fc(input=usr_location, size=32)

    usr_combined_features = paddle.layer.fc(
        input=[usr_fc, time_period_fc, usr_location_fc],
        size=200,
        act=paddle.activation.Tanh())
    return usr_combined_features


def get_content_combined_features():
    content_word_dict = dataset_train.get_content_word_dict()
    content_id = paddle.layer.data(
        name='content_id',
        type=paddle.data_type.integer_value(
            dataset_train.content_id_len()))
    content_emb = paddle.layer.embedding(input=content_id, size=32)
    content_fc = paddle.layer.fc(input=content_emb, size=32)

    content_categories = paddle.layer.data(
        name='category_id',
        type=paddle.data_type.integer_value(
            dataset_train.category_id_len()))
    content_categories_emb = paddle.layer.embedding(input=content_categories, size=16)
    content_categories_fc = paddle.layer.fc(input=content_categories, size=16)

    content_title_id = paddle.layer.data(
        name='title',
        type=paddle.data_type.integer_value_sequence(len(content_word_dict)))
    content_title_emb = paddle.layer.embedding(input=content_title_id, size=32)
    content_title_conv = paddle.networks.sequence_conv_pool(
        input=content_title_emb, hidden_size=32, context_len=2)

    content_brief_id = paddle.layer.data(
        name='brief',
        type=paddle.data_type.integer_value_sequence(len(content_word_dict)))
    content_brief_emb = paddle.layer.embedding(input=content_brief_id, size=32)
    content_brief_conv = paddle.networks.sequence_conv_pool(
        input=content_brief_emb, hidden_size=32, context_len=2)

    check_in_period = paddle.layer.data(
        name='check_in_period',
        type=paddle.data_type.integer_value(
            dataset_train.check_in_period_len()))
    check_in_period_emb = paddle.layer.embedding(input=check_in_period, size=32)
    check_in_period_fc = paddle.layer.fc(input=check_in_period, size=32)

    content_combined_features = paddle.layer.fc(
        input=[content_fc, content_categories_fc, content_title_conv, content_brief_conv, check_in_period_fc],
        size=200,
        act=paddle.activation.Tanh())
    return content_combined_features


def main():
    paddle.init(use_gpu=False)
    usr_combined_features = get_usr_combined_features()
    content_combined_features = get_content_combined_features()
    inference = paddle.layer.cos_sim(
        a=usr_combined_features, b=content_combined_features, size=1, scale=5)
    cost = paddle.layer.mse_cost(
        input=inference,
        label=paddle.layer.data(
            name='read_time', type=paddle.data_type.dense_vector(1)))

    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=1e-4))
    feeding = {
        'user_id': 0,
        'time_period': 1,
        'user_location': 2,
        'content_id': 3,
        'category_id': 4,
        'title': 5,
        'brief': 6,
        'check_in_period': 7,
        'read_time': 8
    }

    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                print "Pass %d Batch %d Cost %.2f" % (
                    event.pass_id, event.batch_id, event.cost)

    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(
                dataset_train.reader_creator(), buf_size=8192),
            batch_size=256),
        event_handler=event_handler,
        feeding=feeding,
        num_passes=1)

    user_id = "123a"
    content_id = 20419555
    time_period = 16
    user_location = "上海市|上海市"
    cate_id = "1001"
    title = "白鹿原 白嘉轩 娶 的 第七任 老婆 仙草 洞房花烛 夜 白嘉轩 跑 了"
    brief = ""
    check_in_period = 3600

    user_id_code = feature_dict['user_id'].gen(user_id)
    user_location_code = [feature_dict['user_location'].gen(ul) for ul in user_location.split('|')]
    content_id_code = feature_dict['content_id'].gen(content_id)
    cate_id_code = feature_dict['cate_id'].gen(cate_id)
    title_code = [feature_dict['word'].gen(w.lower()) for w in title.split(' ')]
    brief_code = [feature_dict['word'].gen(w.lower()) for w in brief.split(' ')]
    check_in_period_code = feature_dict['check_in_period'].gen(int(check_in_period))

    print [user_id, content_id, time_period, user_location, cate_id, title, brief, check_in_period]
    feature = [user_id_code, int(time_period), user_location_code, content_id_code, cate_id_code, title_code, brief_code, check_in_period_code]
    print feature

    infer_dict = copy.copy(feeding)
    del infer_dict['read_time']

    prediction = paddle.infer(
        output_layer=inference,
        parameters=parameters,
        input=[feature],
        feeding=infer_dict)
    print prediction


if __name__ == '__main__':
    main()

kuke (Contributor) commented Jun 13, 2017

Could you please format your issue more readably? That way others can read your code more easily and offer useful help.

@xlhlhlx xlhlhlx closed this as completed Jun 14, 2017
@xlhlhlx xlhlhlx reopened this Jun 14, 2017
Yancey1989 (Contributor) commented:

The first parameter of paddle.reader.shuffle must be a callable (a function that returns an iterable), so you can define the reader as:

class ReaderData(object):
    ...
    def reader_creator(self):
        def reader():
            ...
        return reader

And pass it to trainer.train as:

trainer.train(
    reader=paddle.batch(
        paddle.reader.shuffle(
            dataset_train.reader_creator(), buf_size=8192),
        batch_size=256),
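
For reference, a minimal sketch (plain Python, independent of Paddle) of the distinction behind the original TypeError: paddle.reader.shuffle expects a callable that returns a fresh iterable each time it is called, not the generator object you get from calling it.

def reader_creator():
    def reader():
        for i in range(3):
            yield [i]
    return reader

r = reader_creator()       # a callable; fine to pass to paddle.reader.shuffle
gen = reader_creator()()   # a generator object; passing this instead is what
                           # raises TypeError: 'generator' object is not callable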

xlhlhlx (Author) commented Jun 14, 2017

After changing the code as above, the previous problem is solved, but a different error is now raised, and the traceback does not point to any line of my own code. Could you please help me locate it? The code above is already the latest version; the error message is shown below:
(screenshot of the error traceback)

xlhlhlx (Author) commented Jun 14, 2017

A sample of the generated training data looks like this:
[14, 10, [20, 24], 14, 2, [146, 45, 110, 111, 112, 147, 148, 149, 85, 150, 151, 83, 152, 153, 83, 154, 155], [12], 14, [121539.0]]
[13, 4, [22, 23], 13, 8, [138, 139, 140, 141, 142, 143, 144, 85, 145, 83], [12], 13, [4349.0]]
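
For context, each position in these records must line up with the feeding dict used in the training code above; a sketch of that correspondence, annotated with values from the first sample:

feeding = {
    'user_id': 0,          # 14
    'time_period': 1,      # 10
    'user_location': 2,    # [20, 24]         (sparse_binary_vector input)
    'content_id': 3,       # 14
    'category_id': 4,      # 2
    'title': 5,            # [146, 45, ...]   (integer_value_sequence input)
    'brief': 6,            # [12]
    'check_in_period': 7,  # 14
    'read_time': 8         # [121539.0]       (dense_vector(1) label)
}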

lcy-seso (Contributor) commented Jun 14, 2017

There is a bug here:

content_categories = paddle.layer.data(
    name='category_id',
    type=paddle.data_type.integer_value(
        dataset_train.category_id_len()))
content_categories_emb = paddle.layer.embedding(input=content_categories, size=16)
content_categories_fc = paddle.layer.fc(input=content_categories, size=16)

The input to content_categories_fc = paddle.layer.fc(input=content_categories, size=16) is wrong.
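
Presumably the fully connected layer should consume the embedding, matching the pattern used in the other branches; a sketch of the assumed fix:

content_categories_emb = paddle.layer.embedding(input=content_categories, size=16)
# the fc layer should take the embedding, not the raw integer data layer:
content_categories_fc = paddle.layer.fc(input=content_categories_emb, size=16)

The check_in_period_fc line at the end of get_content_combined_features appears to follow the same pattern: its fc also takes the raw data layer rather than check_in_period_emb.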

xlhlhlx (Author) commented Jun 14, 2017

Thanks, fixed.

@xlhlhlx xlhlhlx closed this as completed Jun 14, 2017