diff --git a/.travis.yml b/.travis.yml index 0562c7ccec..5ae159e8ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,6 +30,7 @@ addons: - unzip - gcc-4.8 - g++-4.8 + - python-pip before_install: - export TRAVIS=scripts/travis diff --git a/include/dmlc/data.h b/include/dmlc/data.h index 53fd32e678..135907e44f 100644 --- a/include/dmlc/data.h +++ b/include/dmlc/data.h @@ -73,6 +73,8 @@ class Row { const real_t *label; /*! \brief weight of the instance */ const real_t *weight; + /*! \brief session-id of the instance */ + const uint64_t *qid; /*! \brief length of the sparse vector */ size_t length; /*! @@ -123,6 +125,13 @@ class Row { inline real_t get_weight() const { return weight == NULL ? 1.0f : *weight; } + /*! + * \return the qid of the instance, this function is always + * safe even when qid == NULL (0 is returned in that case) + */ + inline uint64_t get_qid() const { + return qid == NULL ? 0 : *qid; + } /*! * \brief helper function to compute dot product of current * \param weight the dense array of weight we want to product @@ -167,6 +176,8 @@ struct RowBlock { const real_t *label; /*! \brief With weight: array[size] label of each instance, otherwise nullptr */ const real_t *weight; + /*! \brief With qid: array[size] session id of each instance, otherwise nullptr */ + const uint64_t *qid; /*! \brief field id*/ const IndexType *field; /*! 
\brief feature index */ @@ -183,6 +194,7 @@ struct RowBlock { inline size_t MemCostBytes(void) const { size_t cost = size * (sizeof(size_t) + sizeof(real_t)); if (weight != NULL) cost += size * sizeof(real_t); + if (qid != NULL) cost += size * sizeof(uint64_t); size_t ndata = offset[size] - offset[0]; if (field != NULL) cost += ndata * sizeof(IndexType); if (index != NULL) cost += ndata * sizeof(IndexType); @@ -205,6 +217,11 @@ struct RowBlock { } else { ret.weight = NULL; } + if (qid != NULL) { + ret.qid = qid + begin; + } else { + ret.qid = NULL; + } ret.offset = offset + begin; ret.field = field; ret.index = index; @@ -345,6 +362,11 @@ RowBlock::operator[](size_t rowid) const { } else { inst.weight = NULL; } + if (qid != NULL) { + inst.qid = qid + rowid; + } else { + inst.qid = NULL; + } inst.length = offset[rowid + 1] - offset[rowid]; if (field != NULL) { inst.field = field + offset[rowid]; diff --git a/scripts/travis/travis_osx_install.sh b/scripts/travis/travis_osx_install.sh index 8c449c8436..fb6514d888 100755 --- a/scripts/travis/travis_osx_install.sh +++ b/scripts/travis/travis_osx_install.sh @@ -5,3 +5,4 @@ if [ ${TRAVIS_OS_NAME} != "osx" ]; then fi brew update +sudo easy_install pip diff --git a/src/data/libsvm_parser.h b/src/data/libsvm_parser.h index cdf9cfb910..8924101dd4 100644 --- a/src/data/libsvm_parser.h +++ b/src/data/libsvm_parser.h @@ -63,8 +63,17 @@ ParseBlock(char *begin, out->offset.push_back(out->index.size()); } out->label.push_back(label); - // parse feature[:value] + // parse the optional qid:id token that may follow the label + uint64_t qid; p = q; + while (p != end && *p == ' ') ++p; + if (p != lend && (strncmp(p, "qid:", 4) == 0)) { + p += 4; + qid = atoll(p); + out->qid.push_back(qid); + while (p != lend && *p >= '0' && *p <= '9') ++p; + } + // parse feature[:value] while (p != lend) { IndexType featureId; real_t value; diff --git a/src/data/row_block.h b/src/data/row_block.h index 57f779e5b7..a8c8532d55 100644 --- a/src/data/row_block.h +++ b/src/data/row_block.h @@ -31,6 +31,8 @@ struct RowBlockContainer { std::vector 
label; /*! \brief array[size] weight of each instance */ std::vector weight; + /*! \brief array[size] session-id of each instance */ + std::vector qid; /*! \brief field index */ std::vector field; /*! \brief feature index */ @@ -61,7 +63,7 @@ struct RowBlockContainer { /*! \brief clear the container */ inline void Clear(void) { offset.clear(); offset.push_back(0); - label.clear(); field.clear(); index.clear(); value.clear(); weight.clear(); + label.clear(); field.clear(); index.clear(); value.clear(); weight.clear(); qid.clear(); max_field = 0; max_index = 0; } @@ -74,6 +76,7 @@ struct RowBlockContainer { return offset.size() * sizeof(size_t) + label.size() * sizeof(real_t) + weight.size() * sizeof(real_t) + + qid.size() * sizeof(uint64_t) + field.size() * sizeof(IndexType) + index.size() * sizeof(IndexType) + value.size() * sizeof(real_t); @@ -87,6 +90,7 @@ struct RowBlockContainer { inline void Push(Row row) { label.push_back(row.get_label()); weight.push_back(row.get_weight()); + qid.push_back(row.get_qid()); if (row.field != NULL) { for (size_t i = 0; i < row.length; ++i) { CHECK_LE(row.field[i], std::numeric_limits::max()) @@ -124,6 +128,9 @@ struct RowBlockContainer { if (batch.weight != NULL) { weight.insert(weight.end(), batch.weight, batch.weight + batch.size); } + if (batch.qid != NULL) { + qid.insert(qid.end(), batch.qid, batch.qid + batch.size); + } size_t ndata = batch.offset[batch.size] - batch.offset[0]; if (batch.field != NULL) { field.resize(field.size() + ndata); @@ -173,6 +180,7 @@ RowBlockContainer::GetBlock(void) const { data.offset = BeginPtr(offset); data.label = BeginPtr(label); data.weight = BeginPtr(weight); + data.qid = BeginPtr(qid); data.field = BeginPtr(field); data.index = BeginPtr(index); data.value = BeginPtr(value); @@ -184,6 +192,7 @@ RowBlockContainer::Save(Stream *fo) const { fo->Write(offset); fo->Write(label); fo->Write(weight); + fo->Write(qid); fo->Write(field); fo->Write(index); fo->Write(value); @@ -196,6 +205,7 @@ 
RowBlockContainer::Load(Stream *fi) { if (!fi->Read(&offset)) return false; CHECK(fi->Read(&label)) << "Bad RowBlock format"; CHECK(fi->Read(&weight)) << "Bad RowBlock format"; + CHECK(fi->Read(&qid)) << "Bad RowBlock format"; CHECK(fi->Read(&field)) << "Bad RowBlock format"; CHECK(fi->Read(&index)) << "Bad RowBlock format"; CHECK(fi->Read(&value)) << "Bad RowBlock format";