Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add qid as ranklib format #317

Merged
merged 14 commits into from
Jan 9, 2018
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ addons:
- unzip
- gcc-4.8
- g++-4.8
- python-pip

before_install:
- export TRAVIS=scripts/travis
Expand Down
22 changes: 22 additions & 0 deletions include/dmlc/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ class Row {
const real_t *label;
/*! \brief weight of the instance */
const real_t *weight;
/*! \brief session-id of the instance */
const size_t *qid;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use int32_t, as size_t is not portable across platforms

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If integer overflow is a concern, use uint64_t instead.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, I have changed it to uint64_t.

/*! \brief length of the sparse vector */
size_t length;
/*!
Expand Down Expand Up @@ -123,6 +125,13 @@ class Row {
inline real_t get_weight() const {
  // no weight attached to this instance: fall back to the unit weight
  if (weight == NULL) {
    return 1.0f;
  }
  return *weight;
}
/*!
 * \return the qid stored for this instance, or 0 when no qid
 *  is attached (qid == NULL); calling this is therefore always safe
 */
inline size_t get_qid() const {
  // a missing qid pointer maps to the default id 0
  if (qid == NULL) {
    return 0;
  }
  return *qid;
}
/*!
* \brief helper function to compute dot product of current
* \param weight the dense array of weight we want to product
Expand Down Expand Up @@ -167,6 +176,8 @@ struct RowBlock {
const real_t *label;
/*! \brief With weight: array[size] weight of each instance, otherwise nullptr */
const real_t *weight;
/*! \brief With qid: array[size] session id of each instance, otherwise nullptr */
const size_t *qid;
/*! \brief field id*/
const IndexType *field;
/*! \brief feature index */
Expand All @@ -183,6 +194,7 @@ struct RowBlock {
inline size_t MemCostBytes(void) const {
size_t cost = size * (sizeof(size_t) + sizeof(real_t));
if (weight != NULL) cost += size * sizeof(real_t);
if (qid != NULL) cost += size * sizeof(size_t);
size_t ndata = offset[size] - offset[0];
if (field != NULL) cost += ndata * sizeof(IndexType);
if (index != NULL) cost += ndata * sizeof(IndexType);
Expand All @@ -205,6 +217,11 @@ struct RowBlock {
} else {
ret.weight = NULL;
}
if (qid != NULL) {
ret.qid = qid + begin;
} else {
ret.qid = NULL;
}
ret.offset = offset + begin;
ret.field = field;
ret.index = index;
Expand Down Expand Up @@ -345,6 +362,11 @@ RowBlock<IndexType>::operator[](size_t rowid) const {
} else {
inst.weight = NULL;
}
if (qid != NULL) {
inst.qid = qid + rowid;
} else {
inst.qid = NULL;
}
inst.length = offset[rowid + 1] - offset[rowid];
if (field != NULL) {
inst.field = field + offset[rowid];
Expand Down
1 change: 1 addition & 0 deletions scripts/travis/travis_osx_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ if [ ${TRAVIS_OS_NAME} != "osx" ]; then
fi

brew update
sudo easy_install pip
11 changes: 10 additions & 1 deletion src/data/libsvm_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,17 @@ ParseBlock(char *begin,
out->offset.push_back(out->index.size());
}
out->label.push_back(label);
// parse feature[:value]
// parse qid:id
size_t qid;
p = q;
while (p != end && *p == ' ') ++p;
if (p != lend && (strncmp(p, "qid:", 4) == 0)) {
p += 4;
qid = atol(p);
out->qid.push_back(qid);
p = q;
}
// parse feature[:value]
while (p != lend) {
IndexType featureId;
real_t value;
Expand Down
12 changes: 11 additions & 1 deletion src/data/row_block.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ struct RowBlockContainer {
std::vector<real_t> label;
/*! \brief array[size] weight of each instance */
std::vector<real_t> weight;
/*! \brief array[size] session-id of each instance */
std::vector<size_t> qid;
/*! \brief field index */
std::vector<IndexType> field;
/*! \brief feature index */
Expand Down Expand Up @@ -61,7 +63,7 @@ struct RowBlockContainer {
/*! \brief reset the container: drop all stored rows and reset the max trackers */
inline void Clear(void) {
  // offset is emptied and then re-seeded with a single 0 entry
  offset.clear();
  offset.push_back(0);
  // per-instance arrays
  label.clear();
  weight.clear();
  qid.clear();
  // per-entry arrays
  field.clear();
  index.clear();
  value.clear();
  // running maxima start over from zero
  max_field = 0;
  max_index = 0;
}
Expand All @@ -74,6 +76,7 @@ struct RowBlockContainer {
return offset.size() * sizeof(size_t) +
label.size() * sizeof(real_t) +
weight.size() * sizeof(real_t) +
qid.size() * sizeof(size_t) +
field.size() * sizeof(IndexType) +
index.size() * sizeof(IndexType) +
value.size() * sizeof(real_t);
Expand All @@ -87,6 +90,7 @@ struct RowBlockContainer {
inline void Push(Row<I> row) {
label.push_back(row.get_label());
weight.push_back(row.get_weight());
qid.push_back(row.get_qid());
if (row.field != NULL) {
for (size_t i = 0; i < row.length; ++i) {
CHECK_LE(row.field[i], std::numeric_limits<IndexType>::max())
Expand Down Expand Up @@ -124,6 +128,9 @@ struct RowBlockContainer {
if (batch.weight != NULL) {
weight.insert(weight.end(), batch.weight, batch.weight + batch.size);
}
if (batch.qid != NULL) {
qid.insert(qid.end(), batch.qid, batch.qid + batch.size);
}
size_t ndata = batch.offset[batch.size] - batch.offset[0];
if (batch.field != NULL) {
field.resize(field.size() + ndata);
Expand Down Expand Up @@ -173,6 +180,7 @@ RowBlockContainer<IndexType>::GetBlock(void) const {
data.offset = BeginPtr(offset);
data.label = BeginPtr(label);
data.weight = BeginPtr(weight);
data.qid = BeginPtr(qid);
data.field = BeginPtr(field);
data.index = BeginPtr(index);
data.value = BeginPtr(value);
Expand All @@ -184,6 +192,7 @@ RowBlockContainer<IndexType>::Save(Stream *fo) const {
fo->Write(offset);
fo->Write(label);
fo->Write(weight);
fo->Write(qid);
fo->Write(field);
fo->Write(index);
fo->Write(value);
Expand All @@ -196,6 +205,7 @@ RowBlockContainer<IndexType>::Load(Stream *fi) {
if (!fi->Read(&offset)) return false;
CHECK(fi->Read(&label)) << "Bad RowBlock format";
CHECK(fi->Read(&weight)) << "Bad RowBlock format";
CHECK(fi->Read(&qid)) << "Bad RowBlock format";
CHECK(fi->Read(&field)) << "Bad RowBlock format";
CHECK(fi->Read(&index)) << "Bad RowBlock format";
CHECK(fi->Read(&value)) << "Bad RowBlock format";
Expand Down