49 #include <ext/algorithm>
52 using __gnu_cxx::random_sample;
67 int operator() (
int n) {
68 return static_cast<int>(
static_cast<float>(n) * rand()/(RAND_MAX+1.0) );
116 deque< CDatasetItem<Type> > tItems;
138 CDataset(
int iInitialSize=1000,
int iGrowSize=1000);
167 void initRandomSeed();
176 bool read(istream& istr);
182 virtual string className()
const {
return string(
"CDataset");};
191 bool read(
const char* pcPath);
200 bool write(
const char* pcPath);
209 bool write(ostream& ostr);
212 bool serialize(fstream& stream,
IO_MODE mode =
READ);
219 void setInfo(
const string& str);
227 string getInfo(
void);
231 void reserve(
int iSize);
235 void setGrowSize(
int iGrowSize);
262 vector<CDataset<Type> > createFolds(
int iN)
const ;
274 CDataset<Type> extract(
int iIndexStart,
int iIndexEnd,
bool bDelete);
352 const int iStart,
const int iEnd)
const;
355 const int iStart,
const int iEnd,
const int iOmit)
const;
358 vector<int>::iterator itBegin, vector<int>::iterator itEnd)
const;
391 void getBestMatchOutput(
const CVector<Type>& rtVector,
const CMetric<Type>& rCMetric,
int& riBestID, Type& rtBestDist)
const ;
428 void getDataFromVector(Type* ptData,
int iNbItems,
int iInputDim,
int iOutputDim);
435 void setDataToVector(Type* ptData);
469 int removeItem(
int iIndex);
477 void removeLastItem();
484 void sortdata() {sort(tItems.begin(),tItems.end());};
501 void randomShuffle();
517 inline int items()
const {
return (
int)tItems.size();};
525 inline int inputDimension()
const;
534 inline int outputDimension()
const;
558 #ifndef DS_IS_A_DEQUE
559 tItems.reserve(iInitialSize);
562 sType =string(
"CDataset 0.1");
563 sInfo=string(
"none");
608 if(
this == &rtDataset)
611 #ifndef DS_IS_A_DEQUE
612 this->tItems.reserve(rtDataset.
items());
614 this->tItems=rtDataset.
tItems;
622 ifstream ist(pcPath, ios::in);
628 stream.
open(pcPath, ios::in);
642 stream.
open(pcPath, ios::out);
644 if(!stream)
return false;
662 istr.read((
char*) &iBuff,
sizeof(
int));
664 pcBuff=
new char[iBuff];
665 istr.read((
char*) pcBuff, iBuff*
sizeof(
char));
667 if(this->sType.compare(pcBuff)){
668 cerr <<
"Wrong file type!" << endl;
674 istr.read((
char*) &iNbItems,
sizeof(
int));
675 istr.read((
char*) &this->iGrowSize,
sizeof(
int));
676 istr.read((
char*) &iInputDim,
sizeof(
int));
677 istr.read((
char*) &iOutputDim,
sizeof(
int));
679 istr.read((
char*) &iBuff,
sizeof(
int));
680 pcBuff=
new char[iBuff];
681 istr.read((
char*) pcBuff, iBuff*
sizeof(
char));
688 this->tItems.clear();
689 #ifndef DS_IS_A_DEQUE
690 this->tItems.reserve(iNbItems);
692 for(
int i=0;i<iNbItems;i++){
694 for(
int j=0;j<iInputDim;j++){
695 istr.read((
char*) &fBuff,
sizeof(
float));
696 inpVec.
setElement(j,static_cast<Type>(fBuff));;
699 for(
int j=0;j<iOutputDim;j++){
700 istr.read((
char*) &fBuff,
sizeof(
float));
701 outVec.
setElement(j,static_cast<Type>(fBuff));
714 int iNbItems =this->items();
715 int iInputDim =this->inputDimension();
716 int iOutputDim=this->outputDimension();
718 iBuff=sType.length()+1;
719 ostr.write((
const char*)&iBuff,
sizeof(
int));
720 ostr.write((
const char*)sType.c_str(),iBuff*
sizeof(char));
721 ostr.write((
const char*)&iNbItems,
sizeof(iNbItems));
722 ostr.write((
const char*)&this->iGrowSize,
sizeof(this->iGrowSize));
723 ostr.write((
const char*)&iInputDim,
sizeof(iInputDim));
724 ostr.write((
const char*)&iOutputDim,
sizeof(iOutputDim));
725 iBuff=sInfo.length()+1;
726 ostr.write((
const char*)&iBuff,
sizeof(
int));
727 ostr.write((
const char*)sInfo.c_str(),iBuff*
sizeof(char));
729 for(
int i=0;i<iNbItems;i++){
730 for(
int j=0;j<iInputDim;j++){
731 fBuff=
static_cast<float>(this->getItem(i).getInputComponent(j));
732 ostr.write((
const char*)&fBuff,
sizeof(fBuff));
734 for(
int j=0;j<iOutputDim;j++){
735 fBuff=
static_cast<float>(this->getItem(i).getOutputComponent(j));
736 ostr.write((
const char*)&fBuff,
sizeof(fBuff));
746 stream.read( (
char*)&iSize,
sizeof(
int));
747 #ifndef DS_IS_A_DEQUE
748 tItems.reserve(iSize);
750 stream.read( (
char*)&iSize,
sizeof(
int));
751 char* pcBuffer =
new char[iSize+1];
752 memset(pcBuffer,
'\0', iSize+1);
753 stream.read( (
char*)pcBuffer, iSize*
sizeof(
char));
754 sInfo = string(pcBuffer);
758 for(
int i=0;i<iSize ;i++) {
764 int iSize = this->items();
765 stream.write( (
char*)&iSize,
sizeof(
int));
766 iSize = sInfo.length();
767 stream.write( (
char*)&iSize,
sizeof(
int));
768 stream.write( (
char*)sInfo.c_str(), iSize*
sizeof(char));
770 for(
int i=0;i<iSize ;i++) {
771 tItems[i].getInputVector().serialize(stream, mode);
772 tItems[i].getOutputVector().serialize(stream, mode);
781 unsigned int uiItems;
788 tItems.resize(uiItems);
789 for(
int i=0;i<(int)uiItems ;i++)
794 tA << tItems.size(); tA.
flush();
795 for(
int i=0;i<(int)tItems.size() ;i++)
796 tA << tItems[i]; tA.
flush();
821 #ifndef DS_IS_A_DEQUE
822 tItems.reserve(
MAX((
int)iSize, (
int)tItems.size()));
833 this->iGrowSize=iGrowSize;
844 assert((
int)iIndex < (
int)tItems.size());
846 return (tItems[iIndex]);
856 assert((
int)iIndex < (
int)tItems.size());
858 return (tItems[iIndex]);
867 assert(iIndex < (
int)tItems.size());
869 return (tItems[iIndex]);
878 assert(iIndex < (
int)tItems.size());
880 return (tItems[iIndex]);
890 int i =
IRAND(0,(
int)tItems.size());
900 int i =
IRAND(0,(
int)tItems.size());
907 vector< CDataset<Type> > tVec(iN);
909 for(
int i=0;i<iN;i++)
910 tVec.reserve(this->items());
911 for(
int i=0;i<items();i++)
912 tVec[i%iN].appendItem(getItem(i));
920 return (tItems[tItems.size()-1]);
926 Type tMinDist = rCMetric.
distance(tItems[0].getInputVector(),rtVector);
928 int iSize =
static_cast<int>(tItems.size());
930 for(
int i=1;i<iSize;i++){
931 Type tDist = rCMetric.
distance(tItems[i].getInputVector(),rtVector);
943 Type tMinDist = rCMetric.
distance(tItems[0].getInputVector(),rtVector);
945 int iSize =
static_cast<int>(tItems.size());
947 for(
int i=1;i<iSize;i++){
948 Type tDist = rCMetric.
distance(tItems[i].getInputVector(),rtVector);
960 const int iStart,
const int iEnd)
const{
961 Type tMinDist = rCMetric.
distance(tItems[iStart].getInputVector(),rtVector);
962 int iBestId = iStart;
964 for(
int i=iStart+1; i <= iEnd; i++){
965 Type tDist = rCMetric.
distance(tItems[i].getInputVector(),rtVector);
977 const int iStart,
const int iEnd,
const int iOmit)
const{
978 Type tMinDist = rCMetric.
distance(tItems[iStart].getInputVector(),rtVector);
982 if (iOmit == iStart) {
988 for(
int i=iStart+1; i <= iEnd; i++) {
991 Type tDist = rCMetric.
distance(tItems[i].getInputVector(),rtVector);
1001 template<
class Type>
1004 vector<int>::iterator itBegin, vector<int>::iterator itEnd)
const {
1006 Type tMinDist = rCMetric.
distance(tItems[*itBegin].getInputVector(),rtVector);
1008 int iBestId = *itBegin;
1009 vector<int>::iterator it = itBegin + 1;
1011 for(; it < itEnd; it++) {
1012 Type tDist = rCMetric.
distance(tItems[*it].getInputVector(),rtVector);
1021 template<
class Type>
1024 Type tMinDist = rCMetric.
distance(tItems[0].getOutputVector(),rtVector);
1026 int iSize =
static_cast<int>(tItems.size());
1028 for(
int i=1;i<iSize;i++){
1029 Type tDist = rCMetric.
distance(tItems[i].getOutputVector(),rtVector);
1037 template<
class Type>
1040 Type tMinDist = rCMetric.
distance(tItems[0].getOutputVector(),rtVector);
1042 int iSize =
static_cast<int>(tItems.size());
1044 for(
int i=1;i<iSize;i++){
1045 Type tDist = rCMetric.
distance(tItems[i].getOutputVector(),rtVector);
1056 template<
class Type>
1059 int iSize =
static_cast<int>(tItems.size());
1061 rtBestDist = rCMetric.
distance(tItems[0].getInputVector(),rtVector);
1063 for(
int i=1;i<iSize;i++){
1064 Type tDist = rCMetric.
distance(tItems[i].getInputVector(),rtVector);
1065 if(tDist<rtBestDist){
1072 template<
class Type>
1075 int iSize =
static_cast<int>(tItems.size());
1077 rtBestDist = rCMetric.
distance(tItems[0].getInputVector(),rtVector);
1079 for(
int i=1;i<iSize;i++){
1080 Type tDist = rCMetric.
distance(tItems[i].getInputVector(),rtVector);
1081 if(tDist<rtBestDist){
1088 template<
class Type>
1091 int iSize =
static_cast<int>(tItems.size());
1093 rtBestDist = rCMetric.
distance(tItems[0].getOutputVector(),rtVector);
1095 for(
int i=1;i<iSize;i++){
1096 Type tDist = rCMetric.
distance(tItems[i].getOutputVector(),rtVector);
1097 if(tDist<rtBestDist){
1104 template<
class Type>
1107 int iSize =
static_cast<int>(tItems.size());
1109 rtBestDist = rCMetric.
distance(tItems[0].getOutputVector(),rtVector);
1111 for(
int i=1;i<iSize;i++){
1112 Type tDist = rCMetric.
distance(tItems[i].getOutputVector(),rtVector);
1113 if(tDist<rtBestDist){
1129 template<
class Type>
1134 assert(iIndex >= 0);
1135 assert(iIndex < (
int)tItems.size());
1136 assert((
int)tItems.size() == 0 || tItems[0].inputDimension() == tItem.
inputDimension());
1137 assert((
int)tItems.size() == 0 || tItems[0].outputDimension() == tItem.
outputDimension());
1139 if(iIndex < 0 || iIndex >= (
int)tItems.size())
1142 tItems[iIndex] = (tItem);
1157 template<
class Type>
1162 assert(iIndex >= 0);
1163 assert(iIndex < (
int)tItems.size());
1164 assert((
int)tItems.size() == 0 || tItems[0].inputDimension() == tItem.
inputDimension());
1165 assert((
int)tItems.size() == 0 || tItems[0].outputDimension() == tItem.
outputDimension());
1167 if(iIndex < 0 || iIndex >= (
int)tItems.size())
1169 #ifndef DS_IS_A_DEQUE
1170 if(tItems.capacity() == tItems.size())
1171 tItems.reserve((
int)tItems.size() + iGrowSize);
1173 tItems.insert(tItems.begin()+iIndex, tItem);
1177 template<
class Type>
1181 assert(iNbItems>=0);
1182 assert(iInputDim>=0);
1183 assert(iOutputDim>=0);
1188 for(
int i=0;i<iNbItems;i++) {
1196 template<
class Type>
1199 for(
int i=0;i<this->items();i++){
1201 for(
int j=0;j<this->inputDimension();j++){
1202 ptData[i*(this->inputDimension()+this->outputDimension())+j]=this->getItem(i).getInputComponent(j);
1205 for(
int j=0;j<this->outputDimension();j++){
1206 ptData[i*(this->inputDimension()+this->outputDimension())+this->inputDimension()+j]=this->getItem(i).getOutputComponent(j);
1215 template<
class Type>
1219 assert((
int)tItems.size() == 0 || tItems[0].inputDimension() == tItem.
inputDimension());
1220 assert((
int)tItems.size() == 0 || tItems[0].outputDimension() == tItem.
outputDimension());
1222 #ifndef DS_IS_A_DEQUE
1223 if(tItems.capacity() == tItems.size())
1224 tItems.reserve(tItems.size() + iGrowSize);
1226 tItems.push_back(tItem);
1231 template<
class Type>
1235 assert(iIndex >= 0);
1236 assert(iIndex < (
int)tItems.size());
1238 if(iIndex < 0 || iIndex >=this->items())
1241 tItems.erase(tItems.begin()+iIndex);
1247 template<
class Type>
1252 tItems.erase(tItems.begin()+items()-1);
1256 template<
class Type>
1260 random_shuffle(tItems.begin(), tItems.end(), rnd);
1266 template<
class Type>
1269 bool bDelete =
true)
1272 assert(iIndexStart >= 0);
1273 assert(iIndexStart < (
int)tItems.size());
1274 assert(iIndexEnd >= 0);
1275 assert(iIndexEnd < (
int)tItems.size());
1276 assert(iIndexStart < iIndexEnd);
1279 #ifndef DS_IS_A_DEQUE
1280 tNewSet.
reserve(iIndexEnd-iIndexStart) ;
1282 for(
int i=iIndexStart;i<=iIndexEnd;i++) {
1286 if(iIndexEnd != (items()-1))
1287 tItems.erase(this->tItems.begin()+iIndexStart, this->tItems.begin()+iIndexEnd);
1289 tItems.resize(iIndexStart);
1294 template<
class Type>
1298 int iNumSelect = (int)floor(this->items() * (dFraction/100));
1299 for(
int i=0;i<iNumSelect;i++)
1301 tNewSet.
appendItem( this->getItem(this->items() - 1 - i) );
1303 tItems.erase(this->tItems.end()-iNumSelect, this->tItems.end());
1307 template<
class Type>
1310 if(iNbItems<0){
throw CException(
"iNbItems<0");}
1311 if(this->items()<=0){
throw CException(
"this->items()<=0");}
1315 vector<CDatasetItem<Type> > tTmp(iNbItems);
1317 random_sample(tItems.begin(), tItems.end(), tTmp.begin(), tTmp.end());
1320 for(
int i=0;i<iNbItems;i++){
1321 tSubset.appendItem(tTmp[i]);
1328 template<
class Type>
1330 int iNbItems=rtDataset.
items();
1331 #ifndef DS_IS_A_DEQUE
1332 this->reserve(this->items() + iNbItems);
1334 for(
int i=0;i<iNbItems;i++){
1335 this->appendItem(rtDataset.
getItem(i));
1339 template<
class Type>
1342 if((
int)tItems.size() == 0)
return 0;
1343 else return tItems[0].inputDimension();
1346 template<
class Type>
1349 if(tItems.size()==0)
return 0;
1350 else return tItems[0].outputDimension();
1355 template<
class Type>
1366 for(
int i=0; i<this->items();i++) {
1367 tData.
setRow(i, tItems[i].getInputVector() );
1375 template<
class Type>
1386 for(
int i=0; i<this->items();i++) {
1387 tData.
setRow(i, tItems[i].getOutputVector() );
void getBestMatchOutput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, int &riBestID, Type &rtBestDist) const
Definition: CDataset.h:1090
virtual bool isReading() const
Definition: CArchiv.h:25
bool bProtect
Definition: CDataset.h:153
Definition: gzstream.h:103
CDataset< Type > extract(int iIndexStart, int iIndexEnd, bool bDelete)
Copy or move a range of items to a new dataset.
Definition: CDataset.h:1267
void merge(const CDataset< Type > &rtDataset)
Merge a second dataset.
Definition: CDataset.h:1329
Definition: gzstream.h:92
int setItem(int iIndex, const CDatasetItem< Type > &rtItem)
Set item iIndex.
Definition: CDataset.h:1130
int outputDimension() const
Definition: CDatasetItem.h:259
#define MAX(x, y)
Definition: Macros.h:19
#define IRAND(x, y)
Definition: Macros.h:80
int insertItem(int iIndex, const CDatasetItem< Type > &rtItem)
Insert item.
Definition: CDataset.h:1158
CDatasetItem< Type > & randomItem(void)
Return a reference to a random item.
Definition: CDataset.h:896
void setGrowSize(int iGrowSize)
Set size increment.
Definition: CDataset.h:827
CDataset< Type > & operator=(const CDataset< Type > &rtDataset)
Assignment operator.
Definition: CDataset.h:605
IO_MODE
Definition: CObject.h:38
void * pMetaData
Definition: CDataset.h:124
CDataset< Type > split(double dFraction)
Split dataset in two disjoint subsets.
Definition: CDataset.h:1295
string sInfo
Definition: CDataset.h:121
const CDatasetItem< Type > & operator()(int iIndex) const
Return a const reference to item iIndex.
Definition: CDataset.h:852
Base class for metric objects.
Definition: CMetric.h:36
Template object implementing a matrix of single and double precision elements.
Definition: CDenseVector.h:38
void clear()
Delete all items.
Definition: CDataset.h:597
int iGrowSize
Definition: CDataset.h:122
string sType
Definition: CDataset.h:120
bool serialize(fstream &stream, IO_MODE mode=READ)
Read/write from binary stream.
Definition: CDataset.h:743
void randomShuffle()
Random shuffle of items.
Definition: CDataset.h:1257
CDatasetItem< Type > & getLast()
Return reference to the last item.
Definition: CDataset.h:918
Definition: CDataset.h:61
int appendItem(const CDatasetItem< Type > &rtItem)
Append item to the end of the dataset.
Definition: CDataset.h:1216
const CDatasetItem< Type > & item(int iIndex) const
Return a const reference to item iIndex.
Definition: CDataset.h:874
void setElement(int iIndex, Type tValue)
RandomNumber()
Definition: CDataset.h:63
void initRandomSeed()
Init random seed.
Definition: CDataset.h:591
void open(const char *name, int open_mode=std::ios::out)
Definition: gzstream.h:109
void * metaData()
Return pointer to meta data object.
Definition: CDataset.h:542
CDataset(int iInitialSize=1000, int iGrowSize=1000)
Constructor.
Definition: CDataset.h:555
void reserve(int iSize)
Reserve space for iSize items.
Definition: CDataset.h:816
int inputDimension() const
Return dimension of input vectors.
Definition: CDataset.h:1340
void sortdata(bool(*func)(CDatasetItem< Type >, CDatasetItem< Type >))
Sort data using a generic function as less operator.
Definition: CDataset.h:492
int getBestMatchIDOutput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric) const
Definition: CDataset.h:1023
int inputDimension() const
Definition: CDatasetItem.h:253
void setRow(int iRow, const CDenseVector< Type > &tVec)
int outputDimension() const
Return dimension of output vectors.
Definition: CDataset.h:1347
const CDatasetItem< Type > & getRandomItem(void) const
Definition: CDataset.h:886
const CDatasetItem< Type > & getItem(int iIndex) const
Definition: CDataset.h:863
Single item of a dataset consisting of a pair of input and out vectors.
Definition: CDatasetItem.h:43
void setDataToVector(Type *ptData)
Definition: CDataset.h:1197
bool write(const char *pcPath)
Write dataset to binary and compressed ml file.
Definition: CDataset.h:640
string getInfo(void)
Return info text.
Definition: CDataset.h:810
virtual Type distance(const CVector< Type > &rtVec1, const CVector< Type > &rtVec2) const
Definition: CMetric.h:53
void getDataFromVector(Type *ptData, int iNbItems, int iInputDim, int iOutputDim)
Definition: CDataset.h:1178
int bestMatchIDInput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric) const
Definition: CDataset.h:942
CMatrix< Type > getInputMatrix() const
Get input data as matrix.
Definition: CDataset.h:1356
bool read(istream &istr)
Read data set from binary stream.
Definition: CDataset.h:653
CDatasetItem< Type > & operator[](int iIndex)
Return a reference to item iIndex.
Definition: CDataset.h:840
Base class of object serialization.
Definition: CArchiv.h:19
void bestMatchOutput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, int &riBestID, Type &rtBestDist) const
Definition: CDataset.h:1106
Definition: CException.h:40
Template object for vectors of single and double precision and integer.
Definition: CDenseVector.h:37
#define ML_LEQ_CHK(var, val)
Definition: Macros.h:113
bool serialize(fstream &stream, IO_MODE mode=READ)
Read/write from binary stream.
CMatrix< Type > getOutputMatrix() const
Get output data as matrix.
Definition: CDataset.h:1376
vector< CDataset< Type > > createFolds(int iN) const
Create n disjoint folds.
Definition: CDataset.h:906
templatized vector for numerical applications
Definition: CMatrix.h:39
void removeLastItem()
Remove the last item.
Definition: CDataset.h:1248
int bestMatchIDOutput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric) const
Definition: CDataset.h:1039
bool serialize2(CArchiv &tA)
Definition: CDataset.h:779
Base class for all objects.
Definition: CObject.h:51
void getBestMatchInput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, int &riBestID, Type &rtBestDist) const
Definition: CDataset.h:1058
#define ML_GEQ_CHK(var, val)
Definition: Macros.h:119
void setInfo(const string &str)
Set info text.
Definition: CDataset.h:805
Manages pairs of input and output vectors.
Definition: CDataset.h:110
int removeItem(int iIndex)
Remove the item iIndex.
Definition: CDataset.h:1232
void sortdata()
Sort data.
Definition: CDataset.h:484
void open(const char *name, int open_mode=std::ios::in)
Definition: gzstream.h:98
~CDataset()
Destructor.
Definition: CDataset.h:584
int items() const
Return number of items in the dataset.
Definition: CDataset.h:517
CDataset< Type > subset(int iNbItems) const
Copy iNbItems random items to a new dataset.
Definition: CDataset.h:1308
int getBestMatchIDInput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric) const
Definition: CDataset.h:925
void setMetaData(void *pTheData)
Sets a meta data pointer.
Definition: CDataset.h:547
void bestMatchInput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, int &riBestID, Type &rtBestDist) const
Definition: CDataset.h:1074
virtual string className() const
Returns the class name.
Definition: CDataset.h:182
virtual void flush()
Definition: CArchiv.h:26
vector< CDatasetItem< Type > > tItems
Definition: CDataset.h:118