Machine Learning Library
CDataset.h
Go to the documentation of this file.
1 /*
2  COPYRIGHT (C) 2003 APPLIED NEUROINFORMATIC GROUP - UNIVERSITY OF BIELEFELD.
3 
4  ALL RIGHTS RESERVED.
5 
6  REDISTRIBUTION AND USE IN SOURCE AND BINARY FORM, WITH OR WITHOUT
7  MODIFICATION, REQUIRE THE PERMISSION OF THE COPYRIGHT HOLDERS.
8 
9  COMMERCIAL USE WITHOUT THE EXPLICIT PERMISSION OF THE COPYRIGHT HOLDERS
10  IS FORBIDDEN
11 
12  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
13  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
14  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
15  ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
16  ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
17  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
18  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
19  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
20  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
21  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
22 */
23 
24 
25 
26 #ifndef CDATASET_H
27 #define CDATASET_H
28 
29 #include "CDatasetItem.h"
30 #include "CDenseVector.h"
31 #include "CException.h"
32 #include "CMetric.h"
33 #include "Macros.h"
34 #include <vector>
35 #include <deque>
36 #include <algorithm>
37 #include <ctime>
38 #include <fstream>
39 #include <iostream>
40 #include <cstdio>
41 #include <stdlib.h>
42 #include <cassert>
43 #include <cstring>
44 #include <iomanip>
45 #include <string>
46 #include "zlib.h"
47 #include "gzstream.h"
48 #include "CObject.h"
49 #include <ext/algorithm>
50 #include <cstdlib>
51 #include <ctime>
52 using __gnu_cxx::random_sample;
53 
54 
55 using namespace std;
56 
57 
58 
59 
60 
/// Random index generator, used as the generator argument of random_shuffle.
class RandomNumber {
public:
    /// Seeds the C library generator from the current time.
    RandomNumber() {
        srand(time(0));
    }

    /// Returns a pseudo-random integer in the half-open range [0, n).
    int operator() ( int n) {
        // Scale in double precision. In float, rand() values close to RAND_MAX
        // round up to RAND_MAX+1, which made the result equal to n (one past
        // the valid range).
        return static_cast<int>( n * (rand() / (RAND_MAX + 1.0)) );
    }
};
71 
109 template<class Type>
110 class CDataset : public CObject<Type>
111 
112 {
113 
114 protected:
115 #ifdef DS_IS_A_DEQUE
116  deque< CDatasetItem<Type> > tItems;
117 #else
118  vector< CDatasetItem<Type> > tItems;
119 #endif
120  string sType;
121  string sInfo;
123 
124  void *pMetaData;
125 
126 
127 
128 public:
129 
131 
138  CDataset(int iInitialSize=1000, int iGrowSize=1000);
139 
141 
144  CDataset(const CDataset<Type>& rtDataset);
145 
146 
148 
151  ~CDataset();
152 
153  bool bProtect;
154 
156 
159  CDataset<Type>& operator=(const CDataset<Type>& rtDataset);
160 
161 
162 
164 
167  void initRandomSeed();
168 
169 
171 
176  bool read(istream& istr);
177 
178 
179 
180 
181 
182  virtual string className() const { return string("CDataset");};
183 
184 
186 
191  bool read(const char* pcPath);
192 
193 
195 
200  bool write(const char* pcPath);
201 
202 
204 
209  bool write(ostream& ostr);
210 
211 
212  bool serialize(fstream& stream, IO_MODE mode = READ);
213  bool serialize2(CArchiv& tA);
215 
219  void setInfo(const string& str);
220 
221 
223 
227  string getInfo(void);
228 
229 
231  void reserve(int iSize);
232 
233 
235  void setGrowSize(int iGrowSize);
236 
237 
239 
244  CDataset<Type> split(double dFraction);
245 
246 
248 
253  CDataset<Type> subset(int iNbItems) const;
254 
255 
257 
262  vector<CDataset<Type> > createFolds(int iN) const ;
263 
265 
274  CDataset<Type> extract(int iIndexStart, int iIndexEnd, bool bDelete);
275 
276 
278 
282  void merge(const CDataset<Type>& rtDataset);
283 
288  inline const CDatasetItem<Type>& getItem(int iIndex) const;
289 
290 
292 
296  inline const CDatasetItem<Type>& item(int iIndex) const;
297 
298 
303  inline const CDatasetItem<Type>& getRandomItem(void) const;
304 
305 
307 
311  inline CDatasetItem<Type>& randomItem(void);
312 
313 
315 
319  CDatasetItem<Type>& getLast() ;
320 
321 
323 
327  inline CDatasetItem<Type>& operator[](int iIndex);
328 
330 
334  inline const CDatasetItem<Type>& operator()(int iIndex) const;
335 
339  int getBestMatchIDInput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric) const;
340 
341 
342 
349  int bestMatchIDInput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric) const;
350 
351  int bestMatchIDInput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric,
352  const int iStart, const int iEnd) const;
353 
354  int bestMatchIDInput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric,
355  const int iStart, const int iEnd, const int iOmit) const;
356 
357  int bestMatchIDInput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric,
358  vector<int>::iterator itBegin, vector<int>::iterator itEnd) const;
359 
360 
364  int getBestMatchIDOutput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric) const ;
365 
372  int bestMatchIDOutput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric) const ;
373 
377  void getBestMatchInput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric, int& riBestID, Type& rtBestDist) const ;
385  void bestMatchInput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric, int& riBestID, Type& rtBestDist) const ;
386 
387 
391  void getBestMatchOutput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric, int& riBestID, Type& rtBestDist) const ;
392 
400  void bestMatchOutput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric, int& riBestID, Type& rtBestDist) const ;
401 
402 
404 
409  int setItem(int iIndex, const CDatasetItem<Type>& rtItem);
410 
411 
413 
418  int insertItem(int iIndex, const CDatasetItem<Type>& rtItem);
419 
428  void getDataFromVector(Type* ptData, int iNbItems, int iInputDim, int iOutputDim);
429 
435  void setDataToVector(Type* ptData);
436 
437 
438 
440 
444  CMatrix<Type> getInputMatrix() const ;
445 
446 
448 
452  CMatrix<Type> getOutputMatrix() const ;
453 
454 
456 
460  int appendItem(const CDatasetItem<Type>& rtItem);
461 
462 
464 
469  int removeItem(int iIndex);
470 
471 
472 
474 
477  void removeLastItem();
478 
479 
481 
484  void sortdata() {sort(tItems.begin(),tItems.end());};
485 
486 
487 
489 
492  void sortdata(bool (*func)(CDatasetItem<Type>, CDatasetItem<Type>)) {sort(tItems.begin(),tItems.end(),func);};
493 
494 
495 
496 
498 
501  void randomShuffle();
502 
503 
504 
506 
509  void clear();
510 
511 
513 
517  inline int items() const { return (int)tItems.size();};
518 
519 
521 
525  inline int inputDimension() const;
526 
527 
528 
530 
534  inline int outputDimension() const;
535 
536 
538 
542  inline void* metaData() { return pMetaData; }
543 
544 
546  //
547  inline void setMetaData(void* pTheData) { pMetaData = pTheData; }
548 
549 
550 };
551 
552 
553 
554 template<class Type>
555 CDataset<Type>::CDataset(int iInitialSize, int iGrowSize) : CObject<Type>()
556 
557 {
558 #ifndef DS_IS_A_DEQUE
559  tItems.reserve(iInitialSize);
560 #endif
561  this->iGrowSize=iGrowSize;
562  sType =string("CDataset 0.1");
563  sInfo=string("none");
564  bProtect = false;
565  pMetaData = NULL;
566 }
567 
568 
569 
570 template<class Type>
572 
573 {
574  tItems=rtDataset.tItems;
575  iGrowSize=rtDataset.iGrowSize;
576  sType=rtDataset.sType;
577  sInfo=rtDataset.sInfo;
578  bProtect = false;
579  pMetaData = rtDataset.pMetaData;
580 }
581 
582 
583 template<class Type>
585 {
586  tItems.clear();
587 }
588 
589 
590 template<class Type>
592 {
593  srand(time(NULL));
594 }
595 
596 template<class Type>
598 {
599  tItems.clear();
600 }
601 
602 
603 
604 template<class Type>
606 {
607 
608  if(this == &rtDataset)
609  return *this;
610  this->clear();
611 #ifndef DS_IS_A_DEQUE
612  this->tItems.reserve(rtDataset.items());
613 #endif
614  this->tItems=rtDataset.tItems;
615  return *this;
616 }
617 
618 
619 template<class Type>
620 bool CDataset<Type>::read(const char* pcPath){
621  igzstream stream;
622  ifstream ist(pcPath, ios::in);
623 
624  if(!ist.good())
625  return false;
626  else
627  ist.close();
628  stream.open(pcPath, ios::in);
629 
630  if(!stream)
631  return false;
632  read(stream);
633  stream.close();
634  return true;
635 
636 }
637 
638 
639 template<class Type>
640 bool CDataset<Type>::write(const char* pcPath){
641  ogzstream stream;
642  stream.open(pcPath, ios::out);
643 
644  if(!stream) return false;
645  write(stream);
646  stream.close();
647  return true;
648 
649 }
650 
651 
652 template<class Type>
653 bool CDataset<Type>::read(istream& istr){
654 
655  char* pcBuff;
656  float fBuff;
657  int iBuff;
658  int iNbItems;
659  int iInputDim;
660  int iOutputDim;
661 
662  istr.read((char*) &iBuff, sizeof(int));
663 
664  pcBuff=new char[iBuff];
665  istr.read((char*) pcBuff, iBuff*sizeof(char));
666 
667  if(this->sType.compare(pcBuff)){
668  cerr << "Wrong file type!" << endl;
669  delete[] pcBuff;
670  return false;
671  }
672  delete[] pcBuff;
673 
674  istr.read((char*) &iNbItems, sizeof(int));
675  istr.read((char*) &this->iGrowSize, sizeof(int));
676  istr.read((char*) &iInputDim, sizeof(int));
677  istr.read((char*) &iOutputDim, sizeof(int));
678 
679  istr.read((char*) &iBuff, sizeof(int));
680  pcBuff=new char[iBuff];
681  istr.read((char*) pcBuff, iBuff*sizeof(char));
682  this->sInfo=pcBuff;
683  delete[] pcBuff;
684 
685  CDenseVector<Type> inpVec(iInputDim);
686  CDenseVector<Type> outVec(iOutputDim);
687 
688  this->tItems.clear();
689 #ifndef DS_IS_A_DEQUE
690  this->tItems.reserve(iNbItems);
691 #endif
692  for(int i=0;i<iNbItems;i++){
693 
694  for(int j=0;j<iInputDim;j++){
695  istr.read((char*) &fBuff, sizeof(float));
696  inpVec.setElement(j,static_cast<Type>(fBuff));;
697  }
698 
699  for(int j=0;j<iOutputDim;j++){
700  istr.read((char*) &fBuff, sizeof(float));
701  outVec.setElement(j,static_cast<Type>(fBuff));
702  }
703 
704  this->appendItem(CDatasetItem<Type>(inpVec, outVec, i));
705  }
706  return true;
707 }
708 
709 template<class Type>
710 bool CDataset<Type>::write(ostream& ostr){
711 
712  float fBuff;
713  int iBuff;
714  int iNbItems =this->items();
715  int iInputDim =this->inputDimension();
716  int iOutputDim=this->outputDimension();
717 
718  iBuff=sType.length()+1;
719  ostr.write((const char*)&iBuff,sizeof(int));
720  ostr.write((const char*)sType.c_str(),iBuff*sizeof(char));
721  ostr.write((const char*)&iNbItems,sizeof(iNbItems));
722  ostr.write((const char*)&this->iGrowSize,sizeof(this->iGrowSize));
723  ostr.write((const char*)&iInputDim,sizeof(iInputDim));
724  ostr.write((const char*)&iOutputDim,sizeof(iOutputDim));
725  iBuff=sInfo.length()+1;
726  ostr.write((const char*)&iBuff,sizeof(int));
727  ostr.write((const char*)sInfo.c_str(),iBuff*sizeof(char));
728 
729  for(int i=0;i<iNbItems;i++){
730  for(int j=0;j<iInputDim;j++){
731  fBuff=static_cast<float>(this->getItem(i).getInputComponent(j));
732  ostr.write((const char*)&fBuff,sizeof(fBuff));
733  }
734  for(int j=0;j<iOutputDim;j++){
735  fBuff=static_cast<float>(this->getItem(i).getOutputComponent(j));
736  ostr.write((const char*)&fBuff,sizeof(fBuff));
737  }
738  }
739  return true;
740 }
741 
742 template<class Type>
743 bool CDataset<Type>::serialize(fstream& stream, IO_MODE mode) {
744  if(mode == READ) {
745  int iSize;
746  stream.read( (char*)&iSize, sizeof(int));
747 #ifndef DS_IS_A_DEQUE
748  tItems.reserve(iSize);
749 #endif
750  stream.read( (char*)&iSize, sizeof(int));
751  char* pcBuffer = new char[iSize+1];
752  memset(pcBuffer,'\0', iSize+1);
753  stream.read( (char*)pcBuffer, iSize*sizeof(char));
754  sInfo = string(pcBuffer);
755  delete [] pcBuffer;
756  CDenseVector<Type> tIn;
757  CDenseVector<Type> tOut;
758  for(int i=0;i<iSize ;i++) {
759  tIn.serialize(stream, mode);
760  tOut.serialize(stream, mode);
761  tItems.push_back( CDatasetItem<Type>(tIn, tOut) );
762  }
763  } else {
764  int iSize = this->items();
765  stream.write( (char*)&iSize, sizeof(int));
766  iSize = sInfo.length();
767  stream.write( (char*)&iSize, sizeof(int));
768  stream.write( (char*)sInfo.c_str(), iSize*sizeof(char));
769 
770  for(int i=0;i<iSize ;i++) {
771  tItems[i].getInputVector().serialize(stream, mode);
772  tItems[i].getOutputVector().serialize(stream, mode);
773  }
774  }
775  return false;
776 }
777 
// Read or write the items through a CArchiv (serialize2, per the in-class
// declaration "bool serialize2(CArchiv& tA)").
//
// NOTE(review): this listing lost the definition header (line 779) and the
// statement on line 783 that guards the first "return false;" below. Both
// must be restored from the original header before this compiles; the guard
// condition cannot be recovered from what is visible here.
778 template<class Type>
780  string str;
781  unsigned int uiItems;
782 
784  return false;
// Reading: replace the contents with uiItems items taken from the archive.
785  if( tA.isReading()) {
786  tItems.clear();
787  tA >> uiItems;
788  tItems.resize(uiItems);
789  for(int i=0;i<(int)uiItems ;i++)
790  tA >> tItems[i];
791 
792  return true;
793  } else {
// Writing: item count first, then every item.
// NOTE(review): the count is written as tItems.size() (size_t) but read back
// into an unsigned int; whether CArchiv treats the two the same is not
// visible here -- confirm.
794  tA << tItems.size(); tA.flush();
// Only "tA << tItems[i];" is the loop body; the flush runs once after the loop.
795  for(int i=0;i<(int)tItems.size() ;i++)
796  tA << tItems[i]; tA.flush();
797  return true;
798  }
// Not reachable: both branches above return.
799  return false;
800 }
801 
802 
803 
804 template<class Type>
805 void CDataset<Type>::setInfo(const string& str){
806  this->sInfo=str;
807 }
808 
809 template<class Type>
811  return this->sInfo;
812 }
813 
814 
815 template<class Type>
816 void CDataset<Type>::reserve(int iSize){
817 
818 #ifdef SAFE
819  assert(iSize>=0);
820 #endif
821 #ifndef DS_IS_A_DEQUE
822  tItems.reserve(MAX((int)iSize, (int)tItems.size()));
823 #endif
824 }
825 
826 template<class Type>
827 void CDataset<Type>::setGrowSize(int iGrowSize){
828 
829 #ifdef SAFE
830  assert(iGrowSize>0);
831 #endif
832 
833  this->iGrowSize=iGrowSize;
834 }
835 
836 
837 
838 template<class Type>
839 inline
841 {
842 #ifdef SAFE
843  assert(iIndex >= 0);
844  assert((int)iIndex < (int)tItems.size());
845 #endif
846  return (tItems[iIndex]);
847 }
848 
849 template<class Type>
850 inline
851 const
853 {
854 #ifdef SAFE
855  assert(iIndex >= 0);
856  assert((int)iIndex < (int)tItems.size());
857 #endif
858  return (tItems[iIndex]);
859 }
860 
861 template<class Type>
862 inline
864 {
865 #ifdef SAFE
866  assert(iIndex >= 0);
867  assert(iIndex < (int)tItems.size());
868 #endif
869  return (tItems[iIndex]);
870 }
871 
872 template<class Type>
873 inline
874 const CDatasetItem<Type>& CDataset<Type>::item(int iIndex) const
875 {
876 #ifdef SAFE
877  assert(iIndex >= 0);
878  assert(iIndex < (int)tItems.size());
879 #endif
880  return (tItems[iIndex]);
881 }
882 
883 
884 template<class Type>
885 inline
887 {
888 #ifdef SAFE
889 #endif
890  int i = IRAND(0,(int)tItems.size());
891  return(tItems[i]);
892 }
893 
894 template<class Type>
895 inline
897 {
898 #ifdef SAFE
899 #endif
900  int i = IRAND(0,(int)tItems.size());
901  return(tItems[i]);
902 }
903 
904 
905 template<class Type>
906 vector<CDataset<Type> > CDataset<Type>::createFolds(int iN) const {
907  vector< CDataset<Type> > tVec(iN);
908 
909  for(int i=0;i<iN;i++)
910  tVec.reserve(this->items());
911  for(int i=0;i<items();i++)
912  tVec[i%iN].appendItem(getItem(i));
913  return tVec;
914 }
915 
916 template<class Type>
917 inline
919 {
920  return (tItems[tItems.size()-1]);
921 }
922 
923 template<class Type>
924 inline
925 int CDataset<Type>::getBestMatchIDInput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric) const{
926  Type tMinDist = rCMetric.distance(tItems[0].getInputVector(),rtVector);
927  int iBestId = 0;
928  int iSize = static_cast<int>(tItems.size());
929 
930  for(int i=1;i<iSize;i++){
931  Type tDist = rCMetric.distance(tItems[i].getInputVector(),rtVector);
932  if(tDist<tMinDist){
933  tMinDist=tDist;
934  iBestId=i;
935  }
936  }
937  return iBestId;
938 }
939 
940 template<class Type>
941 inline
942 int CDataset<Type>::bestMatchIDInput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric) const{
943  Type tMinDist = rCMetric.distance(tItems[0].getInputVector(),rtVector);
944  int iBestId = 0;
945  int iSize = static_cast<int>(tItems.size());
946 
947  for(int i=1;i<iSize;i++){
948  Type tDist = rCMetric.distance(tItems[i].getInputVector(),rtVector);
949  if(tDist<tMinDist){
950  tMinDist=tDist;
951  iBestId=i;
952  }
953  }
954  return iBestId;
955 }
956 
957 template<class Type>
958 inline
959 int CDataset<Type>::bestMatchIDInput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric,
960  const int iStart, const int iEnd) const{
961  Type tMinDist = rCMetric.distance(tItems[iStart].getInputVector(),rtVector);
962  int iBestId = iStart;
963 
964  for(int i=iStart+1; i <= iEnd; i++){
965  Type tDist = rCMetric.distance(tItems[i].getInputVector(),rtVector);
966  if(tDist<tMinDist){
967  tMinDist=tDist;
968  iBestId=i;
969  }
970  }
971  return iBestId;
972 }
973 
974 template<class Type>
975 inline
976 int CDataset<Type>::bestMatchIDInput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric,
977  const int iStart, const int iEnd, const int iOmit) const{
978  Type tMinDist = rCMetric.distance(tItems[iStart].getInputVector(),rtVector);
979 
980  int iBestId;
981 
982  if (iOmit == iStart) {
983  iBestId = iStart+1;
984  } else {
985  iBestId = iStart;
986  }
987 
988  for(int i=iStart+1; i <= iEnd; i++) {
989 
990  if (i != iOmit) {
991  Type tDist = rCMetric.distance(tItems[i].getInputVector(),rtVector);
992  if(tDist<tMinDist){
993  tMinDist=tDist;
994  iBestId=i;
995  }
996  }
997  }
998  return iBestId;
999 }
1000 
1001 template<class Type>
1002 inline
1003 int CDataset<Type>::bestMatchIDInput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric,
1004  vector<int>::iterator itBegin, vector<int>::iterator itEnd) const {
1005 
1006  Type tMinDist = rCMetric.distance(tItems[*itBegin].getInputVector(),rtVector);
1007 
1008  int iBestId = *itBegin;
1009  vector<int>::iterator it = itBegin + 1;
1010 
1011  for(; it < itEnd; it++) {
1012  Type tDist = rCMetric.distance(tItems[*it].getInputVector(),rtVector);
1013  if(tDist<tMinDist){
1014  tMinDist=tDist;
1015  iBestId=*it;
1016  }
1017  }
1018  return iBestId;
1019 }
1020 
1021 template<class Type>
1022 inline
1023 int CDataset<Type>::getBestMatchIDOutput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric)const{
1024  Type tMinDist = rCMetric.distance(tItems[0].getOutputVector(),rtVector);
1025  int iBestId = 0;
1026  int iSize = static_cast<int>(tItems.size());
1027 
1028  for(int i=1;i<iSize;i++){
1029  Type tDist = rCMetric.distance(tItems[i].getOutputVector(),rtVector);
1030  if(tDist<tMinDist){
1031  tMinDist=tDist;
1032  iBestId=i;
1033  }
1034  }
1035  return iBestId;
1036 }
1037 template<class Type>
1038 inline
1039 int CDataset<Type>::bestMatchIDOutput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric)const{
1040  Type tMinDist = rCMetric.distance(tItems[0].getOutputVector(),rtVector);
1041  int iBestId = 0;
1042  int iSize = static_cast<int>(tItems.size());
1043 
1044  for(int i=1;i<iSize;i++){
1045  Type tDist = rCMetric.distance(tItems[i].getOutputVector(),rtVector);
1046  if(tDist<tMinDist){
1047  tMinDist=tDist;
1048  iBestId=i;
1049  }
1050  }
1051  return iBestId;
1052 }
1053 
1054 
1055 
1056 template<class Type>
1057 inline
1058 void CDataset<Type>::getBestMatchInput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric, int& riBestID, Type& rtBestDist)const{
1059  int iSize = static_cast<int>(tItems.size());
1060  riBestID = 0;
1061  rtBestDist = rCMetric.distance(tItems[0].getInputVector(),rtVector);
1062 
1063  for(int i=1;i<iSize;i++){
1064  Type tDist = rCMetric.distance(tItems[i].getInputVector(),rtVector);
1065  if(tDist<rtBestDist){
1066  rtBestDist=tDist;
1067  riBestID=i;
1068  }
1069  }
1070 }
1071 
1072 template<class Type>
1073 inline
1074 void CDataset<Type>::bestMatchInput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric, int& riBestID, Type& rtBestDist)const{
1075  int iSize = static_cast<int>(tItems.size());
1076  riBestID = 0;
1077  rtBestDist = rCMetric.distance(tItems[0].getInputVector(),rtVector);
1078 
1079  for(int i=1;i<iSize;i++){
1080  Type tDist = rCMetric.distance(tItems[i].getInputVector(),rtVector);
1081  if(tDist<rtBestDist){
1082  rtBestDist=tDist;
1083  riBestID=i;
1084  }
1085  }
1086 }
1087 
1088 template<class Type>
1089 inline
1090 void CDataset<Type>::getBestMatchOutput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric, int& riBestID, Type& rtBestDist)const{
1091  int iSize = static_cast<int>(tItems.size());
1092  riBestID = 0;
1093  rtBestDist = rCMetric.distance(tItems[0].getOutputVector(),rtVector);
1094 
1095  for(int i=1;i<iSize;i++){
1096  Type tDist = rCMetric.distance(tItems[i].getOutputVector(),rtVector);
1097  if(tDist<rtBestDist){
1098  rtBestDist=tDist;
1099  riBestID=i;
1100  }
1101  }
1102 }
1103 
1104 template<class Type>
1105 inline
1106 void CDataset<Type>::bestMatchOutput(const CVector<Type>& rtVector, const CMetric<Type>& rCMetric, int& riBestID, Type& rtBestDist)const{
1107  int iSize = static_cast<int>(tItems.size());
1108  riBestID = 0;
1109  rtBestDist = rCMetric.distance(tItems[0].getOutputVector(),rtVector);
1110 
1111  for(int i=1;i<iSize;i++){
1112  Type tDist = rCMetric.distance(tItems[i].getOutputVector(),rtVector);
1113  if(tDist<rtBestDist){
1114  rtBestDist=tDist;
1115  riBestID=i;
1116  }
1117  }
1118 }
1119 
1129 template<class Type>
1131  const CDatasetItem<Type>& tItem)
1132 {
1133 #ifdef SAFE
1134  assert(iIndex >= 0);
1135  assert(iIndex < (int)tItems.size());
1136  assert((int)tItems.size() == 0 || tItems[0].inputDimension() == tItem.inputDimension());
1137  assert((int)tItems.size() == 0 || tItems[0].outputDimension() == tItem.outputDimension());
1138 #endif
1139  if(iIndex < 0 || iIndex >= (int)tItems.size())
1140  return 0;
1141 
1142  tItems[iIndex] = (tItem);
1143  return 1;
1144 }
1145 
1146 
1157 template<class Type>
1159  const CDatasetItem<Type>& tItem)
1160 {
1161 #ifdef SAFE
1162  assert(iIndex >= 0);
1163  assert(iIndex < (int)tItems.size());
1164  assert((int)tItems.size() == 0 || tItems[0].inputDimension() == tItem.inputDimension());
1165  assert((int)tItems.size() == 0 || tItems[0].outputDimension() == tItem.outputDimension());
1166 #endif
1167  if(iIndex < 0 || iIndex >= (int)tItems.size())
1168  return 0;
1169 #ifndef DS_IS_A_DEQUE
1170  if(tItems.capacity() == tItems.size())
1171  tItems.reserve((int)tItems.size() + iGrowSize);
1172 #endif
1173  tItems.insert(tItems.begin()+iIndex, tItem);
1174  return 1;
1175 }
1176 
1177 template<class Type>
1178 void CDataset<Type>::getDataFromVector(Type* ptData, int iNbItems, int iInputDim, int iOutputDim){
1179 
1180 #ifdef SAFE
1181  assert(iNbItems>=0);
1182  assert(iInputDim>=0);
1183  assert(iOutputDim>=0);
1184 #endif
1185 
1186  this->clear();
1187 
1188  for(int i=0;i<iNbItems;i++) {
1189  this->appendItem(CDatasetItem<Type>(CDenseVector<Type>(iInputDim,(ptData+i*(iInputDim+iOutputDim))),
1190  CDenseVector<Type>(iOutputDim,(ptData+i*(iInputDim+iOutputDim)+iInputDim)),
1191  i+1));
1192  }
1193 
1194 }
1195 
1196 template<class Type>
1198 
1199  for(int i=0;i<this->items();i++){
1200 
1201  for(int j=0;j<this->inputDimension();j++){
1202  ptData[i*(this->inputDimension()+this->outputDimension())+j]=this->getItem(i).getInputComponent(j);
1203  }
1204 
1205  for(int j=0;j<this->outputDimension();j++){
1206  ptData[i*(this->inputDimension()+this->outputDimension())+this->inputDimension()+j]=this->getItem(i).getOutputComponent(j);
1207  }
1208 
1209  }
1210 
1211 }
1212 
1213 
1214 
1215 template<class Type>
1217 {
1218 #ifdef SAFE
1219  assert((int)tItems.size() == 0 || tItems[0].inputDimension() == tItem.inputDimension());
1220  assert((int)tItems.size() == 0 || tItems[0].outputDimension() == tItem.outputDimension());
1221 #endif
1222 #ifndef DS_IS_A_DEQUE
1223  if(tItems.capacity() == tItems.size())
1224  tItems.reserve(tItems.size() + iGrowSize);
1225 #endif
1226  tItems.push_back(tItem);
1227  return 1;
1228 }
1229 
1230 
1231 template<class Type>
1233 {
1234 #ifdef SAFE
1235  assert(iIndex >= 0);
1236  assert(iIndex < (int)tItems.size());
1237 #endif
1238  if(iIndex < 0 || iIndex >=this->items())
1239  return 0;
1240 
1241  tItems.erase(tItems.begin()+iIndex);
1242  return 1;
1243 }
1244 
1245 
1246 
1247 template<class Type>
1249 {
1250 
1251  if((items() > 0))
1252  tItems.erase(tItems.begin()+items()-1);
1253 }
1254 
1255 
1256 template<class Type>
1258 {
1259  RandomNumber rnd;
1260  random_shuffle(tItems.begin(), tItems.end(), rnd);
1261 }
1262 
1263 
1264 
1265 
1266 template<class Type>
1268  int iIndexEnd,
1269  bool bDelete = true)
1270 {
1271 #ifdef SAFE
1272  assert(iIndexStart >= 0);
1273  assert(iIndexStart < (int)tItems.size());
1274  assert(iIndexEnd >= 0);
1275  assert(iIndexEnd < (int)tItems.size());
1276  assert(iIndexStart < iIndexEnd);
1277 #endif
1278  CDataset<Type> tNewSet;
1279 #ifndef DS_IS_A_DEQUE
1280  tNewSet.reserve(iIndexEnd-iIndexStart) ;
1281 #endif
1282  for(int i=iIndexStart;i<=iIndexEnd;i++) {
1283  tNewSet.appendItem( this->getItem(i) );
1284  }
1285  if(bDelete) {
1286  if(iIndexEnd != (items()-1))
1287  tItems.erase(this->tItems.begin()+iIndexStart, this->tItems.begin()+iIndexEnd);
1288  else
1289  tItems.resize(iIndexStart);
1290  }
1291  return tNewSet;
1292 }
1293 
1294 template<class Type>
1296 {
1297  CDataset<Type> tNewSet( this->items() );
1298  int iNumSelect = (int)floor(this->items() * (dFraction/100));
1299  for(int i=0;i<iNumSelect;i++)
1300  {
1301  tNewSet.appendItem( this->getItem(this->items() - 1 - i) );
1302  }
1303  tItems.erase(this->tItems.end()-iNumSelect, this->tItems.end());
1304  return tNewSet;
1305 }
1306 
1307 template<class Type>
1309 #ifdef SAFE
1310  if(iNbItems<0){throw CException("iNbItems<0");}
1311  if(this->items()<=0){throw CException("this->items()<=0");}
1312 
1313 #endif
1314  ML_LEQ_CHK(iNbItems, tItems.size());
1315  vector<CDatasetItem<Type> > tTmp(iNbItems);
1316 
1317  random_sample(tItems.begin(), tItems.end(), tTmp.begin(), tTmp.end());
1318 
1319  CDataset<Type> tSubset(iNbItems);
1320  for(int i=0;i<iNbItems;i++){
1321  tSubset.appendItem(tTmp[i]);
1322  }
1323  return tSubset;
1324 
1325 }
1326 
1327 
1328 template<class Type>
1329 void CDataset<Type>::merge(const CDataset<Type>& rtDataset){
1330  int iNbItems=rtDataset.items();
1331 #ifndef DS_IS_A_DEQUE
1332  this->reserve(this->items() + iNbItems);
1333 #endif
1334  for(int i=0;i<iNbItems;i++){
1335  this->appendItem(rtDataset.getItem(i));
1336  }
1337 }
1338 
1339 template<class Type>
1341 {
1342  if((int)tItems.size() == 0) return 0;
1343  else return tItems[0].inputDimension();
1344 }
1345 
1346 template<class Type>
1348 {
1349  if(tItems.size()==0) return 0;
1350  else return tItems[0].outputDimension();
1351 }
1352 
1353 
1354 
1355 template<class Type>
1357 /*-----------------------------------------------------*/
1358 #ifdef SAVE
1359  ML_GEQ_CHK(this->items(), 1);
1360  ML_GEQ_CHK(this->inputDimension(), 1);
1361 #endif
1362 /*-----------------------------------------------------*/
1363 
1364  CMatrix<Type> tData(this->items(), this->inputDimension());
1365 
1366  for(int i=0; i<this->items();i++) {
1367  tData.setRow(i, tItems[i].getInputVector() );
1368  }
1369 
1370  return tData;
1371 }
1372 
1373 
1374 
1375 template<class Type>
1377 /*-----------------------------------------------------*/
1378 #ifdef SAVE
1379  ML_GEQ_CHK(this->items(), 1);
1380  ML_GEQ_CHK(this->inputDimension(), 1);
1381 #endif
1382 /*-----------------------------------------------------*/
1383 
1384  CMatrix<Type> tData(this->items(), this->inputDimension());
1385 
1386  for(int i=0; i<this->items();i++) {
1387  tData.setRow(i, tItems[i].getOutputVector() );
1388  }
1389  return tData;
1390 }
1391 
1392 
1393 
1394 
1395 
1396 
1397 
1398 
1399 
1400 
1401 
1402 #endif
void getBestMatchOutput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, int &riBestID, Type &rtBestDist) const
Definition: CDataset.h:1090
virtual bool isReading() const
Definition: CArchiv.h:25
bool bProtect
Definition: CDataset.h:153
Definition: gzstream.h:103
CDataset< Type > extract(int iIndexStart, int iIndexEnd, bool bDelete)
Copy or move a range of items to a new dataset.
Definition: CDataset.h:1267
void merge(const CDataset< Type > &rtDataset)
Merge a second dataset.
Definition: CDataset.h:1329
Definition: gzstream.h:92
int setItem(int iIndex, const CDatasetItem< Type > &rtItem)
Set item iIndex.
Definition: CDataset.h:1130
int outputDimension() const
Definition: CDatasetItem.h:259
#define MAX(x, y)
Definition: Macros.h:19
#define IRAND(x, y)
Definition: Macros.h:80
int insertItem(int iIndex, const CDatasetItem< Type > &rtItem)
Insert item.
Definition: CDataset.h:1158
CDatasetItem< Type > & randomItem(void)
Return the reference on a random item.
Definition: CDataset.h:896
void setGrowSize(int iGrowSize)
Set size increment.
Definition: CDataset.h:827
CDataset< Type > & operator=(const CDataset< Type > &rtDataset)
Assignment operator.
Definition: CDataset.h:605
IO_MODE
Definition: CObject.h:38
void * pMetaData
Definition: CDataset.h:124
virtual void close()
CDataset< Type > split(double dFraction)
Split dataset in two disjoint subsets.
Definition: CDataset.h:1295
string sInfo
Definition: CDataset.h:121
const CDatasetItem< Type > & operator()(int iIndex) const
Return a const reference to item iIndex.
Definition: CDataset.h:852
Base class for metric objects.
Definition: CMetric.h:36
Template object implementing a matrix of single and double precision elements.
Definition: CDenseVector.h:38
void clear()
Delete all items.
Definition: CDataset.h:597
int iGrowSize
Definition: CDataset.h:122
string sType
Definition: CDataset.h:120
bool serialize(fstream &stream, IO_MODE mode=READ)
Read/write from binary stream.
Definition: CDataset.h:743
void randomShuffle()
Random shuffle of items.
Definition: CDataset.h:1257
CDatasetItem< Type > & getLast()
Return reference to the last item.
Definition: CDataset.h:918
Definition: CDataset.h:61
int appendItem(const CDatasetItem< Type > &rtItem)
Append item to the end of the dataset.
Definition: CDataset.h:1216
const CDatasetItem< Type > & item(int iIndex) const
Return a const reference to item iIndex.
Definition: CDataset.h:874
void setElement(int iIndex, Type tValue)
RandomNumber()
Definition: CDataset.h:63
void initRandomSeed()
Init random seed.
Definition: CDataset.h:591
void open(const char *name, int open_mode=std::ios::out)
Definition: gzstream.h:109
void * metaData()
Return pointer to meta data object.
Definition: CDataset.h:542
CDataset(int iInitialSize=1000, int iGrowSize=1000)
Constructor.
Definition: CDataset.h:555
void reserve(int iSize)
Reserve space for iSize items.
Definition: CDataset.h:816
int inputDimension() const
Return dimension of input vectors.
Definition: CDataset.h:1340
void sortdata(bool(*func)(CDatasetItem< Type >, CDatasetItem< Type >))
Sort data using a generic function as less operator.
Definition: CDataset.h:492
int getBestMatchIDOutput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric) const
Definition: CDataset.h:1023
Definition: CObject.h:38
int inputDimension() const
Definition: CDatasetItem.h:253
void setRow(int iRow, const CDenseVector< Type > &tVec)
int outputDimension() const
Return dimension of output vectors.
Definition: CDataset.h:1347
const CDatasetItem< Type > & getRandomItem(void) const
Definition: CDataset.h:886
const CDatasetItem< Type > & getItem(int iIndex) const
Definition: CDataset.h:863
Single item of a dataset consisting of a pair of input and output vectors.
Definition: CDatasetItem.h:43
void setDataToVector(Type *ptData)
Definition: CDataset.h:1197
bool write(const char *pcPath)
Write dataset to binary and compressed ml file.
Definition: CDataset.h:640
string getInfo(void)
Return info text.
Definition: CDataset.h:810
virtual Type distance(const CVector< Type > &rtVec1, const CVector< Type > &rtVec2) const
Definition: CMetric.h:53
void getDataFromVector(Type *ptData, int iNbItems, int iInputDim, int iOutputDim)
Definition: CDataset.h:1178
int bestMatchIDInput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric) const
Definition: CDataset.h:942
CMatrix< Type > getInputMatrix() const
Get input data as matrix.
Definition: CDataset.h:1356
bool read(istream &istr)
Read data set from a binary stream.
Definition: CDataset.h:653
CDatasetItem< Type > & operator[](int iIndex)
Return reference on item iIndex.
Definition: CDataset.h:840
Base class of object serialization.
Definition: CArchiv.h:19
void bestMatchOutput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, int &riBestID, Type &rtBestDist) const
Definition: CDataset.h:1106
Definition: CException.h:40
Template object for vectors of single and double precision and integer.
Definition: CDenseVector.h:37
#define ML_LEQ_CHK(var, val)
Definition: Macros.h:113
bool serialize(fstream &stream, IO_MODE mode=READ)
Read/write from binary stream.
CMatrix< Type > getOutputMatrix() const
Get output data as matrix.
Definition: CDataset.h:1376
vector< CDataset< Type > > createFolds(int iN) const
Create n disjoint folds.
Definition: CDataset.h:906
templatized vector for numerical applications
Definition: CMatrix.h:39
void removeLastItem()
Remove the last item.
Definition: CDataset.h:1248
int bestMatchIDOutput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric) const
Definition: CDataset.h:1039
bool serialize2(CArchiv &tA)
Definition: CDataset.h:779
Base class for all objects.
Definition: CObject.h:51
void getBestMatchInput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, int &riBestID, Type &rtBestDist) const
Definition: CDataset.h:1058
#define ML_GEQ_CHK(var, val)
Definition: Macros.h:119
void setInfo(const string &str)
Set info text.
Definition: CDataset.h:805
Manages pairs of input and output vectors.
Definition: CDataset.h:110
int removeItem(int iIndex)
Remove the item iIndex.
Definition: CDataset.h:1232
void sortdata()
Sort data.
Definition: CDataset.h:484
void open(const char *name, int open_mode=std::ios::in)
Definition: gzstream.h:98
~CDataset()
Destructor.
Definition: CDataset.h:584
int items() const
Return number of items in the dataset.
Definition: CDataset.h:517
CDataset< Type > subset(int iNbItems) const
Copy i random items to a new dataset.
Definition: CDataset.h:1308
int getBestMatchIDInput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric) const
Definition: CDataset.h:925
void setMetaData(void *pTheData)
Sets a meta data pointer.
Definition: CDataset.h:547
void bestMatchInput(const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, int &riBestID, Type &rtBestDist) const
Definition: CDataset.h:1074
virtual string className() const
Returns the class name.
Definition: CDataset.h:182
virtual void flush()
Definition: CArchiv.h:26
vector< CDatasetItem< Type > > tItems
Definition: CDataset.h:118