Machine Learning Library
|
Manages pairs of input and output vectors. More...
#include <CDataset.h>
Public Member Functions | |
CDataset (int iInitialSize=1000, int iGrowSize=1000) | |
Constructor. More... | |
CDataset (const CDataset< Type > &rtDataset) | |
Copy constructor. More... | |
~CDataset () | |
Destructor. More... | |
CDataset< Type > & | operator= (const CDataset< Type > &rtDataset) |
Assignment operator. More... | |
void | initRandomSeed () |
Init random seed. More... | |
bool | read (istream &istr) |
Read data set from ascii stream. More... | |
virtual string | className () const |
Returns the class name. More... | |
bool | read (const char *pcPath) |
Read dataset from binary compressed *.ml file. More... | |
bool | write (const char *pcPath) |
Write dataset to binary and compressed ml file. More... | |
bool | write (ostream &ostr) |
Write dataset to ascii stream. More... | |
bool | serialize (fstream &stream, IO_MODE mode=READ) |
Read/write from binary stream. More... | |
bool | serialize2 (CArchiv &tA) |
void | setInfo (const string &str) |
Set info text. More... | |
string | getInfo (void) |
Return info text. More... | |
void | reserve (int iSize) |
Reserve space for iSize items. More... | |
void | setGrowSize (int iGrowSize) |
Set size increment. More... | |
CDataset< Type > | split (double dFraction) |
Split dataset in two disjoint subsets. More... | |
CDataset< Type > | subset (int iNbItems) const |
Copy i random items to a new dataset. More... | |
vector< CDataset< Type > > | createFolds (int iN) const |
Create n disjoint folds. More... | |
CDataset< Type > | extract (int iIndexStart, int iIndexEnd, bool bDelete) |
Copy or move a range of items to a new dataset. More... | |
void | merge (const CDataset< Type > &rtDataset) |
Merge a second dataset. More... | |
const CDatasetItem< Type > & | getItem (int iIndex) const |
const CDatasetItem< Type > & | item (int iIndex) const |
Return a const reference to item iIndex. More... | |
const CDatasetItem< Type > & | getRandomItem (void) const |
CDatasetItem< Type > & | randomItem (void) |
Return the reference on a random item. More... | |
CDatasetItem< Type > & | getLast () |
Return reference to the last item. More... | |
CDatasetItem< Type > & | operator[] (int iIndex) |
Return reference on item iIndex. More... | |
const CDatasetItem< Type > & | operator() (int iIndex) const |
Return a const reference to item iIndex. More... | |
int | getBestMatchIDInput (const CVector< Type > &rtVector, const CMetric< Type > &rCMetric) const |
int | bestMatchIDInput (const CVector< Type > &rtVector, const CMetric< Type > &rCMetric) const |
int | bestMatchIDInput (const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, const int iStart, const int iEnd) const |
int | bestMatchIDInput (const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, const int iStart, const int iEnd, const int iOmit) const |
int | bestMatchIDInput (const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, vector< int >::iterator itBegin, vector< int >::iterator itEnd) const |
int | getBestMatchIDOutput (const CVector< Type > &rtVector, const CMetric< Type > &rCMetric) const |
int | bestMatchIDOutput (const CVector< Type > &rtVector, const CMetric< Type > &rCMetric) const |
void | getBestMatchInput (const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, int &riBestID, Type &rtBestDist) const |
void | bestMatchInput (const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, int &riBestID, Type &rtBestDist) const |
void | getBestMatchOutput (const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, int &riBestID, Type &rtBestDist) const |
void | bestMatchOutput (const CVector< Type > &rtVector, const CMetric< Type > &rCMetric, int &riBestID, Type &rtBestDist) const |
int | setItem (int iIndex, const CDatasetItem< Type > &rtItem) |
Set item iIndex. More... | |
int | insertItem (int iIndex, const CDatasetItem< Type > &rtItem) |
Insert item. More... | |
void | getDataFromVector (Type *ptData, int iNbItems, int iInputDim, int iOutputDim) |
void | setDataToVector (Type *ptData) |
CMatrix< Type > | getInputMatrix () const |
Get input data as matrix. More... | |
CMatrix< Type > | getOutputMatrix () const |
Get output data as matrix. More... | |
int | appendItem (const CDatasetItem< Type > &rtItem) |
Append item to the end of the dataset. More... | |
int | removeItem (int iIndex) |
Remove the item iIndex. More... | |
void | removeLastItem () |
Remove the last item. More... | |
void | sortdata () |
Sort data. More... | |
void | sortdata (bool(*func)(CDatasetItem< Type >, CDatasetItem< Type >)) |
Sort data using a generic function as less operator. More... | |
void | randomShuffle () |
Random shuffle of items. More... | |
void | clear () |
Delete all items. More... | |
int | items () const |
Return number of items in the dataset. More... | |
int | inputDimension () const |
Return dimension of input vectors. More... | |
int | outputDimension () const |
Return dimension of output vectors. More... | |
void * | metaData () |
Return pointer to meta data object. More... | |
void | setMetaData (void *pTheData) |
Sets a meta data pointer. More... | |
![]() | |
CObject () | |
Constructor. More... | |
virtual | ~CObject () |
Destructor. More... | |
void | setVerbose (VERBOSE_LEVEL tVerbose) |
Set the verbose level. More... | |
VERBOSE_LEVEL | verbose (void) const |
Return current verbose level. More... | |
virtual bool | isA (const char *acClass) const |
Check if the object is an instance of the class with given name. More... | |
DATATYPE | dataType () const |
Returns the template data type. More... | |
Public Attributes | |
bool | bProtect |
Protected Attributes | |
vector< CDatasetItem< Type > > | tItems |
string | sType |
string | sInfo |
int | iGrowSize |
void * | pMetaData |
![]() | |
unsigned char | ucVerbose |
Manages pairs of input and output vectors.
The class CDataset manages a set of CDatasetItem objects. Each item consists of a pair of CVector objects (input and output vector). There is no need to specify the exact number of items to store, since memory is allocated automatically. Nevertheless, specifying an approximate number of items or an appropriate value for iGrowSize can speed up iterative filling of the dataset. When the initially reserved memory is used up, the dataset reallocates memory for iGrowSize more items, which avoids repeated, time-consuming reallocations of new memory blocks.
The items are stored in a stl vector. Therefore, the same complexity considerations for access and deletion apply to the dataset object:
There are two ways for loading and saving datasets. First, using the operator<< for writing and operator>> for reading data from a stream. These operators can be used e.g. with an fstream to read/write data from an ASCII file. In this case, the format of the file has to be:
21 5 4
i(0,0) i(0,1) i(0,2) i(0,3) i(0,4) i(0,5) o(0,1) o(0,2) o(0,3) o(0,4)
i(1,0) i(1,1) i(1,2) i(1,3) i(1,4) i(1,5) o(1,1) o(1,2) o(1,3) o(1,4)
i(2,0) i(2,1) i(2,2) i(2,3) i(2,4) i(2,5) o(2,1) o(2,2) o(2,3) o(2,4)
i(x,x) i(x,x) i(x,x) i(x,x) i(x,x) i(x,x) o(x,x) o(x,x) o(x,x) o(x,x)
i(20,0) i(20,1) i(20,2) i(20,3) i(20,4) i(20,5) o(20,1) o(20,2) o(20,3) o(20,4)
The second way is to use the methods read(char* pcPath), read(istream& istr), write(ostream& ostr) and write(char* pcPath). Using read(char* pcPath) and write(char* pcPath), data will be read/written from/to a gzip-compressed binary file.
There are several functions defined in CDatasetAlgorithm.h which modify the input data of a dataset e.g. zscore(..), scaleRange(..), etc.
Constructor.
Constructor. Creates a new dataset object with space reserved for iInitialSize items. Each time the number of items in the dataset exceeds the reserved size, space for iGrowSize new items will be reserved (the dataset is still empty!). Note: If the dataset is built by repeatedly appending new items, it is important to reserve an adequate space in order to avoid repeated time-consuming reallocations of memory.
iInitialSize | Initially reserved space |
iGrowSize | Increment per reallocation of space for items |
References CDataset< Type >::bProtect, CDataset< Type >::iGrowSize, CDataset< Type >::pMetaData, CDataset< Type >::sInfo, CDataset< Type >::sType, and CDataset< Type >::tItems.
Copy constructor.
Constructor
References CDataset< Type >::bProtect, CDataset< Type >::iGrowSize, CDataset< Type >::pMetaData, CDataset< Type >::sInfo, CDataset< Type >::sType, and CDataset< Type >::tItems.
int CDataset< Type >::appendItem | ( | const CDatasetItem< Type > & | rtItem | ) |
Append item to the end of the dataset.
Append item to dataset
rtItem | Reference to item |
References CDatasetItem< Type >::inputDimension(), and CDatasetItem< Type >::outputDimension().
Referenced by CDataset< Type >::extract(), and CDataset< Type >::split().
|
inline |
Calculates the dataset vector where the input vector is closest to a vector given as an argument.
rtVector | Reference to a Vector |
rCMetric | Reference to a Metric |
References CMetric< Type >::distance().
|
inline |
References CMetric< Type >::distance().
|
inline |
References CMetric< Type >::distance().
|
inline |
References CMetric< Type >::distance().
|
inline |
Calculates the dataset vector where the output vector is closest to a vector given as an argument.
rtVector | Reference to a Vector |
rCMetric | Reference to a Metric |
References CMetric< Type >::distance().
|
inline |
Calculates the dataset vector where the input vector is closest to a vector given as an argument.
rtVector | Reference to a Vector |
rCMetric | Reference to a Metric |
riBestID | Reference to the ID of the best matching dataset vector (returns the result) |
rtBestDist | Reference to the distance between the dataset vector and the argument (returns the result) |
References CMetric< Type >::distance().
|
inline |
Calculates the dataset vector where the output vector is closest to a vector given as an argument.
rtVector | Reference to a Vector |
rCMetric | Reference to a Metric |
riBestID | Reference to the ID of the best matching dataset vector (returns the result) |
rtBestDist | Reference to the distance between the dataset vector and the argument (returns the result) |
References CMetric< Type >::distance().
|
inlinevirtual |
Returns the class name.
Reimplemented from CObject< Type >.
void CDataset< Type >::clear | ( | ) |
Delete all items.
Delete all items
Create n disjoint folds.
Return a vector of iN disjoint and equally sized datasets
iN | Number of datasets |
CDataset< Type > CDataset< Type >::extract | ( | int | iIndexStart, |
int | iIndexEnd, | ||
bool | bDelete = true |
||
) |
Copy or move a range of items to a new dataset.
Returns the subset starting at element iIndexStart and ending with element iIndexEnd. If bDelete is true, the corresponding elements will be deleted from the source dataset. Note that this can be quite slow if iIndexEnd is not the last element of the source set.
iIndexStart | Index of first element to extract |
iIndexEnd | Index of last element to extract |
bDelete | If true, elements will be deleted from the source set (default: true) |
References CDataset< Type >::appendItem(), and CDataset< Type >::reserve().
|
inline |
(Obsolete! Use bestMatchIDInput(..) instead)
References CMetric< Type >::distance().
|
inline |
(Obsolete! Use bestMatchIDOutput(..) instead)
References CMetric< Type >::distance().
|
inline |
(Obsolete! Use bestMatchInput(..) instead)
References CMetric< Type >::distance().
|
inline |
(Obsolete! Use bestMatchOutput(..) instead)
References CMetric< Type >::distance().
void CDataset< Type >::getDataFromVector | ( | Type * | ptData, |
int | iNbItems, | ||
int | iInputDim, | ||
int | iOutputDim | ||
) |
Sets the entire dataset according to ptData, assuming the following data organization: ((input,output),...,(input,output))
ptData | Pointer to the input and output data |
iNbItems | Number of items |
iInputDim | Input dimension |
iOutputDim | Output dimension |
string CDataset< Type >::getInfo | ( | void | ) |
Return info text.
Get info tag
Get input data as matrix.
Return input data as nxd matrix (item=row)
References ML_GEQ_CHK, and CMatrix< Type >::setRow().
|
inline |
(Obsolete! Use item(int) instead) Return item with index iIndex
Referenced by CDataset< Type >::merge().
|
inline |
Return reference to the last item.
Return last item
Get output data as matrix.
Return output data as nxd matrix (item=row)
References ML_GEQ_CHK, and CMatrix< Type >::setRow().
|
inline |
void CDataset< Type >::initRandomSeed | ( | ) |
Init random seed.
Init random seed
|
inline |
Return dimension of input vectors.
Return the dimension of the items input vector
int CDataset< Type >::insertItem | ( | int | iIndex, |
const CDatasetItem< Type > & | tItem | ||
) |
Insert item.
Insert item behind item iIndex
iIndex | |
Item | to insert |
Insert item given by the pointer at position iIndex in the dataset. If successful, the dataset is the new owner and is responsible for the item's memory management.
iIndex | Position of the item after inserting into the dataset |
Reference | to item. |
References CDatasetItem< Type >::inputDimension(), and CDatasetItem< Type >::outputDimension().
|
inline |
Return a const reference to item iIndex.
Return item with index iIndex
|
inline |
Return number of items in the dataset.
Return current number of items
Referenced by CDataset< Type >::merge(), and CDataset< Type >::operator=().
Merge a second dataset.
Append dataset to source dataset
Dataset | to append |
References CDataset< Type >::getItem(), and CDataset< Type >::items().
|
inline |
Return pointer to meta data object.
Returns a pointer to a meta data object
|
inline |
Return a const reference to item iIndex.
Return const item with index iIndex
CDataset< Type > & CDataset< Type >::operator= | ( | const CDataset< Type > & | rtDataset | ) |
Assignment operator.
Assignment operator
References CDataset< Type >::items(), and CDataset< Type >::tItems.
|
inline |
Return reference on item iIndex.
Return item with index iIndex
|
inline |
Return dimension of output vectors.
Return the dimension of the item's output vector
|
inline |
void CDataset< Type >::randomShuffle | ( | ) |
Random shuffle of items.
Random shuffle all items
bool CDataset< Type >::read | ( | istream & | istr | ) |
Read data set from ascii stream.
Read dataset from instream
istr | Stream |
References CDenseVector< Type >::setElement().
bool CDataset< Type >::read | ( | const char * | pcPath | ) |
Read dataset from binary compressed *.ml file.
Read dataset from file. The file must be a compressed ml file.
pcPath | Path to file |
References gzstreambase::close(), and igzstream::open().
int CDataset< Type >::removeItem | ( | int | iIndex | ) |
Remove the item iIndex.
Remove item with index iIndex. Note that removing an item which is not the last item of the dataset can be quite slow.
iIndex | Index of item to remove |
void CDataset< Type >::removeLastItem | ( | ) |
Remove the last item.
Remove last item
void CDataset< Type >::reserve | ( | int | iSize | ) |
|
virtual |
Read/write from binary stream.
The function handles different data types, e.g. for reading float objects into a double instance, etc.
fstream | Reference to binary stream |
mode | Switches between reading and writing |
Reimplemented from CObject< Type >.
References READ, and CDenseVector< Type >::serialize().
Reimplemented from CObject< Type >.
References CArchiv::flush(), and CArchiv::isReading().
void CDataset< Type >::setDataToVector | ( | Type * | ptData | ) |
Sets a vector to the values from a dataset using the following data organization: ((input,output),...,(input,output))
ptData | Pointer to the input and output data |
void CDataset< Type >::setGrowSize | ( | int | iGrowSize | ) |
Set size increment.
void CDataset< Type >::setInfo | ( | const string & | str | ) |
Set info text.
Set info tag
str | Info tag |
int CDataset< Type >::setItem | ( | int | iIndex, |
const CDatasetItem< Type > & | tItem | ||
) |
Set item iIndex.
Overwrite item with index iIndex
iIndex | Index of item to overwrite |
rtItem | Reference to item with new data |
Replace item at position iIndex with item given by the pointer pItem. If successful, the dataset is responsible for the item's memory management.
iIndex | Index of item to replace; |
Reference | to item. |
References CDatasetItem< Type >::inputDimension(), and CDatasetItem< Type >::outputDimension().
|
inline |
Sets a meta data pointer.
|
inline |
Sort data.
Sort items
|
inline |
Sort data using a generic function as less operator.
Sort items using a binary comparison function
Split dataset in two disjoint subsets.
Splits the data in two datasets. The dFraction items (starting from the end) are moved to the new dataset.
dFraction | Fraction (in percent) to be moved to the returned dataset |
References CDataset< Type >::appendItem().
Copy i random items to a new dataset.
Returns a random subset of iNbItems items. The items remain in the dataset.
iNbItems | Size of subset |
References ML_LEQ_CHK.
bool CDataset< Type >::write | ( | const char * | pcPath | ) |
Write dataset to binary and compressed ml file.
Write dataset to file. The file will be a compressed ml file.
pcPath | Path to file |
References gzstreambase::close(), and ogzstream::open().
bool CDataset< Type >::write | ( | ostream & | ostr | ) |
Write dataset to ascii stream.
Write dataset to outstream
ostr | Stream |
bool CDataset< Type >::bProtect |
Referenced by CDataset< Type >::CDataset().
|
protected |
Referenced by CDataset< Type >::CDataset().
|
protected |
Referenced by CDataset< Type >::CDataset().
|
protected |
Referenced by CDataset< Type >::CDataset().
|
protected |
Referenced by CDataset< Type >::CDataset().
|
protected |
Referenced by CDataset< Type >::CDataset(), and CDataset< Type >::operator=().