Machine Learning Library
CSOM.h
Go to the documentation of this file.
1 #ifndef CSOM_H
2 #define CSOM_H
3 
4 #include "CObject.h"
5 #include "CDataset.h"
6 #include "CDenseVector.h"
7 #include "CDatasetItem.h"
8 #include "CDatasetStatistic.h"
9 #include "CMetric.h"
10 #include "CKernel.h"
11 #include "CLearnRate.h"
12 #include "CClusterAlgorithm.h"
13 #include "Macros.h"
14 
15 #include "PyramidOfWords.hh"
16 
17 #include <vector>
18 #include <set>
19 #include <map>
20 #include <ext/hash_set>
21 
22 #ifdef MYSQL_MODULE
23 #include "MysqlDatabase.h"
24 #endif
25 
// SOM topology type identifiers. Each value is a distinct single bit.
// NOTE(review): presumably these are the values stored in CSomTopology::iType -- confirm.
26 #define CARTESIAN_SOM 0x01 // plain SOM
27 #define HYPERBOLIC_SOM 0x02 // hyperbolic SOM
28 #define H_HYPERBOLIC_SOM 0x04 // hierarchical hyperbolic SOM
29 #define HG_HYPERBOLIC_SOM 0x08 // hierarchically growing hyperbolic SOM
30 
// NOTE(review): this header never uses STATS_DIMENSION; the name suggests the
// width of a per-node statistics vector -- confirm against the implementation.
31 #define STATS_DIMENSION 11
32 
// Mode values. COPY_CLASSCOUNT is the default of CSOM::applyDataset()'s iMode
// parameter.
// NOTE(review): COPY_PROTOTYPE and APPEND_BMUS are assumed to be alternative or
// combinable values of the same parameter -- confirm.
33 #define COPY_CLASSCOUNT 0x01
34 #define COPY_PROTOTYPE 0x02
35 #define APPEND_BMUS 0x04
36 
42 
// NOTE(review): a using-directive at header scope is visible in every file
// that includes this header.
43 using namespace __gnu_cxx; // we want to use hash_sets
44 
// Map types used by CHyperbolicTopology for its beam-search state
// (mapBeamArea, mapBeamPath and mapBeamDist respectively).
// NOTE(review): map/multimap are written unqualified, so namespace std must
// already be visible through one of the included headers -- confirm.
// NOTE(review): the float key of MapBMUs is presumably a distance and the int
// values node indices -- confirm.
45 typedef multimap<float, int> MapBMUs;
46 typedef map<int, int> MapPath;
47 typedef map<int, float> MapDist;
48 
// Forward declaration: CSomTopology befriends CSOM<Type> and takes CSOM<Type>*
// parameters before CSOM is defined further down.
49 template<class Type> class CSOM;
50 
// One entry of the flow list kept by CSOM (see CSOM::vFlowItems).
// All members are public and the class declares no constructor, so a
// default-constructed item has indeterminate member values.
// NOTE(review): the fields mirror the (piNodes, pcDate, iTime) arguments of
// CSOM::setFlowItem(); whether piNodes is owned by the item is not visible
// here -- confirm.
51 class CFlowItem {
52 public:
53  int *piNodes;              // pointer to node indices; length is not stored here
54  char acDate[20];           // fixed 20-byte buffer for a date string
55  unsigned long iTimestamp;  // numeric time stamp
56 };
57 
58 
59 
60 template<class Type>
61 class CSomTopology {
62 
63  friend class CSOM<Type>;
64 
65 protected:
66  int iType;
67  vector< vector<int> > vvNeighbours;
70  vector<int> viGeometry;
71 
72  vector<int> viVisibleNode;
73  vector<int> viVisiblePos;
74 
75  int iCenterNode, iOldCenterNode, iVisibleNodesCenter;
77 
78  vector< pair<int,int> > vDomains;
79 
80 
81 public:
82 
83  vector<bool> vbExpand;
84 
88  CSomTopology(int iNumNodes);
89  virtual ~CSomTopology();
90 
94  void setNodePosition(int i, CVector<double>& rtPos) { tNodes[i].setInputVector(rtPos); }
95 
99  const CVector<double>& getNodePosition(int i) { return tNodes[i].getInputVector(); }
100 
101  const CVector<double>& getCenterPosition() { return tCenter; }
102 
106  void addLink(int iFrom, int iTo);
107 
111  int getNumNeighbours(int i) { return vvNeighbours[i].size(); }
112 
113  int getNumDomains() { return vDomains.size(); }
114  void addDomain(int iStart, int iEnd) { vDomains.push_back(make_pair(iStart, iEnd)); }
115  int getDomainStart(int iDomain) { return vDomains[iDomain].first; }
116  int getDomainEnd(int iDomain) { return vDomains[iDomain].second; }
117  int getDomainSize(int iDomain) { return vDomains[iDomain].second-vDomains[iDomain].first+1; }
118  int getDomainOfNode(int iNode) {
119 
120  int iDomain;
121  for (iDomain=vDomains.size()-1; iDomain > 0; iDomain--)
122  if (iNode >= vDomains[iDomain].first) break;
123  return iDomain;
124  }
125 
127  virtual void train(CDataset<Type>& rtTrainSet, CSOM<Type>* pSOM) { };
128  virtual void trainGrowing(CDataset<Type>& rtTrainSet, CSOM<Type>* pSOM, float fGrowParam) { };
129 
130  virtual void buildDomains() { };
131 
136  int getNeighbour(int iNode, int iPos) { return vvNeighbours[iNode][iPos]; }
137 
138  int getCenterNode(bool bVisibleNodes) { if (bVisibleNodes) return iVisibleNodesCenter;
139  else return iCenterNode; }
140 
142  int getType() { return iType; }
143  void setType(int iNewType) { iType = iNewType; }
144 
149  virtual void getInfo(vector<int>& viDims) { };
150 
152  virtual int translateGrid(float *pfTrans, float *pfTarget, int iMode=0) { return -1; }
153 
155  virtual void setParameter(float *pfParam) { };
156 
157  int getGeometrySize() { return viGeometry.size(); }
158  int getGeometryPoint(int iPoint) { return viGeometry[iPoint]; }
159 
160  virtual int getNumPolyLines() { return 0; }
161  virtual bool getPolyLine(int iLine, int &riStart, int &riEnd) { return false; }
162 
163  virtual int getNumOutbounds(int iNode, int iK=0) { };
164 };
165 
166 
170 
// Topology class describing two dimensional hyperbolic lattice structure.
// Only declarations and a few inline helpers are visible in this header.
171 template<class Type>
172 class CHyperbolicTopology : public CSomTopology<Type> {
173 
174  friend class CSOM<Type>;
175 
     // hash_set, hash and equal_to are used unqualified; hash_set comes from
     // <ext/hash_set> via the header-scope using-directive for __gnu_cxx.
176  typedef hash_set<int, hash<int>, equal_to<int> > NodeHashSet;
177  typedef set<int, less<int> > NodeSet;
178 
179 private:
     // Lattice construction parameters.
     // NOTE(review): meanings are inferred from the names only -- confirm against the implementation.
180  double fAngle;
181  double fSin;
182  double fCos;
183  double fRadius;
184  int iNumNeighbours;
185  int iNumRings;
186 
187  int iNumVisibleRings;
188  int iMaxCenterNode;
189 
190  double fScale;
191 
192  NodeHashSet hashsetRingNodes;
193  NodeSet setRingNodes;
194 
195  int* piRingElements;  // raw pointer; allocation and ownership are not visible in this header
196 
197  vector<int> viCompleteGeometry;
198  vector<int> viOutBounds;  // per-node offset into vvNeighbours[node], see getOutBoundRange()
199 
201  vector<int> viPolyLinePoints;  // two entries per poly line, see getNumPolyLines()
202  int iNumOutbounds;  // reset by getNumOutbounds() before each computation
203 
204  set<int, less<int> > setOutbounds;  // cleared by getNumOutbounds() before each computation
205 
     // Beam-search state. iBeamBranch and bBeamSFsearch are set by
     // setBeamBranching(); the remaining fields are used by code outside this header.
206  int iBeamWinner;
207  int iBeamBranch;
208  bool bBeamSFsearch;
209  int iBeamLimit;
210  bool bBeamSavePath;
211 
212  CVector<Type> *ptBeamInp;
213  CDataset<Type> *pBeamRefs;
214  CMetric<Type> *pBeamMetric;
215 
216  MapBMUs mapBeamArea;
217  MapPath mapBeamPath;
218  MapDist mapBeamDist;
219 
220 public:
     // iNumVisible defaults to 3. All four parameters are consumed by the
     // out-of-view constructor definition.
225  CHyperbolicTopology(int iNumNodes, int iNeighbours, int iRings, int iNumVisible=3);
226  virtual ~CHyperbolicTopology();
227 
229  void makeLattice();
230 
236  void rotateNode(int iCenter, int iFrom, int iTarget);
237 
241  int addRing(int iFirst, int iLast);
242 
243  void addTriangle(int i1, int i2, int i3);
244 
     // Overrides of the CSomTopology hooks (defined outside this header).
245  void getInfo(vector<int>& viDims);
246 
247  int translateGrid(float *pfTrans, float *pfTarget, int iMode=0);
248 
249  void copyVisibleNodeCoordinates(float *pfTarget);
250 
251  void setParameter(float *pfParam);
252 
253  void setNumVisibleRings(int iNumVisibleRings);
254 
255  int getNumNodesUptoRing(int iRing);
256 
     // Half the number of stored poly-line points (integer division).
259  int getNumPolyLines() { return viPolyLinePoints.size() / 2; }
261  bool getPolyLine(int iLine, int &riStart, int &riEnd);
262 
263  void train(CDataset<Type>& rtTrainSet, CSOM<Type>* pSOM);
264  void trainGrowing(CDataset<Type>& rtTrainSet, CSOM<Type>* pSOM, float fGrowParam);
265 
266  void performBeamSearch(int iRoot, int iLevel);
267 
268  void buildDomains();
269 
270  void addPolyLinesFrom(int iNode);
271 
272  void computeOutbounds(int iNode);
273  void computeOutbounds(int iNode, int iK);
274 
     // Returns the number of outbounds of iNode.
     // Both iNumOutbounds and setOutbounds are reset first. With a non-zero iK
     // the two-argument computeOutbounds() runs and the size of setOutbounds is
     // returned. With iK == 0 the one-argument overload runs and iNumOutbounds
     // is returned, but never less than 1.
     // NOTE(review): this assumes the computeOutbounds() overloads fill
     // setOutbounds / iNumOutbounds; their definitions are not in this header.
275  int getNumOutbounds(int iNode, int iK=0) {
276  iNumOutbounds = 0;
277  setOutbounds.clear();
278  if (iK) {
279  computeOutbounds(iNode, iK);
280  return setOutbounds.size();
281  } else {
282  computeOutbounds(iNode);
283  }
284  if (iNumOutbounds == 0) return 1;
285  else return iNumOutbounds;
286  }
287 
     // Returns the stored outbound offset of iNode. The index is not checked here.
288  int getOutBounds(int iNode) { return viOutBounds[iNode]; };
289 
     // Sets [itStart, itEnd) to the tail of iNode's neighbour list, starting at
     // offset viOutBounds[iNode] and running to the end of the list.
290  void getOutBoundRange(int iNode, vector<int>::iterator& itStart, vector<int>::iterator& itEnd) {
291  itStart = this->vvNeighbours[iNode].begin() + viOutBounds[iNode];
292  itEnd = this->vvNeighbours[iNode].end();
293  };
294 
     // Stores |iBranch| as the beam branching factor. A negative argument also
     // switches bBeamSFsearch on.
     // NOTE(review): a later non-negative argument does not switch the flag
     // off again -- confirm this is intended.
295  void setBeamBranching(int iBranch) {
296  this->iBeamBranch = abs(iBranch);
297  if (iBranch < 0)
298  this->bBeamSFsearch = true;
299  };
300 };
301 
// Topology class for n dimensional cartesian grid.
305 template<class Type>
306 class CCartesianTopology : public CSomTopology<Type> {
307 
308 public:
     // NOTE(review): rviNbUnits presumably gives the number of units per grid
     // dimension -- confirm against the constructor definition.
309  CCartesianTopology(int iNodes, vector<int>& rviNbUnits);
310  virtual ~CCartesianTopology() { }
311  vector<int> viDimensions;  // public data member
312 
313  void getInfo(vector<int>& viDims);
314 
315  int translateGrid(float *pfTrans, float *pfTarget, int iMode=0);
316 
     // Use topology specific training algorithm to build SOM: the cartesian
     // grid simply forwards to CSOM::train().
317  void train(CDataset<Type>& rtTrainSet, CSOM<Type>* pSOM) { pSOM->train(rtTrainSet); }
318  void buildDomains();
319 
     // Always reports 4, regardless of iNode and iK.
320  int getNumOutbounds(int iNode, int iK=0) { return 4; }
321 };
322 
323 
327 
328 template<class Type>
329 class CSOM : public CClusterAlgorithm<Type> {
330 
331 #ifdef MYSQL_MODULE
332  friend class CMysqlDatabase;
333 #endif
334  friend class CHyperbolicTopology<Type>;
335 
336  typedef set<int, less<int> > NodeSet;
337 
338 private:
339 
340  CSomTopology<Type> *ptTopology;
341  CMetric<double> *ptGridMetric;
342  CMetric<Type> *ptDataMetric;
343 
344  CLearnRate<Type> *ptAlphaRate;
345  CLearnRate<Type> *ptSigmaRate;
346  CKernel<Type> *ptKernel;
347 
348  CDataset<Type> tMetaData;
349 
350  vector<string> vLabel;
351  vector<char*> vKeyWords;
352 
353  vector<vector< pair<float,int> > > vvIDs;
354 
355  int iNumNodes;
356  int iUpdateSteps;
357  int iBestMatchID;
358 
359  void makeReferenceVectors();
360 
361  NodeSet setSelectedNodes;
362  NodeSet::iterator itNodeSet;
363 
364  vector<CFlowItem> vFlowItems;
365 
366  vector<CDenseVector<float> > tActivities;
367 
368  float fProgress;
369 
370 public:
371  vector<string> vTopicNames;
372 
373 public:
377  CSOM(CMetric<double>* ptTheGridMetric = new CEuclideanMetric<double>());
378 
380  CSOM(const CSOM<Type> &SOM);
381 
383  ~CSOM();
384 
387  void setGridMetric(CMetric<double>* ptMetric);
388 
392  void setDataMetric(CMetric<Type>* ptMetric);
393 
394  CMetric<Type>* getDataMetricPtr() { return ptDataMetric; }
395 
398  void setAlphaRate(CLearnRate<Type>* ptRate);
399 
402  void setSigmaRate(CLearnRate<Type>* ptRate);
403 
406  void setKernel(CKernel<Type>* ptTheKernel);
407 
408  CSomTopology<Type>* topologyPtr() { return ptTopology; }
409 
411  CDataset<Type>& metaData() { return tMetaData; }
412  void setMetaData(int iPos, Type *ptTheData);
413 
414  float getMeta4Node(int iNode, int iPos) { return tMetaData[iNode].inputVectorPtr()->getElement(iPos); }
415 
416  void setUpdateSteps(int iVal) { iUpdateSteps = iVal; }
417 
418  CLearnRate<Type>& getAlphaRate() { return *ptAlphaRate; }
419  CLearnRate<Type>& getSigmaRate() { return *ptSigmaRate; }
420  CKernel<Type>& getKernel() { return *ptKernel; }
421 
422  void setLabel(int iNode, string strLabel);
423  string getLabel(int iNode);
424 
431  void applyDataset(CDataset<Type>& rtDataset, int iMode=COPY_CLASSCOUNT);
432 
433  void setUnit(int iNode, CDenseVector<Type>& rtVec);
434 
439  int setCartesianTopology(vector<int>& rviNbUnits);
440 
447  int setHyperbolicTopology(int iNeighbours, int iRings, int iVisible=3);
448 
452  void setUnitsFromDataset(CDataset<Type>& rtData);
453 
458  void setUnitsFromDatasetRandom(CDataset<Type>& rtData);
459 
463  void setUnitsAlongVectors(vector<CDenseVector<Type> > cVectors,
464  vector<Type> cMin,
465  vector<Type> cMax);
466 
467 
468  void train(CDataset<Type>& rtTrainSet);
469 
470  void trainTopology(CDataset<Type>& rtTrainSet) {
471  if (ptTopology) ptTopology->train(rtTrainSet, this);
472  }
473 
474  void trainGrowing(CDataset<Type>& rtTrainSet, float fGrowParam) {
475  if (ptTopology) ptTopology->trainGrowing(rtTrainSet, this, fGrowParam);
476  }
477 
479  return ptTopology->tNodes;
480  }
481 
485  Type minimal_wiring(CMetric<Type>& cMetric);
486 
488  void dump(void);
489 
494  void uMat(vector<Type>& rvtDist, Type fNeighborhood);
495 
496  int getNumNodes() { return iNumNodes; }
497 
499  if (ptTopology) return ptTopology->iNumVisibleNodes;
500  else return 0;
501  }
502 
503  int getVisibleNode(int iPos) {
504  if (ptTopology) return ptTopology->viVisibleNode[iPos];
505  else return -1;
506  }
507 
508 
509 
510  void copyNodeAttributes(int iContext, int iAttrib, float *pfTarget,
511  float &fMin, float &fMax, float *pfSource=NULL);
512 
513  void copyNodeDistances(float *pfDist);
514 
515  bool selectNode(int iNode, bool bSelect);
516  int getNumSelectedNodes() { return setSelectedNodes.size(); }
517  int getSelectedNodes(bool bFirst);
518 
519  int getBestMatchID() { return iBestMatchID; }
520 
521  Type getNodeDistance(int iN1, int iN2);
522 
523  void setFlowSize(int iNumItems);
524  int getFlowSize() { return vFlowItems.size(); }
525 
526  void setFlowItem(int iPos, int *piNodes, char *pcDate, unsigned long iTime);
527  void computeFlow(float *pfParams);
528  void getFlow(int iPos, float *pfTarget, char *pcDate);
529 
530  // passes all children nodes of node iNode to piChildren. Returns number of children
531  int getChildren(int iNode, int* piChildren);
532 
533  float getProgress() { return fProgress; }
534 
535 };
536 
537 #endif
int getBestMatchID()
Definition: CSOM.h:519
void setUpdateSteps(int iVal)
Definition: CSOM.h:416
void train(CDataset< Type > &rtTrainSet, CSOM< Type > *pSOM)
Use topology specific training algorithm to build SOM.
Definition: CSOM.h:317
virtual void buildDomains()
Definition: CSOM.h:130
multimap< float, int > MapBMUs
Definition: CSOM.h:45
Definition: CSOM.h:61
int getDomainStart(int iDomain)
Definition: CSOM.h:115
Base class for all learning rate functions. Template base class for all learning rates (virtual)...
Definition: CLearnRate.h:39
int getNumDomains()
Definition: CSOM.h:113
virtual int translateGrid(float *pfTrans, float *pfTarget, int iMode=0)
Translates grid structure.
Definition: CSOM.h:152
Definition: MysqlDatabase.h:40
CKernel< Type > & getKernel()
Definition: CSOM.h:420
Topology class describing two dimensional hyperbolic lattice structure.
Definition: CSOM.h:172
int getNumOutbounds(int iNode, int iK=0)
Definition: CSOM.h:275
const CVector< double > & getCenterPosition()
Definition: CSOM.h:101
void train(CDataset< Type > &rtTrainSet, CSOM< Type > *pSOM)
Use topology specific training algorithm to build SOM.
int getFlowSize()
Definition: CSOM.h:524
map< int, float > MapDist
Definition: CSOM.h:47
int getGeometrySize()
Definition: CSOM.h:157
virtual int getNumPolyLines()
Definition: CSOM.h:160
int iType
type of lattice topology
Definition: CSOM.h:66
void setBeamBranching(int iBranch)
Definition: CSOM.h:295
CDataset< double > & nodeData()
Definition: CSOM.h:478
CDataset< Type > & metaData()
Returns the meta data dataset containing some statistics like node sizes.
Definition: CSOM.h:411
Base class for metric objects.
Definition: CMetric.h:36
int getNumOutbounds(int iNode, int iK=0)
Definition: CSOM.h:320
Euclidean metric object.
Definition: CMetric.h:66
#define COPY_CLASSCOUNT
Definition: CSOM.h:33
int * piNodes
Definition: CSOM.h:53
CSomTopology< Type > * topologyPtr()
Definition: CSOM.h:408
int iNumVisibleNodes
Definition: CSOM.h:76
vector< int > viGeometry
geometry data for VTK
Definition: CSOM.h:70
CDataset< double > tNodes
dataset of node coordinates
Definition: CSOM.h:68
SOM class for Self Organizing Maps with arbitrary topology.
Definition: MysqlDatabase.h:14
const CVector< double > & getNodePosition(int i)
Definition: CSOM.h:99
vector< bool > vbExpand
each node possibly carries an expand attribute
Definition: CSOM.h:83
vector< string > vTopicNames
Meta data: topic names.
Definition: CSOM.h:371
virtual void getInfo(vector< int > &viDims)
Definition: CSOM.h:149
unsigned long iTimestamp
Definition: CSOM.h:55
vector< int > viVisiblePos
each visible node points to its "real" node
Definition: CSOM.h:73
int getDomainOfNode(int iNode)
Definition: CSOM.h:118
vector< int > viDimensions
Definition: CSOM.h:311
CMetric< Type > * getDataMetricPtr()
Definition: CSOM.h:394
map< int, int > MapPath
Definition: CSOM.h:46
int getOutBounds(int iNode)
Definition: CSOM.h:288
int getGeometryPoint(int iPoint)
Definition: CSOM.h:158
virtual bool getPolyLine(int iLine, int &riStart, int &riEnd)
Definition: CSOM.h:161
void getOutBoundRange(int iNode, vector< int >::iterator &itStart, vector< int >::iterator &itEnd)
Definition: CSOM.h:290
CDenseVector< double > tCenter
vector describing topology center
Definition: CSOM.h:69
void trainTopology(CDataset< Type > &rtTrainSet)
Definition: CSOM.h:470
int getNumNodes()
Definition: CSOM.h:496
float getMeta4Node(int iNode, int iPos)
Definition: CSOM.h:414
Topology class for n dimensional cartesian grid.
Definition: CSOM.h:306
void setType(int iNewType)
Definition: CSOM.h:143
int getVisibleNode(int iPos)
Definition: CSOM.h:503
float getProgress()
Definition: CSOM.h:533
vector< pair< int, int > > vDomains
SOM area might consist of several domains.
Definition: CSOM.h:78
Base class for kernel functions. Base class for kernel functions used, for example, by the support vector...
Definition: CKernel.h:46
int getNumNeighbours(int i)
Definition: CSOM.h:111
CLearnRate< Type > & getAlphaRate()
Definition: CSOM.h:418
int getNumVisibleNodes()
Definition: CSOM.h:498
templatized vector for numerical applications
Definition: CMatrix.h:39
virtual int getNumOutbounds(int iNode, int iK=0)
Definition: CSOM.h:163
Base class for cluster algorithms.
Definition: CClusterAlgorithm.h:38
int getNumSelectedNodes()
Definition: CSOM.h:516
vector< vector< int > > vvNeighbours
each node has a list of neighbours
Definition: CSOM.h:67
void trainGrowing(CDataset< Type > &rtTrainSet, float fGrowParam)
Definition: CSOM.h:474
int iVisibleNodesCenter
Definition: CSOM.h:75
CLearnRate< Type > & getSigmaRate()
Definition: CSOM.h:419
virtual ~CCartesianTopology()
Definition: CSOM.h:310
vector< int > viVisibleNode
list of currently visible nodes
Definition: CSOM.h:72
int getDomainEnd(int iDomain)
Definition: CSOM.h:116
void setNodePosition(int i, CVector< double > &rtPos)
Definition: CSOM.h:94
int getNumPolyLines()
Definition: CSOM.h:259
virtual void setParameter(float *pfParam)
Sets parameters.
Definition: CSOM.h:155
void addDomain(int iStart, int iEnd)
Definition: CSOM.h:114
void train(CDataset< Type > &rtTrainSet)
int getType()
Returns type of topology.
Definition: CSOM.h:142
int getNeighbour(int iNode, int iPos)
Definition: CSOM.h:136
virtual void train(CDataset< Type > &rtTrainSet, CSOM< Type > *pSOM)
Use topology specific training algorithm to build SOM.
Definition: CSOM.h:127
virtual void trainGrowing(CDataset< Type > &rtTrainSet, CSOM< Type > *pSOM, float fGrowParam)
Definition: CSOM.h:128
Definition: CSOM.h:51
int getCenterNode(bool bVisibleNodes)
Definition: CSOM.h:138
int getDomainSize(int iDomain)
Definition: CSOM.h:117