Machine Learning Library
mvGenerator.h
Go to the documentation of this file.
1 #ifndef MVGENERATOR_H
2 #define MVGENERATOR_H
3 
4 using namespace std;
5 
6 #include <iostream>
7 #include <stdio.h>
8 #include "soapH.h"
9 #include "ParseText.hh"
10 #include "CDataset.h"
11 #include <string>
12 #include "stemming/english_stem.h"
13 #include <ext/hash_map>
14 #include <set>
15 #include <stdlib.h>
16 #include "MySQL.hh"
17 
18 // Key labelled "Jens gmx" by the author; presumably the licence key sent with Google search requests — TODO confirm. NOTE(review): credential is hard-coded in a header.
19 #define KEY "gBZjV4BQFHLsnOnD72hRRtBI50aWolQA"
20 //Jörg
21 // Disabled alternative key: #define KEY "mfcRpBdQFHIdKlI3yvSnFHcwMqce1TpX"
22 #define GOOGLEURL "http://api.google.com/search/beta2" // presumably the service endpoint the search request is sent to — TODO confirm against the caller
23 #define GOOGLEACTION "urn:GoogleSearchAction" // presumably the SOAP action paired with GOOGLEURL — TODO confirm against the caller
24 
25 
26 extern "C" { // doSearch is declared with C linkage; its definition is not in this header
27  // Presumably runs a search for pcQuery and returns up to iMaxNumber titles, snippets and URLs through the three out-pointers; ownership of the arrays and the meaning of the return value are not visible here — TODO confirm.
28  int doSearch(char* pcQuery, char*** pppcTitles, char*** pppcSnippets, char*** pppcURLs, int iMaxNumber);
29 
30 }
31 
32 static const int MIN_WORD_OCCURENCE=2; // presumably the minimum number of occurrences a word needs to be used — TODO confirm in the implementation
33 static const char* pcFileName = "./tmp.html"; // file that safeSite() stores a downloaded page in. NOTE(review): the pointer itself is not const
34 static const int TIMEOUT = 7; // timeout in seconds for the wget download
35 static const char* DATABASE = "google"; // presumably the name of the MySQL database used through pMysql — TODO confirm. NOTE(review): the pointer itself is not const
36 
37 class mvGenerator{
38 public:
39 
40  mvGenerator();
41 
42  ~mvGenerator();
43 
44  // Fills pDataset with attribute vectors based on the Google output for pcQuery.
45  void computeMV(CDataset<float>* pDataset, char* pcQuery, int iMaxResults=500);
46 
47  // Displays the internal bags and dictionaries for verification.
48  void controlFunc();
49 
50  // Enables or disables HTML filtering and recomputes the attribute vectors
51  // (only if a search has been performed before).
52  void setHtmlFiltering(bool bHtmlFiltering);
53 
54 private:
55  void createBOW(char* pcQuery, int iMaxResults); // presumably builds the per-document bags of words (vBags) for pcQuery — TODO confirm
56  void createDict(); // presumably builds the word/ID dictionary members declared below — TODO confirm
57  void createMV(); // presumably builds the attribute vectors stored in pDataset — TODO confirm
58 
59  void doCleanUp(bool); // meaning of the flag is not visible in this header — see the implementation
60 
61  // Copies the contents of the given URL to psDest.
62  int readWebSite(char* pcURL, string* psDest);
63 
64  // Writes the stored website to the database. Returns 1 on success or 0 on failure.
65  int writeToDB(int ID, char* pcTitles, char* pcText, char* pcURL);
66 
67  // Uses wget to save the website with URL = pcURL to pcFileName.
68  int safeSite(char* pcURL);
69 
70  // Converts the stored file to a string written to psDest.
71  int site2String(string* psDest);
72 
73  // Converts a downloaded PDF document to text.
74  void convertPdf();
75 
76  // Removes HTML tags from the given string, starting at iStart.
77  void rmHTMLTags(string* psTemp, int iStart);
78 
79  // BagOfWords ^= Hash( Word->Count ); one bag for each document.
80  vector<BagOfWords> vBags;
81 
82  // Maps a word to its ID.
83  BagOfWords String2ID;
84 
85  // Dictionary. The i-th component holds a pair containing the total
86  // occurrence count of the word with ID = i and the number of documents
87  // it occurs in. IDs start at 1, so index 0 is not defined.
88  vector< pair<int,int> > vpDictWords;
89 
90  // Contains one hash_map per document. For each document, the word ID
91  // is mapped to the word's occurrence count in that document.
92  vector< stdext::hash_map<int,int> > vhIDBags;
93 
94  // Maps an ID to its word. IDs start at 1, so index 0 is not defined.
95  vector<string> vsID2String;
96 
97  // Maps an ID to a component of the attribute vector.
98  stdext::hash_map<int,int> hID2Comp;
99 
100  // Maps a component of the attribute vector to its word.
101  vector<string> vsComp2String;
102 
103  // Dataset containing the attribute vectors.
104  CDataset<float>* pDataset;
105 
106  // If HTML filtering is active, HTML keywords from this set are ignored.
107  set<string> htmlKWs;
108  bool bHtmlFiltering;
109 
110  // "#-..." strings describe numbers and should not be stemmed.
111  string sExclude1;
112  string sExclude2;
113  string sExclude3;
114 
115  // Database where the documents are stored.
116  CMySQL* pMysql;
117  // Holds the MySQL query text (kept as a member rather than a local
118  // because of its large size).
119  string sMysqlQuery;
120 };
121 #endif
int doSearch(char *pcQuery, char ***pppcTitles, char ***pppcSnippets, char ***pppcURLs, int iMaxNumber)
Definition: mvGenerator.h:37