#ifndef cutPtsSet_h #define cutPtsSet_h #include "../Basics/basic.h" #include "../Matrices/Matrix.h" #include "multiDS.h" /* ******************** class cutPtsSet ******************** An object of type cutPtsSet is a set of cut points. Each cut point is a particular value associated to an attribute of a multiDS. */ extern const char nonMonotonicAttr; extern const char positiveAttr; extern const char negativeAttr; #define _val_GEQ_cp 1 // when _val_GEQ_cp is 1, a value V is above a cut point of value T iff V >= T, // when _val_GEQ_cp is 0, a value V is above a cut point of value T iff V > T. tcT class multiDS; // forward tcT class cutPtsSet : private Matrix { public: ////////////////// // Constructors // ////////////////// cutPtsSet (); // Default constructor. cutPtsSet (multiDS& d, float& confidence, const char method=0, const char weighting=0); // Constructs a large set of cut points separating points of different // classes in d. // If method==0, a cut point of value (S1+S2)/2 is introduced along // variable V iff there are in d two points of classes C1 and C2 // (C1!=C2) and of values S1 and S2 (S1S2) if V is not monotonic (resp. V is positive, V is negative). // if weighting==0, all the weights are set to 0. // If weighting==1, the weight of a cut point depends on the correlation // of this cut point with the output. // If weighting==2, the weight of a cut point X depends on the minimal // distance to a point A for which there exists a point B so that X // separates A and B (available only if method=0, otherwise equivalent // to weighting==0). // If weighting==3, the weight of a cut point depends on the sum of the // separabilities of each separated pair. cutPtsSet (cutPtsSet& from, const Matrix& index); // Constructs the subset of tresholds taken in FROM with index INDEX. ~cutPtsSet (); // Default destructor. /////////////// // Selectors // /////////////// int nbVars () const; // Returns the # of variables. int nbCP () const; // Returns the total # of cut points. int nbCP (int var) const; // Returns the # of cut points for var. int attribute (int cpIndex) const; // Returns the variable concerned by the cpIndex-th cut point. boolean above (T val, int cpIndex) const; boolean above (T val, int cpInV, int var) const; // Returns true if VAL is above the corresponding cut point. int compare (T val, int cpIndex, float confidence=0.0) const; int compare (T val, int cpInV, int var, float confidence=0.0) const; // Returns +1 if VAL is above the cpIndex-th cut point with a margin // CONFIDENCE, 0 if VAL is below the cpIndex-th cut point with a margin // CONFIDENCE, -1 otherwise. boolean between (T val1, T val2, int cpIndex, float confidence=0.0) const; boolean between (T val1, T val2, int cpInV, int var, float confidence=0.0) const; // Returns true if the corresponding cut point is between val1 and val2 // within a margin CONFIDENCE. int firstCPG (T val, int var, float confidence=0.0) const; // Returns the index of the first cut point of VAR which is // ( _val_GEQ_cp ? > : >= ) than val within a margin CONFIDENCE. // Returns -1 if no cut point fullfil this property. int lastCPL (T val, int var, float confidence=0.0) const; // Returns the index of the last cut point of VAR which is // ( _val_GEQ_cp ? <= : < ) than val within a margin CONFIDENCE. // Returns -1 if no cut point fullfil this property. T value (int cpInV, int var) const; // Returns the value of the cpInV-th cut point of variable VAR. float original (int cpInV, int var) const; // Returns the original value associated with cut point (cpInV,VAR). // original(t,v) can differ with value(t,v) after a call to normalize. /////////////// // Operators // /////////////// cutPtsSet& normalize (multiDS& d); // Replaces in d each value of attribute X by an integer in range // {0,...,nbCP(X)}, and the cut points become values in {1,...,nbCP(X)}. cutPtsSet& reduce (multiDS& d, int method, float& separability, float confidence=0.0); // Reduce the number of cut points, while keeping the required separa- // bility towards the multiDS d. // // Separability: // For a pair of points (a,b), the separability of a cut point of value // S along variable V is given by a function Sep(S,a(V),b(V),V). The // separability of a set of cut points for a pair (a,b) is the sum of // the separabilities of each cut point. The separability of a set of // cut points for a multiple dataset is the least separability over all // pairs of points from different classes. // // Confidence: // Give a margin on both sides of a cut point within which a value is // neither above nor below a cut point. // // method==0: // All the cut points are sorted according to their weight, and the k // first achieving the required total separability are kept. Function // Sep is defined as follows: // Sep(S,a(V),b(V),V) = 0 if S<=min{a(V),b(V)} or S>=max{a(V),b(V)} // or a(V) < S < b(V) but V is negative // or a(V) > S > b(V) but V is positive // = 1 otherwise. // // method==1: // Similar to method==0, when the separability function is defined as: // Sep(S,a(V),b(V),V) = 0 if S<=min{a(V),b(V)} or S>=max{a(V),b(V)} // or a(V) < S < b(V) but V is negative // or a(V) > S > b(V) but V is positive // = minimum{|a(V)-S|,|b(V)-S|}/(max(V)-min(V)). // So, Sep(S,a(V),b(V),V) is in the interval [0,1/2]. // // method==2: // In this case, Sep is defined as in method==0. A set covering problem // is solved to find a small subset of cut points achieving the required // total separability. If compress>1, then at each step of the greedy // procedure solving the set covering problem, redundant rows or columns // are suppressed. // // method==3: // Sep is defined as in method==1. A small subset of cut points // achieving the required total separability is obtained by a greedy // procedure for the resolution of the integer linear program. // // method==-2,-3: // This is an informative option that does not reduce the set of cut // points. It presents a cumulative histogram of the separability // according to the number of cut points maintained, starting from those // with largest weights up to those with smallest weights. The function // Sep is defined as in -method. friend ostream& operator << (ostream& s, const cutPtsSet& myself); //////////////////////////////////////////////////////////////////////////////// private: cutPtsSet (int nbCP, int nbVars); // Allocates the memory space for data set but does not initialize it. cutPtsSet& resize (int nbCP, int nbVars); // Resizes the data set but does not initialize it. int index (int cpIndex) const; // Returns the rank of the cpIndex-th cut point, among cut points // for the same variable. (N.B. index(firstIndex(x))==0. ) // An error is raise if cpIndex is out of {0,...,nbCP()-1}. int firstIndex (int var) const; // Returns index of the first CP of VAR. int lastIndex (int var) const; // Returns index of the last CP of VAR. // Returns is indetermined if there is no cut point for VAR. T operator () (int cpIndex) const; // Returns 2 times the value of the cpIndex-th cut point among all. T operator () (int cpInV, int var) const; // Returns 2 times the value of the cpInV-th cut point of variable VAR. Matrix origin; Matrix weight; Matrix var; Matrix sigmaCP; Matrix span; // int nbAttr; // # of attributes with at least one cut point. }; tcT ostream& operator << (ostream& s, const cutPtsSet& myself); // Outputs a set of cut points on stream s. #endif