/* This file contains the definition of the class dataSet. A dataSet is a set of data. Each data is a vector of DIM components. Various tools are available to sort data by lexicographical order as well as to suppress repetitions. A label is associated to each data. September 8, 94 Creation: E. Mayoraz */ #ifndef dataSet_h #define dataSet_h #include #include "../Basics/basic.h" #include "../Matrices/Matrix.h" tcT class dataSet : private Matrix { public: ////////////////// // Constructors // ////////////////// dataSet (); // Default constructor. dataSet (const dataSet& d); // Copy constructor. dataSet (const Array& d, const Array& multiple, const Array& label, const Array& Monotonicity); dataSet (const Array& d, const Array& multiple, const Array& label); dataSet (const Array& d, const Array& label); // dataSet (const Array& d); dataSet (const Array& d, const dataSet& original); // Constructs a data set of M data of dim N from an MxN Matrix. // MULTIPLE and LABEL must be row vectors of size M. // Default values are 1 for MULTIPLE(i) and i for LABEL(i). // If original is provided MULTIPLE and LABEL are maintained, d.m() // must be equal to original.distinct(), while d.n() can be arbitrary. // If d.n() equal to original.distinct(), monotonicity is preserved. dataSet (const dataSet& from, const Array& varIndex); // Constructs a new data set by keeping only variables with given // indices from data set from. // Indices out of {0,...,from.nbVars()-1} are ignored. dataSet (const dataSet& from, double rate); // Creates a random subset of rate*from.nbData() rows of from. // An error is raised if rate in not in range [0,1]. ~dataSet (); // Default destructor: deallocate the space used by the data set. /////////////// // Selectors // /////////////// int dim() const; // dimension of each data int card() const; // total number of data int distinct() const; // number of distinguished data (see checkMult) Matrix matrix () const;// data set represented in matrix form, one row // per distinct data Array operator () (int dIndex) const; // data indexed by dIndex Matrix operator [] (int attrIndex) const; // column ass. to attrIndex T& operator () (int dIndex, int j) const; // its j-th component int multiplicity(int dIndex) const; // its currently known multipl. int label (int dIndex) const; // its label // An error is raised in this 4 routines if dIndex is out of // {0,...,distinct()-1}. int classCol() const; // return the index of the column representing // the classes, if any. /////////////// // Operators // /////////////// dataSet& operator = (dataSet& from); // Assignement of dataSet. Note that the size of *this will be // adjusted to the size of FROM. dataSet operator + (dataSet& d); // Returns the data set resulting of the union of *this and d. // No checking of identical data is done. // An error is raised if dim() != d.dim(). dataSet operator * (dataSet& d); // Returns the data set resulting of the intersection of *this d. // Labels and multiplicities of elements in the intersection are // those of *this. dataSet& add(Array& data, int label); // Add the data DATA with its label to the data set. // An error is raised if Data.n()!=dim(). dataSet& remove(int dIndex); // Remove from the data set all instances of data at index dIndex. // No effect if dIndex is out of range {0,...,distinct()}. dataSet& checkMult(int& rep); // Checks repetitions and return in REP the # of repetitions found. // After a call to this routine, only one copy of each data is stored // and the multiplicity is updated. dataSet& sort(); // Sort each data by lexicographical order. dataSet& reorganize(boolean reduce=true); // Reorder the data set and reduce the size if REDUCE. void split(dataSet& first, dataSet& second, double rate) const; // Split the dataset into two parts, the first is dimensioned by rate. friend ostream& operator << (ostream& s, const dataSet& myself); friend void operator << (const char* s, const dataSet& myself); friend void operator >> (const char* fileName, dataSet& myself); ////////////////////////////////////////////////////////////////////////////// private: dataSet (int nbData, int dim); // Allocates the memory space for data set but does not initialize it. dataSet& resize(int nbData=0, int dim=0); // Resizes the data set but does not initialize it. Matrix index; Matrix mult; Matrix ancest; int totCard; int nbDiff; int classcol; boolean sorted; boolean i_is_i; public: Matrix monotone; }; tcT ostream& operator << (ostream& s, const dataSet& myself); // Output data set on stream s. tcT void operator << (const char* s, const dataSet& myself); tcT void operator >> (const char* fileName, dataSet& myself); tcT boolean identical(const Array& first,const Array& second); #endif