/* data structure of class dataSet: private: Matrix index; Matrix mult; Matrix ancest; int totCard; int nbDiff; int type; // output we want: file or screen boolean sorted; boolean i_is_i; - *this is derived from an MxN matrix. Each row correspond to one data. - INDEX is a row vector of size at least NBDIFF and gives the order of data in the data set, the I-th data is in row INDEX(I) of the matrix. - MULT and ANCEST are row vectors of size M, MULT(INDEX(I)) gives the number of identical data represented by the INDEX(I)-th row, ANCEST(INDEX(I)) is the label of one particular data represented by the INDEX(I)-th row. - TOTCARD gives the total number of data in the data set. - NBDIFF gives the number of data that as currently been distinguished. Invariants: *this.m() == mult.n() == ancest.n() >= nbDiff index.n() >= nbDiff for all 0 <= i < nbDiff : 0 <= index(i) < *this.m() totCard = sum_{i=0,i for all 0 < i < nbDiff : *this(index(i-1)) <=lexi *this(index(i)) i_is_i <=> for all 0 <=i < nbDiff : index(i)==i */ #include "dataSet.h" #include #include #include //#include #define _CHECK_INDEX const char nonMonotonicAttr = 0; const char positiveAttr = 1; const char negativeAttr = 2; ////////////////// // Constructors // ////////////////// tcT dataSet::dataSet (int nbData, int dim) : Matrix(nbData,dim), index(nbData), mult(1,nbData,1), ancest(nbData), classcol(0), totCard(nbData), nbDiff(nbData), sorted(0), i_is_i(0), monotone() { } tcT dataSet& dataSet::resize(int nbData/**=0**/, int dim/**=0**/) { Matrix::resize(nbData,dim); index.resize(1,nbData); mult.resize(1,nbData,1); ancest.resize(1,nbData); return *this; } tcT /*inline*/ dataSet::dataSet () : Matrix(), index(), mult(), ancest(), classcol(0), totCard(0), nbDiff(0), sorted(0), i_is_i(0), monotone() { } tcT /*inline*/ dataSet::dataSet (const dataSet& d) : Matrix(d.distinct(),d.dim()), index(d.distinct()), mult(d.distinct()), ancest(d.distinct()), classcol(d.classcol),totCard(d.totCard), nbDiff(d.distinct()), sorted(d.sorted), i_is_i(true), monotone(d.monotone) { if ( distinct() && dim() ) if ( d.i_is_i ) { s(0,d.distinct(),0,d.dim()) = d.s(0,d.distinct(),0,d.dim()); index = d.index.s(0,1,0,d.distinct()); mult = d.mult.s(0,1,0,d.distinct()); ancest= d.ancest.s(0,1,0,d.distinct()); } else for (int i=0; i::~dataSet () { } tcT dataSet::dataSet (const Array& d, const Array& multiple, const Array& label, const Array& Monotonicity) : Matrix(d), index(d.m()), mult(multiple), ancest(label),classcol(0), totCard(0), nbDiff(0), sorted(0), i_is_i(0), monotone(Monotonicity) { #ifdef _CHECK_INDEX if ( d.m()!=multiple.n() || d.m()!=label.n() ) throw whereDimMismatch( "dataSet::dataSet(const Array&,const Array&,const Array&)", d.m(),d.m(),label.n(),multiple.n()); #endif nbDiff = d.m(); i_is_i = 1; for (int i=0; i::dataSet (const Array& d, const Array& multiple, const Array& label) : Matrix(d), index(d.m()), mult(multiple), ancest(label),classcol(0), totCard(0), nbDiff(0), sorted(0), i_is_i(0), monotone() { #ifdef _CHECK_INDEX if ( d.m()!=multiple.n() || d.m()!=label.n() ) throw whereDimMismatch( "dataSet::dataSet(const Array&,const Array&,const Array&)", d.m(),d.m(),label.n(),multiple.n()); #endif nbDiff = d.m(); i_is_i = 1; for (int i=0; i::dataSet (const Array& d, const Array& label) : Matrix(d), index(d.m()), mult(1,d.m(),1), ancest(label),classcol(0), totCard(0), nbDiff(0), sorted(0), i_is_i(0), monotone() { #ifdef _CHECK_INDEX if ( d.m()!=label.n() ) throw whereDimMismatch( "dataSet::dataSet(const Array&,const Array&)", 1,d.m(),1,label.n()); #endif nbDiff = d.m(); i_is_i = 1; for (int i=0; i::dataSet (const Array& d) : Matrix(d) //{ //index.resize(d.m()); mult.resize(1,d.m(),1); ancest.resize(d.m()); // totCard = d.m(); nbDiff = d.m(); sorted = 0; i_is_i = 1; // for (int i=0; i::dataSet (const Array& d, const dataSet& original) : Matrix(d), index(d.m()), mult(d.m()), ancest(d.m()),classcol(0), totCard(0), nbDiff(0), sorted(0), i_is_i(0), monotone() { #ifdef _CHECK_INDEX if ( d.m()!=original.distinct() ) throw whereDimMismatch( "dataSet::dataSet(const Array&,const dataSet&)", 1,d.m(),1,original.distinct()); #endif for (int i=0; i::dataSet (const dataSet& from, const Array& varIndex) : Array (from.distinct(), varIndex.n()), Matrix(from.distinct(), varIndex.n()), index(from.index), mult(from.mult), ancest(from.ancest), classcol(from.classcol), totCard(from.card()), nbDiff(from.distinct()), sorted(0), i_is_i(from.i_is_i), monotone() { int newdim = 0; for (int varI=0; varI=0 && varIndex(varI)=0 && varIndex(j)::dataSet (const dataSet& from, double rate) { if ( rate < 0.0 || rate > 1.0 ) throw whereOutOfRange( "dataSet::dataSet(dataSet& from, double rate)", "rate",int(rate),0,1); totCard = int(rate * double(from.card())); int rest = from.card(); int taken = 0; nbDiff = 0; Matrix wm(1, from.distinct(), 0); for (int row = 0; row < from.distinct(); row++) for (int m = 0; m < from.mult(from.index(row)); m++) { if ( randomR() < double(totCard-taken)/double(rest) ) { taken++; if ( wm(row)++ == 0 ) nbDiff++; } rest--; } resize(nbDiff, from.dim()); int ptr = 0; for (row = 0; row < wm.n(); row++) if ( wm(row) ) { s(ptr) = from.s(from.index(row)); index(ptr) = ptr; mult(ptr) = wm(row); ancest(ptr) = from.ancest(from.index(row)); ptr++; } sorted = from.sorted; i_is_i = 1; classcol = from.classcol; monotone.resize(from.monotone); } /////////////// // Selectors // /////////////// tcT /*inline*/ int dataSet::dim () const { return n(); } tcT /*inline*/ int dataSet::card () const { return totCard; } tcT /*inline*/ int dataSet::distinct () const { return nbDiff; } tcT /*inline*/ int dataSet::classCol () const { return classcol; } tcT /*inline*/ Matrix dataSet::matrix () const { if ( i_is_i ) return _(0,distinct()); Matrix res(distinct(),dim()); for (int r=0; r dataSet::operator () (int dIndex) const { #ifdef _CHECK_INDEX if ( dIndex<0 || dIndex>=distinct() ) throw whereOutOfRange( "Array dataSet::operator()(int dIndex) const", "dIndex",dIndex,0,distinct()-1); #endif return s(index(dIndex),1,0,dim()); } tcT Matrix dataSet::operator [] (int attrIndex) const { #ifdef _CHECK_INDEX if ( attrIndex<0 || attrIndex>=dim() ) throw whereOutOfRange( "Matrix dataSet::operator[](int attrIndex) const", "attrIndex",attrIndex,0,dim()-1); #endif if ( i_is_i ) return _(0,distinct(),attrIndex,1); Matrix res(distinct(),1); for (int r=0; r::operator () (int dIndex, int j) const { #ifdef _CHECK_INDEX if ( dIndex<0 || dIndex>=distinct() ) throw whereOutOfRange( "T& dataSet::operator()(int dIndex, int j) const", "dIndex",dIndex,0,distinct()-1); if ( j<0 || j>=dim() ) throw whereOutOfRange( "T& dataSet::operator()(int dIndex, int j) const", "j",j,0,dim()-1); #endif return Matrix::operator()(index(dIndex),j); } tcT int dataSet::multiplicity (int dIndex) const { #ifdef _CHECK_INDEX if ( dIndex<0 || dIndex>=distinct() ) throw whereOutOfRange( "int dataSet::multiplicity(int dIndex) const", "dIndex",dIndex,0,distinct()-1); #endif return mult(index(dIndex)); } tcT int dataSet::label (int dIndex) const { #ifdef _CHECK_INDEX if ( dIndex<0 || dIndex>=distinct() ) throw whereOutOfRange( "int dataSet::label(int dIndex) const", "dIndex",dIndex,0,distinct()-1); #endif return ancest(index(dIndex)); } /////////////// // Operators // /////////////// tcT dataSet& dataSet::operator = (dataSet& from) { if ( m()!=from.m() || n()!=from.n() || index.n()!=from.index.n() ) resize(from.m(), from.n()); for (int i=0; i dataSet::operator + (dataSet& d) { if ( dim()!=d.dim() ) throw whereDimMismatch( "dataSet dataSet::operator+(dataSet&)", 1,dim(),1,d.dim()); dataSet uni(distinct()+d.distinct(), dim()); for (int i=0; i& first,const Array& second) { for(int i=0; i< first.m();i++) for(int j=0; j < first.n();j++) if(first(i,j)!=second(i,j) && known(first(i,j)) && known(second(i,j))) return(false); return(true); } tcT dataSet dataSet::operator * (dataSet& d) { int interCard=0; if ( dim()==d.dim() ) for (int left = 0; left < distinct(); left++) for (int right = 0; right < d.distinct(); right++) interCard += identical(s(left),d.s(right)); dataSet inter(interCard,dim()); inter.totCard=0; inter.nbDiff=0; inter.i_is_i=true; inter.sorted=sorted; inter.monotone.resize(monotone); if ( interCard ) { for (int left=0; left& dataSet::add(Array& data, int label) { if ( data.n()!=dim() ) throw whereDimMismatch( "dataSet& dataSet::add(Array&,int)", 1,data.n(),1,dim()); else { nbDiff++; totCard++; sorted=0; if ( nbDiff>m() || !i_is_i ) { int newM = m()+1; resize_(newM,n()); index.resize_(newM); mult.resize_(newM); ancest.resize_(newM); } int freePtr = ( i_is_i ? nbDiff-1 : m()-1 ); s(freePtr,1) = data.s(0,1); index(nbDiff-1) = freePtr; mult(freePtr) = 1; ancest(freePtr) = label; } return *this; } tcT dataSet& dataSet::remove(int dIndex) { if ( dIndex>=0 && dIndex& dataSet::checkMult (int& rep) { if ( nbDiff < 2 ) return *this; rep = 0; int lastNew = 0; // the first is a fortiori new if ( sorted ) { for (int right = 1; right < distinct(); right++) if ( s(index(right-1)) != s(index(right)) ) { if ( ++lastNew < right ) { index(lastNew) = index(right); i_is_i = 0; } } else { mult(index(lastNew)) += mult(index(right)); rep++; } } else // ( !sorted ) { for (int right = 1; right < distinct(); right++) { int isNew = 1; // let be otpimistic for (int left = 0; (left <= lastNew) && isNew; left++) if ( s(index(left)) == s(index(right)) ) { isNew = 0; mult(index(left)) += mult(index(right)); rep++; } if ( isNew && (++lastNew < right) ) { index(lastNew) = index(right); i_is_i = 0; } } } nbDiff = ++lastNew; return *this; } tcT dataSet& dataSet::sort () { if ( nbDiff < 2 ) { sorted=true; return *this; } if ( !sorted ) { Matrix ind(index.s(0,1,0,nbDiff)); Array::sort(ind); if ( index.s(0,1,0,nbDiff) != ind ) { index.s(0,1,0,nbDiff) = ind; i_is_i = 0; } sorted = 1; } return *this; } tcT dataSet& dataSet::reorganize(boolean reduce/**=true**/) { if ( nbDiff == m() || !reduce ) { Matrix used(nbDiff); for (int i=0; i tempRow(1,dim()); int tempMult, tempAncest; while ( head < nbDiff && used(head) ) head++; while ( head < nbDiff ) { tempRow = s (head); tempMult = mult (head); tempAncest = ancest(head); hand = head; used(head) = 1; do { s (hand) = s (index(hand)); mult (hand) = mult (index(hand)); ancest(hand) = ancest(index(hand)); hand = index(hand); used(hand) = 1; } while ( index(hand) != head ); s (hand) = tempRow; mult (hand) = tempMult; ancest(hand) = tempAncest; while ( ++head newD(nbDiff,dim()); Matrix newIndex(1,nbDiff); Matrix newMult(1,nbDiff); Matrix newAncest(1,nbDiff); for (int i=0; i::split(dataSet& first, dataSet& second, double rate) const { if ( rate < 0.0 || rate > 1.0 ) throw whereOutOfRange( "void dataSet::split(dataSet&,dataSet&,double rate) const", "rate",int(rate),0,1); first.totCard = int(rate * double(card())); second.totCard = card() - first.totCard; int rest = card(); int taken = 0; first.nbDiff = 0; second.nbDiff = 0; Matrix wm(1, distinct(), 0); for (int row = 0; row < distinct(); row++) { for (int m = 0; m < mult(index(row)); m++) { if ( randomR() < double(first.totCard-taken)/double(rest) ) { taken++; wm(row)++; } rest--; } if ( wm(row) ) first.nbDiff++; if ( wm(row) < mult(index(row)) ) second.nbDiff++; } first.resize (first.nbDiff, dim()); second.resize(second.nbDiff, dim()); int ptrF = 0; int ptrS=0; for (row = 0; row < wm.n(); row++) { if ( wm(row) ) { first.s(ptrF) = s(index(row)); first.index(ptrF) = ptrF; first.mult(ptrF) = wm(row); first.ancest(ptrF) = ancest(index(row)); ptrF++; } if ( wm(row) < mult(index(row)) ) { second.s(ptrS) = s(index(row)); second.index(ptrS) = ptrS; second.mult(ptrS) = mult(index(row))-wm(row); second.ancest(ptrS) = ancest(index(row)); ptrS++; } } first.sorted=sorted; second.sorted=sorted; first.i_is_i=1; second.i_is_i=1; first.monotone.resize(monotone); second.monotone.resize(monotone); } // This is new procedure to write the binarized data back into a file for // later use. Mario tcT void operator << (const char* s, const dataSet& myself) { ofstream out_file(s); out_file.setf(ios::showpoint); if ( uses_dont_know ) out_file << "don't_know " << setw(3) << dont_know() << endl; out_file << setw(3) << "label_column " << 1 << "\n" << setw(3) << "multiplicity_column " << 2 << "\n" << setw(3) << "class_column " << myself.classcol+3 << "\n" ; if ( myself.monotone.n() ) { out_file << "Monotonicity"; for (int i=0; i=T(-9)) { out_file << coefWidth(2) << coefPrecision(0); } else { out_file << coefWidth(6) << coefPrecision(2); } for (int row = 0; row < myself.distinct(); row++) out_file << setw(3) << myself.label(row) << " " << setw(3) << myself.multiplicity(row) << " " << coefPerLine(0)<< myself(row); } out_file.close(); } tcT ostream& operator << (ostream& s, const dataSet& myself) { if ( myself.monotone.n() ) { s << " "; for (int var=0; var> (char* fileName, dataSet&)", message); } } tcT void operator >> (const char* fileName, dataSet& myself) { ifstream in(fileName); int aux_label=1; if ( !in ) { char message[100]; sprintf(message, "void operator >> (char* fileName, dataSet&) , [fileName=%s]", fileName); throw whereWhich(message,"File open error"); } // read the header a first time int mult=-1; int label=-1; int classC=0; boolean monotone=false; boolean isfactor=false; boolean isoffset=false; dsKeyWord kwd; while ( (kwd=readKeyWord(in))!=noKeyWord ) { switch( kwd ) { case dontKnow: T dn; in >> dn; set_dont_know(dn); break; case labelCol: in >> label; --label; break; case multiCol: in >> mult; --mult; break; case classCol: in >> classC; --classC; break; case monotonicity: monotone=true; eatLine(in); break; case factor: isfactor=true; eatLine(in); break; case offset: isoffset=true; eatLine(in); break; } } // count the number of elements in a row int dim=0; double bidon; do { in >> bidon; dim++; eatSpace(in); if ( in.peek()==',' || in.peek()==';' ) { in.get(); eatSpace(in); } } while ( in.peek()!='\n' && in.peek()!='\f' && in.peek()!=EOF ); // count the number of rows eatLine(in); boolean whiteLine=false; int nbData=1; while ( !in.eof() && !whiteLine ) { whiteLine = !eatLine(in); if ( !whiteLine ) nbData++; } in.close(); // adjust classcol according to the existence of other columns { myself.classcol=classC; if (label>-1 && label-1 && mult -1) --dim; if (label >-1) --dim; if (monotone) myself.monotone.resize(dim-1); Matrix myfactor, myoffset; if (isfactor) myfactor.resize(dim-1); if (isoffset) myoffset.resize(dim-1); myself.resize(nbData, dim); myself.totCard = nbData; myself.nbDiff = nbData; myself.sorted = 0; myself.i_is_i=1; // read the header a second time in.open(fileName,ios::in); while ( (kwd=readKeyWord(in))!=noKeyWord ) { switch( kwd ) { case dontKnow: case labelCol: case multiCol: case classCol: in >> bidon; break; case factor : in >> myfactor; break; case offset : in >> myoffset; break; case monotonicity: for (int k=0; k> (char* fileName, dataSet&) , [fileName=%s]", fileName); throw whereWhich(message,"Syntax error: (/, \\ or ~ expected)"); } in.get(); eatSpace(in); } break; } } for (int i=0; i> myself.ancest(i);} else if (j==mult) {in >> myself.mult(i); } else if (j==classC) {in >> myself(i,attr++);} else { in >> bidon; if ( isfactor ) bidon *= myfactor(attrEff); if ( isoffset ) bidon += myoffset(attrEff); myself(i,attr++) = T(bidon); attrEff++; } if (!in) { char message[100]; sprintf(message, "void operator >> (char* fileName, dataSet&) , [fileName=%s]", fileName); throw whereWhich(message,"Syntax error in file"); } } } in.close(); }