diff --git a/CMakeLists.txt b/CMakeLists.txt index a709ecb..540861f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,6 +64,7 @@ add_executable( example_17-01 example_17-01.cpp ) add_executable( example_18-01 example_18-01.cpp ) add_executable( example_20-01 example_20-01.cpp ) add_executable( example_20-02 example_20-02.cpp ) +add_executable( example_21-01 example_21-01.cpp ) #... target_link_libraries( example_02-01 ${OpenCV_LIBS} ) @@ -120,4 +121,5 @@ target_link_libraries( example_17-01 ${OpenCV_LIBS} ) target_link_libraries( example_18-01 ${OpenCV_LIBS} ) target_link_libraries( example_20-01 ${OpenCV_LIBS} ) target_link_libraries( example_20-02 ${OpenCV_LIBS} ) +target_link_libraries( example_21-01 ${OpenCV_LIBS} ) #... diff --git a/example_21-01.cpp b/example_21-01.cpp new file mode 100644 index 0000000..a963916 --- /dev/null +++ b/example_21-01.cpp @@ -0,0 +1,106 @@ +#include +#include +#include +using namespace std; +using namespace cv; +int main(int argc, char *argv[]) { + // If the caller gave a filename, great. Otherwise, use a default. + // + const char *csv_file_name = argc >= 2 ? argv[1] : "agaricus-lepiota.data"; + cout << "OpenCV Version: " << CV_VERSION << endl; + // Read in the CSV file that we were given. + // + cv::Ptr data_set = + cv::ml::TrainData::loadFromCSV(csv_file_name, + // Input file name + 0, + // Header lines (ignore this many) + 0, + // Responses are (start) at thie column + 1, + // Inputs start at this column + "cat[0-22]" + // All 23 columns are categorical + ); + // Use defaults for delimeter (',') and missch ('?') + // Verify that we read in what we think. 
+ // + int n_samples = data_set->getNSamples(); + if (n_samples == 0) { + cerr << "Could not read file: " << csv_file_name << endl; + exit(-1); + } else { + cout << "Read " << n_samples << " samples from " << csv_file_name << endl; + } + // Split the data, so that 90% is train data + // + data_set->setTrainTestSplitRatio(0.90, false); + int n_train_samples = data_set->getNTrainSamples(); + int n_test_samples = data_set->getNTestSamples(); + cout << "Found " << n_train_samples << " Train Samples, and " + << n_test_samples << " Test Samples" << endl; + // Create a DTrees classifier. + // + cv::Ptr dtree = cv::ml::RTrees::create(); + // set parameters + // + // These are the parameters from the old mushrooms.cpp code + // Set up priors to penalize "poisonous" 10x as much as "edible" + // + float _priors[] = {1.0, 10.0}; + cv::Mat priors(1, 2, CV_32F, _priors); + dtree->setMaxDepth(8); + dtree->setMinSampleCount(10); + dtree->setRegressionAccuracy(0.01f); + dtree->setUseSurrogates(false /* true */); + dtree->setMaxCategories(15); + dtree->setCVFolds(0 /*10*/); // nonzero causes core dump + dtree->setUse1SERule(true); + dtree->setTruncatePrunedTree(true); + // dtree->setPriors( priors ); + dtree->setPriors(cv::Mat()); // ignore priors for now... + // Now train the model + // NB: we are only using the "train" part of the data set + // + dtree->train(data_set); + // Having successfully trained the data, we should be able + // to calculate the error on both the training data, as well + // as the test data that we held out. 
+ // + cv::Mat results; + float train_performance = dtree->calcError(data_set, false, + // use train data + results // cv::noArray() + ); + std::vector names; + data_set->getNames(names); + Mat flags = data_set->getVarSymbolFlags(); + // Compute some statistics on our own: + // + { + cv::Mat expected_responses = data_set->getResponses(); + int good = 0, bad = 0, total = 0; + for (int i = 0; i < data_set->getNTrainSamples(); ++i) { + float received = results.at(i, 0); + float expected = expected_responses.at(i, 0); + cv::String r_str = names[(int)received]; + cv::String e_str = names[(int)expected]; + cout << "Expected: " << e_str << ", got: " << r_str << endl; + if (received == expected) + good++; + else + bad++; + total++; + } + cout << "Correct answers: " <<(float(good)/total) <<" % " << endl; + cout << "Incorrect answers: " << (float(bad) / total) << "%" + << endl; + } + float test_performance = dtree->calcError(data_set, true, + // use test data + results // cv::noArray() + ); + cout << "Performance on training data: " << train_performance << "%" << endl; + cout << "Performance on test data: " < + Organization: Dept. of Computer Methods, UMK + + I have attached a file containing logical rules for mushrooms. + It should be helpful for other people since only in the last year I + have seen about 10 papers analyzing this dataset and obtaining quite + complex rules. We will try to contribute other results later. + + With best regards, Wlodek Duch + ________________________________________________________________ + + Logical rules for the mushroom data sets. + + Logical rules given below seem to be the simplest possible for the + mushroom dataset and therefore should be treated as benchmark results. 
+ + Disjunctive rules for poisonous mushrooms, from most general + to most specific: + + P_1) odor=NOT(almond.OR.anise.OR.none) + 120 poisonous cases missed, 98.52% accuracy + + P_2) spore-print-color=green + 48 cases missed, 99.41% accuracy + + P_3) odor=none.AND.stalk-surface-below-ring=scaly.AND. + (stalk-color-above-ring=NOT.brown) + 8 cases missed, 99.90% accuracy + + P_4) habitat=leaves.AND.cap-color=white + 100% accuracy + + Rule P_4) may also be + + P_4') population=clustered.AND.cap_color=white + + These rules involve 6 attributes (out of 22). Rules for edible + mushrooms are obtained as negation of the rules given above, for + example the rule: + + odor=(almond.OR.anise.OR.none).AND.spore-print-color=NOT.green + + gives 48 errors, or 99.41% accuracy on the whole dataset. + + Several slightly more complex variations on these rules exist, + involving other attributes, such as gill_size, gill_spacing, + stalk_surface_above_ring, but the rules given above are the simplest + we have found. + + +4. Relevant Information: + This data set includes descriptions of hypothetical samples + corresponding to 23 species of gilled mushrooms in the Agaricus and + Lepiota Family (pp. 500-525). Each species is identified as + definitely edible, definitely poisonous, or of unknown edibility and + not recommended. This latter class was combined with the poisonous + one. The Guide clearly states that there is no simple rule for + determining the edibility of a mushroom; no rule like ``leaflets + three, let it be'' for Poisonous Oak and Ivy. + +5. Number of Instances: 8124 + +6. Number of Attributes: 22 (all nominally valued) + +7. Attribute Information: (classes: edible=e, poisonous=p) + 1. cap-shape: bell=b,conical=c,convex=x,flat=f, + knobbed=k,sunken=s + 2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s + 3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r, + pink=p,purple=u,red=e,white=w,yellow=y + 4. bruises?: bruises=t,no=f + 5.
odor: almond=a,anise=l,creosote=c,fishy=y,foul=f, + musty=m,none=n,pungent=p,spicy=s + 6. gill-attachment: attached=a,descending=d,free=f,notched=n + 7. gill-spacing: close=c,crowded=w,distant=d + 8. gill-size: broad=b,narrow=n + 9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, + green=r,orange=o,pink=p,purple=u,red=e, + white=w,yellow=y + 10. stalk-shape: enlarging=e,tapering=t + 11. stalk-root: bulbous=b,club=c,cup=u,equal=e, + rhizomorphs=z,rooted=r,missing=? + 12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s + 13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s + 14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, + pink=p,red=e,white=w,yellow=y + 15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, + pink=p,red=e,white=w,yellow=y + 16. veil-type: partial=p,universal=u + 17. veil-color: brown=n,orange=o,white=w,yellow=y + 18. ring-number: none=n,one=o,two=t + 19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l, + none=n,pendant=p,sheathing=s,zone=z + 20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r, + orange=o,purple=u,white=w,yellow=y + 21. population: abundant=a,clustered=c,numerous=n, + scattered=s,several=v,solitary=y + 22. habitat: grasses=g,leaves=l,meadows=m,paths=p, + urban=u,waste=w,woods=d + +8. Missing Attribute Values: 2480 of them (denoted by "?"), all for + attribute #11. + +9. Class Distribution: + -- edible: 4208 (51.8%) + -- poisonous: 3916 (48.2%) + -- total: 8124 instances diff --git a/mushroom/citation b/mushroom/citation new file mode 100644 index 0000000..382bd48 --- /dev/null +++ b/mushroom/citation @@ -0,0 +1,3 @@ +This dataset was obtained from: + +Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
diff --git a/mushroom/expanded.Z b/mushroom/expanded.Z new file mode 100644 index 0000000..21e54b2 Binary files /dev/null and b/mushroom/expanded.Z differ