This post uses the svm.h and svm.cpp files provided by libsvm and wraps them in a hand-written ClassificationSVM class with three main functions: reading data, training, and prediction. In other words, it calls the libsvm library interfaces directly instead of merely running the pre-built svm-train.exe shipped with the package (svm-train.exe itself is just the compiled svm-train.c).
The main function interfaces provided by libsvm are:
Training: struct svm_model *svm_train(const struct svm_problem *prob, const struct svm_parameter *param);
Prediction: double svm_predict(const struct svm_model *model, const struct svm_node *x);
Saving and loading a model: int svm_save_model(const char *model_file_name, const struct svm_model *model); and struct svm_model *svm_load_model(const char *model_file_name);
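To show how these calls fit together, here is a minimal, self-contained sketch that trains on two hard-coded one-dimensional samples, saves the model, reloads it and predicts. The file name model.txt and all parameter values are placeholders of mine, not something taken from the original post:

#include <cstdio>
#include "svm.h"

int main()
{
    // Two training samples with one feature each; a node with index -1 terminates a sample.
    svm_node x0[] = { { 1, 0.0 }, { -1, 0.0 } };
    svm_node x1[] = { { 1, 1.0 }, { -1, 0.0 } };
    svm_node *samples[] = { x0, x1 };
    double labels[] = { 0, 1 };

    svm_problem prob;
    prob.l = 2;           // number of samples
    prob.x = samples;     // feature vectors
    prob.y = labels;      // labels

    svm_parameter param = {};   // zero-initialize, then fill in what we need
    param.svm_type = C_SVC;
    param.kernel_type = RBF;
    param.gamma = 1.0;          // placeholder values; tune for real data
    param.C = 1.0;
    param.cache_size = 100;     // kernel cache in MB
    param.eps = 1e-3;           // stopping tolerance

    svm_model *model = svm_train(&prob, &param);
    svm_save_model("model.txt", model);

    svm_model *loaded = svm_load_model("model.txt");
    printf("prediction for x1: %f\n", svm_predict(loaded, x1));

    svm_free_and_destroy_model(&model);
    svm_free_and_destroy_model(&loaded);
    return 0;
}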
As long as the input data is arranged in the form that svm.cpp expects (the svm_problem and svm_node structures), that is enough.
Of course, you also have to pick a suitable kernel function and parameters.
The libsvm authors propose that beginners try the following procedure:
1. Transform the data to the format of an SVM package (read the training data and store it in the required format).
2. Conduct simple scaling on the data (normalization).
3. Consider the RBF kernel (radial basis function).
4. Use cross-validation to find the best parameters C and gamma (a code sketch of this search is given below).
5. Use the best C and gamma to train on the whole training set, then test.
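For step 4, libsvm exposes svm_cross_validation directly, so the grid search can be written in a few lines. The following is only a rough sketch under the assumption that prob already holds the training data and that the fixed fields of param (svm_type, kernel_type, eps, cache_size, ...) are already set; the grid ranges are illustrative:

#include <cmath>
#include <vector>
#include "svm.h"

// Coarse grid search over C and gamma with 5-fold cross-validation.
void selectBestParams(const svm_problem &prob, svm_parameter &param)
{
    double bestC = 1, bestGamma = 1, bestAccuracy = 0;
    std::vector<double> target(prob.l);   // labels predicted during cross-validation

    for (int logC = -5; logC <= 15; logC += 2) {
        for (int logG = -15; logG <= 3; logG += 2) {
            param.C = std::pow(2.0, logC);
            param.gamma = std::pow(2.0, logG);
            svm_cross_validation(&prob, &param, 5, target.data());   // 5-fold CV

            int correct = 0;
            for (int i = 0; i < prob.l; ++i)
                if (target[i] == prob.y[i]) ++correct;
            double accuracy = 1.0 * correct / prob.l;

            if (accuracy > bestAccuracy) {
                bestAccuracy = accuracy;
                bestC = param.C;
                bestGamma = param.gamma;
            }
        }
    }
    param.C = bestC;        // retrain the final model on the whole set with these values
    param.gamma = bestGamma;
}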
libSVM is a very well-known open-source SVM library. I have recently been working on a classification task that needs an SVM for the final classification step, but most of the libSVM material online covers the MATLAB interface or calling the tools from the DOS command line; introductions to programming directly against the libSVM functions are rare, so here is a rough account of how I used it.
My own understanding of libSVM is not that deep either; I simply use it for training and recognition.
1. Environment setup
I develop with VS2013 and C++. First download the latest libSVM release from http://www.csie.ntu.edu.tw/~cjlin/libsvm/ and unzip it; among other things, the package contains the svm.cpp and svm.h sources we will use.
Create a new empty project in VS2013, copy svm.cpp and svm.h from the unzipped directory into the project directory, and add both files to the project via right-click on the project, then Add, then Existing Item.
That is all it takes to set up the environment. Two notes. First, VS2013 reports an error on fopen because it treats the function as unsafe; you can fix this by adding _CRT_SECURE_NO_WARNINGS under project right-click, Properties, C/C++, Preprocessor, Preprocessor Definitions. Second, strdup does not compile in VS2013 either; following the compiler's hint, simply rename it to _strdup.
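If you prefer not to touch the project settings, defining the same macro at the top of each affected source file works as well; this is just a minimal alternative, not part of the original setup:

// Must appear before any standard header is included.
#define _CRT_SECURE_NO_WARNINGS
#include <cstdio>   // fopen, fscanf, ... now compile without the "unsafe function" errors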
2. Reading the feature file

I feel there is a misleading impression about libsvm floating around: that your feature file must follow a specific format before it can be read and used for training. That requirement only applies when you drive libsvm from the command line, because the feature-file format libsvm defines is matched to its own reading code. If you write your own file-reading function, you are not bound by that format at all; your reading code just has to match whatever format your own feature file uses. In libsvm, the type that holds the training data is svm_problem, a struct with three members:
struct svm_problem
{
    int l;                  // total number of training samples (libsvm names this field l)
    double *y;              // label of each sample
    struct svm_node **x;    // features of all samples; one row stores one sample's features
};

The svm_node type is defined as follows:
struct svm_node      // stores a single feature of one sample
{
    int index;       // dimension index of this feature in the feature space
    double value;    // value of this feature
};

A sample is therefore an array of svm_node entries, one per feature, terminated by a node whose index is -1.
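To make the layout concrete, here is a small hand-written example of one sparse sample whose only nonzero features sit in dimensions 1, 3 and 7; the numbers are made up for illustration:

// One sample with nonzero features only in dimensions 1, 3 and 7.
// index is the 1-based dimension number, value is the feature value;
// the node with index == -1 marks the end of the sample.
svm_node sparseSample[] = {
    {  1,  0.5 },
    {  3,  2.0 },
    {  7, -1.2 },
    { -1,  0.0 }   // end marker; the value here is ignored
};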
Once you know how libsvm stores features, you can write your own file-reading function. In my case, the feature file stores one sample after another: FEATUREDIM feature values followed by the sample's name.
My file-reading function looks like this:
void ClassificationSVM::readTrainData(const string& featureFileName)
{
    FILE *fp = fopen(featureFileName.c_str(), "r");
    if (fp == NULL)
    {
        cout << "open feature file error!" << endl;
        return;
    }
    // Remember the end position so we can tell when the whole file has been read.
    fseek(fp, 0L, SEEK_END);
    long end = ftell(fp);
    fseek(fp, 0L, SEEK_SET);
    long start = ftell(fp);
    // Read samples until the end of the file.
    while (start != end)
    {
        // FEATUREDIM is the feature dimension; one extra node is needed for the end marker.
        svm_node* features = new svm_node[FEATUREDIM + 1];
        for (int k = 0; k < FEATUREDIM; k++)
        {
            double value = 0;
            fscanf(fp, "%lf", &value);
            features[k].index = k + 1;   // feature indices start at 1
            features[k].value = value;   // feature value
        }
        features[FEATUREDIM].index = -1; // end marker
        fscanf(fp, "\n");                // skip whitespace before the sample name
        char name[100];
        fgets(name, 100, fp);
        name[strlen(name) - 1] = '\0';   // strip the trailing newline
        // negative samples are labelled 0, positive samples 1
        int type = 0;
        if (featureFileName == "PositiveFeatures.txt")
            type = 1;
        dataList.push_back(features);
        typeList.push_back(type);
        sampleNum++;
        start = ftell(fp);
    }
    fclose(fp);
}

Here dataList and typeList hold, respectively, the feature vectors and the corresponding labels (positive or negative) of the samples.
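The original code never shows the ClassificationSVM class itself. Based on the members used above, a plausible skeleton looks like the following; only the member names come from the code, while the container types and the FEATUREDIM value are my assumptions:

#include <list>
#include <string>
#include "svm.h"

const int FEATUREDIM = 128;   // feature dimension; placeholder, set it to match your features

class ClassificationSVM
{
public:
    ClassificationSVM();
    void train(const std::string& modelFileName);
    void predict(const std::string& featureFileName, const std::string& modelFileName);

private:
    void readTrainData(const std::string& featureFileName);

    std::list<svm_node*> dataList;  // feature vector of every training sample
    std::list<int> typeList;        // label (0 = negative, 1 = positive) of every sample
    int sampleNum;                  // number of samples read so far
    svm_problem prob;               // handed to svm_train
    svm_parameter param;            // training parameters, filled in the constructor
};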
3. SVM training and recognition

The training code is as follows:
void ClassificationSVM::train(const string& modelFileName)
{
    cout << "reading positive features..." << endl;
    readTrainData("PositiveFeatures.txt");
    cout << "reading negative features..." << endl;
    readTrainData("NegativeFeatures.txt");
    cout << sampleNum << endl;

    prob.l = sampleNum;                 // number of training samples
    prob.x = new svm_node *[prob.l];    // feature vectors of all training samples
    prob.y = new double[prob.l];        // labels of all training samples
    int index = 0;
    while (!dataList.empty())
    {
        prob.x[index] = dataList.front();
        prob.y[index] = typeList.front();
        dataList.pop_front();
        typeList.pop_front();
        index++;
    }

    cout << "start training" << endl;
    svm_model *svmModel = svm_train(&prob, &param);
    cout << "save model" << endl;
    svm_save_model(modelFileName.c_str(), svmModel);
    cout << "done!" << endl;
}

prob is an object of type svm_problem; the loop simply moves all the features read earlier into it. svm_train and svm_save_model are libsvm's own functions, declared in svm.h.
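One thing the snippet above glosses over is where the member param gets filled in. A reasonable approach, sketched below under the assumption that it happens in the constructor, is to start from the same defaults the svm-train command-line tool uses and then tune C and gamma for your data:

ClassificationSVM::ClassificationSVM()
{
    sampleNum = 0;

    param.svm_type = C_SVC;          // standard C-support vector classification
    param.kernel_type = RBF;         // radial basis function kernel
    param.degree = 3;                // only used by the polynomial kernel
    param.gamma = 1.0 / FEATUREDIM;  // RBF width; svm-train defaults to 1/num_features
    param.coef0 = 0;
    param.nu = 0.5;
    param.cache_size = 100;          // kernel cache, in MB
    param.C = 1;                     // penalty parameter, tune via cross-validation
    param.eps = 1e-3;                // stopping tolerance
    param.p = 0.1;
    param.shrinking = 1;
    param.probability = 0;
    param.nr_weight = 0;             // no per-class weighting
    param.weight_label = NULL;
    param.weight = NULL;
}

Before training, it is also worth calling svm_check_parameter(&prob, &param); it returns NULL when the combination is valid and an error message otherwise.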
The classification code is as follows:

void ClassificationSVM::predict(const string& featureFileName, const string& modelFileName)
{
    std::vector<bool> judgeRight;
    svm_model *svmModel = svm_load_model(modelFileName.c_str());

    FILE *fp;
    if ((fp = fopen(featureFileName.c_str(), "rt")) == NULL)
        return;
    fseek(fp, 0L, SEEK_END);
    long end = ftell(fp);
    fseek(fp, 0L, SEEK_SET);
    long start = ftell(fp);

    while (start != end)
    {
        svm_node* input = new svm_node[FEATUREDIM + 1];
        for (int k = 0; k < FEATUREDIM; k++)
        {
            double value = 0;
            fscanf(fp, "%lf", &value);
            input[k].index = k + 1;
            input[k].value = value;
        }
        input[FEATUREDIM].index = -1;    // end marker
        fscanf(fp, "\n");                // skip whitespace before the sample name
        char name[100];
        fgets(name, 100, fp);
        name[strlen(name) - 1] = '\0';

        // svm_predict returns the predicted label as a double
        double predictValue = svm_predict(svmModel, input);
        if (featureFileName == "positive_test.txt")
            judgeRight.push_back(predictValue != 0);
        else if (featureFileName == "negative_test.txt")
            judgeRight.push_back(predictValue != 1);

        start = ftell(fp);
    }
    fclose(fp);

    // Compute the fraction of correctly classified test samples.
    int correctNum = 0;
    int totalNum = judgeRight.size();
    for (int i = 0; i < totalNum; i++)
    {
        if (judgeRight[i])
            correctNum++;
    }
    double percent = 1.0 * correctNum / totalNum;
    cout << percent << endl;
}

The classification code is very similar to what came before, so I will not go through it again. One final note: the standard SVM is inherently a binary classifier, so multi-class problems need extra handling; libsvm already does this internally with a one-versus-one scheme, which means that for more than two classes you simply put more than two distinct label values into prob.y.
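As a small illustration of the multi-class case, the following sketch asks a trained (or loaded) model how many classes it knows and classifies one sample; model and input are assumed to have been obtained the same way as in the code above:

#include <iostream>
#include <vector>
#include "svm.h"

// Report what a trained multi-class model knows and classify one sample.
void reportAndPredict(const svm_model *model, const svm_node *input)
{
    int nrClass = svm_get_nr_class(model);        // number of classes seen during training
    std::vector<int> labels(nrClass);
    svm_get_labels(model, labels.data());         // the label values themselves, e.g. 0, 1, 2

    double predicted = svm_predict(model, input); // returns one of those label values
    std::cout << "classes: " << nrClass
              << ", predicted label: " << predicted << std::endl;
}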