#include #include #include #include #include #include #include #include #include #include #include #include using namespace std; mt19937 mt; int K = 0; // number of cluster class stopwatch{ double begin, tmp, end, all; public: stopwatch(); ~stopwatch(); void start(); void stop(); void all_stop(); double show(); double all_show(); }; //measure time stopwatch::stopwatch(){ begin = end = all = tmp = 0.0; } stopwatch::~stopwatch(){ // cout<<"StopWatch" << endl; show(); } void stopwatch::start(){ begin=(double)clock()/CLOCKS_PER_SEC; tmp = begin; } void stopwatch::stop(){ end = (double)clock()/CLOCKS_PER_SEC; } void stopwatch::all_stop(){ all = (double)clock()/CLOCKS_PER_SEC; } double stopwatch::show(){ double interval = end - tmp; // cout << end - tmp << endl; tmp = end; return interval; } double stopwatch::all_show(){ //cout << all - begin << endl; return all - begin; } vector > Nomarize_matrix(vector > Data_matrix)// normalize data , average 0 , variance 1 { vector > N_matrix; for(int i=0; i<(int)Data_matrix.size(); i++) { vector tmp_row; double ave = 0; double div = 0; for(int j=0; j<(int)Data_matrix[i].size(); j++) { ave += Data_matrix[i][j]; } ave = ave/(double)Data_matrix[i].size(); for(int j=0; j<(int)Data_matrix[i].size(); j++) { div += pow(ave - Data_matrix[i][j], 2); } div = sqrt(div/(double)(Data_matrix[i].size())); if(div != 0) { for(int j=0; j<(int)Data_matrix[i].size(); j++) { tmp_row.push_back((Data_matrix[i][j] - ave)/div); } N_matrix.push_back(tmp_row); } else { cout << "error: data variance zero " << i+1 << endl; exit(1); } } return N_matrix; } vector > read_matrix(const char* filename) //read data matrix { string line; ifstream fin; fin.open(filename); if (!fin){ cout << "error : file not found" << endl; exit(1); } // if fin==0 file not exist vector > matrix; int r_num = 0; int d_num = -1; while(getline(fin,line)) { vector tmp_row; double tmp_n; if(line.empty()) { break; } replace(line.begin(), line.end(), ' ', ' '); // tab to space replace(line.begin(), line.end(), ',', ' '); // "," to space line = line.substr(0, line.find_last_not_of(' ')+1); // cut last space r_num++; stringstream buf(line); while(!buf.eof()) { buf >> tmp_n; tmp_row.push_back(tmp_n); } bool div_zero = true; if(d_num < 0) { d_num = (int)tmp_row.size(); } else { if(d_num != (int)tmp_row.size()) { cout << "error : dimension of data " << r_num << " is not same as other data " << endl; exit(1); } } double check_n = tmp_row[0]; for(int i=0; i< (int)tmp_row.size(); i++) { if(check_n != tmp_row[i]) { div_zero = false; break; } } if(div_zero) { cout << "Variance of vector number " << r_num << " is zero. Automatically removed." << endl; } else { matrix.push_back(tmp_row); } } if(r_num <= 0) { cout << "error : no data can be read" << endl; exit(1); } return matrix; } double cal_distance_nol_correlation(vector p_row, vector c_row)// calculation distance by 1 - correlation coefficiency { int r=p_row.size(); if(r != c_row.size()) { cout << "error: vector not same length" << endl; exit(1); } double new_dis = 0; for(int i=0; i cal_CC(vector data_row) { double tmp_ave = 0; for(int i=0; i<(int)data_row.size(); i++) { tmp_ave += data_row[i]; } tmp_ave = tmp_ave/(double)data_row.size(); double tmp_div = 0; for(int i=0; i<(int)data_row.size(); i++) { tmp_div += pow(data_row[i] - tmp_ave, 2); } tmp_div = sqrt(tmp_div); vector CC_row; if(tmp_div != 0) { for(int i=0; i<(int)data_row.size(); i++) { CC_row.push_back((data_row[i] - tmp_ave)/tmp_div); } } else { for(int i=0; i<(int)data_row.size(); i++) { CC_row.push_back(-1); } } return CC_row; } vector K_clustering_Lloyd(vector > Data_matrix, string vec_filename, vector > &cal_time, vector &cal_all_time, vector > first_c, bool median) { vector time_row; cal_time.push_back(time_row);//time of calculations in each iteration int l_num = (int)cal_time.size() - 1; int track_num = Data_matrix[0].size();// number of dimension int row_num = Data_matrix.size();// number of data vector > CC_Data_matrix;// correlation coefficient matrix of data for(int i=0; i > ave_s;//K centroid vectors vector > CC_ave_s;//correlation coefficient vectors of K centroid vectors vector vector_ave, vector_div; o_vec << "initial vectors" << endl; for(int i=0; i I_row;;//list of points classified into same cluster vector I_S_row; //initialization of matrix for(int i=0; i tmp_min) {min = tmp_min; min_n = j;} } I_row[i] = min_n; I_S_row[min_n]++; } //cluster each points into initial centroids end int loop = 1; // number of iterations stopwatch watch; watch.start();// start measurement of clustering time while(1)//main loop of K clustering { vector > ave_n; // new centroid vectors vector > CC_ave_n;//correlation coefficient vectors of new centroid vectors vector > dis_change;//upper-bound and lower-bound of distance variantion between points and clusters if(median) //K-median { vector > index_matrix; for(int i=0; i tmp_row; index_matrix.push_back(tmp_row); } for(int i=0; i tmp_ave; for(int j=0; j tmp_row_h, tmp_row_l; for(int k=0; k<(int)index_matrix[i].size(); k++) { if(Data_matrix[index_matrix[i][k]][j] > ave_s[i][j]) { tmp_row_h.push_back(Data_matrix[index_matrix[i][k]][j]); } else { tmp_row_l.push_back(Data_matrix[index_matrix[i][k]][j]); } } if(tmp_row_l.size() > (int)(index_matrix[i].size()/2)) { sort(tmp_row_l.begin(), tmp_row_l.end()); tmp_ave.push_back(tmp_row_l[(int)(index_matrix[i].size()/2)]); } else { sort(tmp_row_h.begin(), tmp_row_h.end()); tmp_ave.push_back(tmp_row_h[(int)(index_matrix[i].size()/2) - (int)(tmp_row_l.size())]); } } vector tmp_CC = cal_CC(tmp_ave); ave_n.push_back(tmp_ave); CC_ave_n.push_back(tmp_CC); } } else { for(int i=0; i tmp_ave; for(int j=0; j tmp_CC = cal_CC(ave_n[i]); CC_ave_n.push_back(tmp_CC); } } //Updating step end //Assigning step //replacement old centroids with new centroids bool same = true; o_vec << "iteration " << loop << endl; for(int i=0; i tmp_min) {min = tmp_min; min_n = j;} } if(I_row[i] != min_n) { no_change = false; I_S_row[I_row[i]]--; I_S_row[min_n]++; I_row[i] = min_n; } } //assigning each points to new nearest centroid : end if(no_change) //stop K clustering if all points are clustered into same clusters as last iteration { break; //stop K clustering } //Assigning step : end watch.stop(); cal_time[l_num].push_back(watch.show()); loop++; }//main loop of K clustering : end watch.all_stop(); cal_all_time.push_back(watch.all_show()); return I_row; } vector K_clustering_pruning_bound_A(vector > Data_matrix, string vec_filename, vector > &cal_time, vector &cal_all_time, vector > first_c, bool median) { vector time_row; cal_time.push_back(time_row);//time of calculations in each iteration int l_num = (int)cal_time.size() - 1; int track_num = Data_matrix[0].size();// number of dimension int row_num = Data_matrix.size();// number of data vector > CC_Data_matrix;// correlation coefficient matrix of data for(int i=0; i > ave_s;//K centroid vectors vector > CC_ave_s;//correlation coefficient vectors of K centroid vectors vector vector_ave, vector_div; o_vec << "initial vectors" << endl; for(int i=0; i > I_row;//list of points classified into same cluster vector I_row; vector I_S_row; vector > dis_matrix;//distance between each points and each centroids //initialization of matrix for(int i=0; i tmp_row; for(int j=0; j > total_matrix;//sum of each cluster for(int i=0; i tmp_row; for(int j=0; j tmp_min) {min = tmp_min; min_n = j;} } for(int j=0; j > ave_n; // new centroid vectors vector > CC_ave_n;//correlation coefficient vectors of new centroid vectors vector dis_change;//upper-bound and lower-bound of distance variantion between points and clusters //Updating step : calculate new centroids if(median) //K-median { vector > index_matrix; for(int i=0; i tmp_row; index_matrix.push_back(tmp_row); } for(int i=0; i tmp_ave; for(int j=0; j tmp_row_h, tmp_row_l; for(int k=0; k<(int)index_matrix[i].size(); k++) { if(Data_matrix[index_matrix[i][k]][j] > ave_s[i][j]) { tmp_row_h.push_back(Data_matrix[index_matrix[i][k]][j]); } else { tmp_row_l.push_back(Data_matrix[index_matrix[i][k]][j]); } } if(tmp_row_l.size() > (int)(index_matrix[i].size()/2)) { sort(tmp_row_l.begin(), tmp_row_l.end()); tmp_ave.push_back(tmp_row_l[(int)(index_matrix[i].size()/2)]); } else { sort(tmp_row_h.begin(), tmp_row_h.end()); tmp_ave.push_back(tmp_row_h[(int)(index_matrix[i].size()/2) - (int)(tmp_row_l.size())]); } } vector tmp_CC = cal_CC(tmp_ave); ave_n.push_back(tmp_ave); CC_ave_n.push_back(tmp_CC); } } else //K-means { for(int i=0; i tmp_ave; for(int j=0; j tmp_CC = cal_CC(tmp_ave); ave_n.push_back(tmp_ave); CC_ave_n.push_back(tmp_CC); } } //Updating step end //Assigning step //calculation of lower-bound and upper-bound between points and clusters for(int i=0; i dis_row; vector tmp_row; double tmp_dis = 0; for(int j=0; j check_num;//possible nearest centroids bool same = true; for(int k=0; k dis_matrix[i][k]) { same = false; check_num.push_back(k); } } } if(same)// all distance calculations are pruned { cut_num++; } else { double min = cal_distance_nol_correlation(CC_Data_matrix[i], CC_ave_s[I_row[i]]); dis_matrix[i][I_row[i]] = min; int min_n = I_row[i]; for(vector:: iterator it = check_num.begin(); it != check_num.end(); ++it) { if(min > dis_matrix[i][*it]) { double n_dis = cal_distance_nol_correlation(CC_Data_matrix[i], CC_ave_s[*it]); dis_matrix[i][*it] = n_dis; if(min > n_dis) {min = n_dis; min_n = *it;} } } if(I_row[i] != min_n) { no_change = false; change_n++; for(int t=0; t K_clustering_pruning_bound_B(vector > Data_matrix, string vec_filename, vector > &cal_time, vector &cal_all_time, vector > first_c, bool median) { vector time_row; cal_time.push_back(time_row);//time of calculations in each iteration int l_num = (int)cal_time.size() - 1; int track_num = Data_matrix[0].size();// number of dimension int row_num = Data_matrix.size();// number of data vector > CC_Data_matrix;// correlation coefficient matrix of data for(int i=0; i > ave_s;//K centroid vectors vector > CC_ave_s;//correlation coefficient vectors of K centroid vectors vector vector_ave, vector_div; o_vec << "initial vectors" << endl; for(int i=0; i > I_row;//list of points classified into same cluster vector I_row; vector I_S_row; vector > Max_I_row;//Maximum value of correlation coefficient in each dimension of each centroids vector > Min_I_row;//Minimum value of correlation coefficient in each dimension of each centroids vector > dis_matrix;//distance between each points and each centroids //initialization of matrix for(int i=0; i tmp_row; for(int j=0; j > total_matrix;//sum of each cluster for(int i=0; i tmp_row; for(int j=0; j tmp_max_row, tmp_min_row; for(int j=0; j tmp_min) {min = tmp_min; min_n = j;} } for(int j=0; j CC_Data_matrix[i][j]) Min_I_row[min_n][j] = CC_Data_matrix[i][j]; } I_row[i] = min_n; I_S_row[min_n]++; } //cluster each points into initial centroids end int loop = 1; // number of iterations stopwatch watch; watch.start();// start measurement of clustering time while(1)//main loop of K clustering { vector > ave_n; // new centroid vectors vector > CC_ave_n;//correlation coefficient vectors of new centroid vectors vector > dis_change;//upper-bound and lower-bound of distance variantion between points and clusters //Updating step : calculate new centroids if(median) //K-median { vector > index_matrix; for(int i=0; i tmp_row; index_matrix.push_back(tmp_row); } for(int i=0; i tmp_ave; for(int j=0; j tmp_row_h, tmp_row_l; for(int k=0; k<(int)index_matrix[i].size(); k++) { if(Data_matrix[index_matrix[i][k]][j] > ave_s[i][j]) { tmp_row_h.push_back(Data_matrix[index_matrix[i][k]][j]); } else { tmp_row_l.push_back(Data_matrix[index_matrix[i][k]][j]); } } if(tmp_row_l.size() > (int)(index_matrix[i].size()/2)) { sort(tmp_row_l.begin(), tmp_row_l.end()); tmp_ave.push_back(tmp_row_l[(int)(index_matrix[i].size()/2)]); } else { sort(tmp_row_h.begin(), tmp_row_h.end()); tmp_ave.push_back(tmp_row_h[(int)(index_matrix[i].size()/2) - (int)(tmp_row_l.size())]); } } vector tmp_CC = cal_CC(tmp_ave); ave_n.push_back(tmp_ave); CC_ave_n.push_back(tmp_CC); } } else //K-means { for(int i=0; i tmp_ave; for(int j=0; j tmp_CC = cal_CC(tmp_ave); ave_n.push_back(tmp_ave); CC_ave_n.push_back(tmp_CC); } } //Updating step end //Assigning step //calculation of lower-bound and upper-bound between points and clusters for(int i=0; i dis_row; vector tmp_row; for(int j=0; j 0) dis += dis_row[l] * Min_I_row[j][l]; else dis += dis_row[l] * Max_I_row[j][l]; } } else// calculate upper-bound of distance variation between points of one cluster and centroid of cluster { for(int l=0; l 0) dis += dis_row[l] * Max_I_row[j][l]; else dis += dis_row[l] * Min_I_row[j][l]; } } tmp_row.push_back(dis); } dis_change.push_back(tmp_row); } //calculation of lower-bound and upper-bound between points and clusters : end //replacement old centroids with new centroids bool same = true; o_vec << "iteration " << loop << endl; for(int i=0; i check_num;//possible nearest centroids check_num.push_back(I_row[i]); bool same = true; for(int k=0; k dis_matrix[i][k]) { same = false; check_num.push_back(k); } } } if(same)// all distance calculations are pruned { cut_num++; for(int l=0; l CC_Data_matrix[i][l]) Min_I_row[I_row[i]][l] = CC_Data_matrix[i][l]; } } else { double min = cal_distance_nol_correlation(CC_Data_matrix[i], CC_ave_s[I_row[i]]); dis_matrix[i][I_row[i]] = min; int min_n = I_row[i]; for(vector:: iterator it = check_num.begin(); it != check_num.end(); ++it) { if(min > dis_matrix[i][*it]) { double n_dis = cal_distance_nol_correlation(CC_Data_matrix[i], CC_ave_s[*it]); dis_matrix[i][*it] = n_dis; if(min > n_dis) {min = n_dis; min_n = *it;} } } if(I_row[i] != min_n) { no_change = false; change_n++; for(int t=0; t CC_Data_matrix[i][l]) Min_I_row[I_row[i]][l] = CC_Data_matrix[i][l]; } } } //assigning each points to new nearest centroid : end if(cut_num == row_num || no_change) //stop K clustering if all points are clustered into same clusters as last iteration { break; //stop K clustering } //Assigning step : end watch.stop(); cal_time[l_num].push_back(watch.show()); loop++; }//main loop of K clustering : end watch.all_stop(); cal_all_time.push_back(watch.all_show()); return I_row; } void out_cluster(vector I_matrix, string out_name) { ofstream ofs; ofs.open(out_name.c_str()); vector > out_matrix; for(int j=0; j tmp_row; out_matrix.push_back(tmp_row); } for(int l=0; l<(int)I_matrix.size(); l++) { out_matrix[I_matrix[l]].push_back(l); } for(int j=0; j<(int)out_matrix.size(); j++) { ofs << "cluster " << j+1 << " " << out_matrix[j].size() << endl; } ofs << endl; for(int j=0; j<(int)out_matrix.size(); j++) { ofs << "cluster " << j+1 << " " << out_matrix[j].size() << endl; for(int l=0; l<(int)out_matrix[j].size(); l++) { ofs << out_matrix[j][l] + 1 << endl; } ofs << endl; } } vector > cal_ini(vector > Data_matrix, int T) { random_device rnd; mt.seed(rnd()); uniform_int_distribution rand_data(0,(int)Data_matrix.size() - 1); vector > all_first_c; for(int t=0; t index_row; for(int k=0; k > Data_matrix, vector > all_first_c, bool median) { string time_all_name = out_name + "_time_" + type_name + ".txt"; //output time file name vector cal_all_time; vector > cal_time; for(int t=1; t > first_c; for(int k=0; k I_matrix; string cut_name = out_name + "_cut_" + type_name + "_" + ss.str() + ".txt"; string vec_name = out_name + "_vector_" + type_name + "_" + ss.str() + ".txt"; string cluster_name = out_name + "_cluster_" + type_name + "_" + ss.str() + ".txt"; if(type == 1) { I_matrix = K_clustering_pruning_bound_A(Data_matrix, vec_name, cal_time, cal_all_time, first_c, median); } else if(type == 2) { I_matrix = K_clustering_pruning_bound_B(Data_matrix, vec_name, cal_time, cal_all_time, first_c, median); } else if(type == 0) { I_matrix = K_clustering_Lloyd(Data_matrix, vec_name, cal_time, cal_all_time, first_c, median); } out_cluster(I_matrix, cluster_name); } ofstream t_ofs; t_ofs.open(time_all_name.c_str()); t_ofs << "Calculation time of clustering in each trial" << endl; for(int i=0; i<(int)cal_all_time.size(); i++) { t_ofs << "trial " << i+1 << " time " << cal_all_time[i] << " iteration " << (int)cal_time[i].size() << endl; } t_ofs << endl; } int main(int argc, char** argv) { //argv : input_name K number_of_trials (out_name) (Lloyd) (median) if(argc < 3 || argc > 10) { cout << "argc error " << argc << endl; cout << "number of parameters must be more than 4 and less than 10" << endl; return 1; } string data_name = string(argv[1]);//input file name string out_name;//output file name K = atoi(argv[2]);//number of cluster int T = atoi(argv[3]);//number of trial bool Lloyd = false, median = false, bound_B = false, bound_A = false, comp = false;//option if(argc >= 5) { for(int i=4; i > Data_matrix;//input data matrix Data_matrix = read_matrix(data_name.c_str()); cout << "number of data " << Data_matrix.size() < > first_c = cal_ini(Data_matrix, T); if(Lloyd) { k_means(out_name, Lloyd_name, 0, T, Data_matrix, first_c, median); } else if(bound_A) { k_means(out_name, bound_A_name, 1, T, Data_matrix, first_c, median); } else if(bound_B) { k_means(out_name, bound_B_name, 2, T, Data_matrix, first_c, median); } else if(comp) { k_means(out_name, Lloyd_name, 0, T, Data_matrix, first_c, median); k_means(out_name, bound_A_name, 1, T, Data_matrix, first_c, median); k_means(out_name, bound_B_name, 2, T, Data_matrix, first_c, median); } }