In C++, a number of clustering algorithms can be applied to nonlinear data. Several commonly used ones are shown below, starting with K-Means:
#include <iostream>
#include <vector>
#include <cmath>
#include <random>
#include <algorithm>
#include <limits>
using namespace std;

// Squared Euclidean distance between two points of equal dimension.
double squaredDistance(const vector<double>& a, const vector<double>& b) {
    double dist = 0;
    for (size_t m = 0; m < a.size(); ++m) {
        dist += (a[m] - b[m]) * (a[m] - b[m]);
    }
    return dist;
}

// K-Means: partition `data` into k clusters, returning one label per point.
vector<int> kMeans(const vector<vector<double>>& data, int k, int max_iterations = 100) {
    int n = data.size();
    int dim = data[0].size();
    vector<int> labels(n, -1);

    // Initialize the centroids from k randomly chosen data points.
    random_device rd;
    mt19937 gen(rd());
    uniform_int_distribution<> dis(0, n - 1);
    vector<vector<double>> centroids(k);
    for (int l = 0; l < k; ++l) {
        centroids[l] = data[dis(gen)];
    }

    for (int iter = 0; iter < max_iterations; ++iter) {
        // Assignment step: attach every point to its nearest centroid.
        bool changed = false;
        for (int j = 0; j < n; ++j) {
            double min_dist = numeric_limits<double>::max();
            int min_index = -1;
            for (int l = 0; l < k; ++l) {
                double dist = squaredDistance(data[j], centroids[l]);
                if (dist < min_dist) {
                    min_dist = dist;
                    min_index = l;
                }
            }
            if (labels[j] != min_index) {
                labels[j] = min_index;
                changed = true;
            }
        }
        if (!changed) {
            break;  // converged: no point changed its cluster
        }

        // Update step: move each centroid to the mean of its assigned points.
        for (int l = 0; l < k; ++l) {
            vector<double> sum(dim, 0.0);
            int count = 0;
            for (int j = 0; j < n; ++j) {
                if (labels[j] != l) continue;
                for (int m = 0; m < dim; ++m) {
                    sum[m] += data[j][m];
                }
                ++count;
            }
            if (count > 0) {
                for (int m = 0; m < dim; ++m) {
                    centroids[l][m] = sum[m] / count;
                }
            }
        }
    }
    return labels;
}
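A minimal driver for the function above might look like this; the two 2D blobs and k = 2 are illustrative values, and the concrete cluster ids depend on the random initialization:
int main() {
    // Two small, well-separated 2D blobs (illustrative data).
    vector<vector<double>> data = {
        {1.0, 1.0}, {1.2, 0.8}, {0.9, 1.1},
        {8.0, 8.0}, {8.1, 7.9}, {7.8, 8.2}
    };
    vector<int> labels = kMeans(data, 2);
    for (size_t i = 0; i < labels.size(); ++i) {
        cout << "point " << i << " -> cluster " << labels[i] << "\n";
    }
    return 0;
}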
Another option is DBSCAN, a density-based algorithm; unlike K-Means it can recover arbitrarily shaped (nonlinear) clusters and marks sparse points as noise:
#include <iostream>
#include <vector>
#include <cmath>
#include <queue>
using namespace std;

// Return the indices of all points within distance eps of `point`
// (squared distances are compared against eps^2 to avoid a sqrt).
vector<int> get_neighbors(const vector<vector<double>>& data, int point, double eps) {
    int n = data.size();
    vector<int> neighbors;
    for (int i = 0; i < n; ++i) {
        if (i == point) {
            continue;
        }
        double distance = 0;
        for (size_t m = 0; m < data[point].size(); ++m) {
            double d = data[point][m] - data[i][m];
            distance += d * d;
        }
        if (distance < eps * eps) {
            neighbors.push_back(i);
        }
    }
    return neighbors;
}

// DBSCAN: density-based clustering. Returns one label per point;
// -1 marks noise, 0..C-1 are cluster ids.
vector<int> dbscan(const vector<vector<double>>& data, double eps, int min_samples) {
    const int UNVISITED = -2, NOISE = -1;
    int n = data.size();
    vector<int> labels(n, UNVISITED);
    int cluster_id = 0;
    for (int i = 0; i < n; ++i) {
        if (labels[i] != UNVISITED) {
            continue;
        }
        vector<int> neighbors = get_neighbors(data, i, eps);
        if ((int)neighbors.size() + 1 < min_samples) {
            labels[i] = NOISE;  // too few neighbors: provisionally noise
            continue;
        }
        // i is a core point: grow a new cluster outward from it.
        labels[i] = cluster_id;
        queue<int> q;
        for (int nb : neighbors) {
            q.push(nb);
        }
        while (!q.empty()) {
            int point = q.front();
            q.pop();
            if (labels[point] == NOISE) {
                labels[point] = cluster_id;  // noise reachable from a core point becomes a border point
            }
            if (labels[point] != UNVISITED) {
                continue;  // already claimed by this or another cluster
            }
            labels[point] = cluster_id;
            vector<int> point_neighbors = get_neighbors(data, point, eps);
            if ((int)point_neighbors.size() + 1 >= min_samples) {
                // point is itself a core point, so expand through its neighbors too.
                for (int nb : point_neighbors) {
                    q.push(nb);
                }
            }
        }
        ++cluster_id;
    }
    return labels;
}
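Calling it follows the same pattern; the eps and min_samples values below are illustrative and would need tuning for real data:
int main() {
    // Two dense groups plus one isolated outlier (illustrative data).
    vector<vector<double>> data = {
        {1.0, 1.0}, {1.1, 0.9}, {0.9, 1.0},
        {5.0, 5.0}, {5.1, 5.1}, {4.9, 5.0},
        {20.0, 20.0}  // far from everything, so it should be labeled noise (-1)
    };
    vector<int> labels = dbscan(data, 0.5, 3);
    for (size_t i = 0; i < labels.size(); ++i) {
        cout << "point " << i << " -> " << labels[i] << "\n";
    }
    return 0;
}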
The third option is a Gaussian mixture model (GMM), fitted here with hard assignments and diagonal covariances (a classification-EM variant): each point is assigned to its most likely component, and the component weights, means, and per-dimension variances are then re-estimated from the assigned points:
#include <iostream>
#include <vector>
#include <cmath>
#include <random>
#include <algorithm>
#include <limits>
using namespace std;

// Gaussian mixture clustering with diagonal covariances and hard
// assignments. Returns one component label per point.
vector<int> gmm(const vector<vector<double>>& data, int n_components, int max_iter = 100, double tol = 1e-4) {
    int n = data.size();
    int dim = data[0].size();
    vector<int> labels(n, -1);
    vector<double> weights(n_components, 1.0 / n_components);
    vector<vector<double>> covariances(n_components, vector<double>(dim, 1.0));  // unit variances to start

    // Initialize each component mean at a randomly chosen data point.
    random_device rd;
    mt19937 gen(rd());
    uniform_int_distribution<> dis(0, n - 1);
    vector<vector<double>> means(n_components);
    for (int k = 0; k < n_components; ++k) {
        means[k] = data[dis(gen)];
    }

    for (int iter = 0; iter < max_iter; ++iter) {
        // E-step (hard): assign each point to the component with the
        // highest log-density, log w_k + sum_m log N(x_m; mu_km, var_km).
        for (int j = 0; j < n; ++j) {
            double max_log_likelihood = -numeric_limits<double>::max();
            int max_component = -1;
            for (int k = 0; k < n_components; ++k) {
                double log_likelihood = log(weights[k]);
                for (int m = 0; m < dim; ++m) {
                    double mean = means[k][m];
                    double covariance = covariances[k][m];
                    double value = data[j][m];
                    log_likelihood += -0.5 * log(2 * M_PI * covariance)
                                      - (value - mean) * (value - mean) / (2 * covariance);
                }
                if (log_likelihood > max_log_likelihood) {
                    max_log_likelihood = log_likelihood;
                    max_component = k;
                }
            }
            labels[j] = max_component;
        }

        // M-step: re-estimate weights, means, and per-dimension variances
        // from the points assigned to each component.
        double max_shift = 0;  // largest movement of any mean coordinate
        for (int k = 0; k < n_components; ++k) {
            vector<double> mean_new(dim, 0.0), var_new(dim, 0.0);
            int count = 0;
            for (int j = 0; j < n; ++j) {
                if (labels[j] != k) continue;
                ++count;
                for (int m = 0; m < dim; ++m) mean_new[m] += data[j][m];
            }
            if (count == 0) continue;  // empty component: keep its old parameters
            for (int m = 0; m < dim; ++m) mean_new[m] /= count;
            for (int j = 0; j < n; ++j) {
                if (labels[j] != k) continue;
                for (int m = 0; m < dim; ++m) {
                    double d = data[j][m] - mean_new[m];
                    var_new[m] += d * d;
                }
            }
            weights[k] = (double)count / n;
            for (int m = 0; m < dim; ++m) {
                max_shift = max(max_shift, fabs(mean_new[m] - means[k][m]));
                means[k][m] = mean_new[m];
                covariances[k][m] = max(var_new[m] / count, 1e-6);  // floor to keep variances positive
            }
        }
        if (max_shift < tol) {
            break;  // means stopped moving: converged
        }
    }
    return labels;
}
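A sketch of a driver for the GMM variant (illustrative data again; with random initialization the resulting labeling can occasionally differ between runs):
int main() {
    // Two well-separated 2D blobs (illustrative data).
    vector<vector<double>> data = {
        {0.0, 0.0}, {0.2, -0.1}, {-0.1, 0.1},
        {10.0, 10.0}, {10.1, 9.9}, {9.8, 10.2}
    };
    vector<int> labels = gmm(data, 2);
    for (size_t i = 0; i < labels.size(); ++i) {
        cout << "point " << i << " -> component " << labels[i] << "\n";
    }
    return 0;
}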
These algorithms can handle nonlinear data, but their parameters usually need tuning to get good clusters. In practice, it is worth running several algorithms on the same dataset and comparing their results to pick the one that fits the data best.