通用kmeans算法c++_OOP实现与python可视化输出数据_注释版 - 东方耀AI技术分享

#include <iostream>
#include <fstream>
#include "zennze/kmeans_oop.hpp"
using namespace std;
using std::cin;
using std::cout;
using std::initializer_list;
using std::runtime_error;
// 通用kmeans算法c++_OOP实现与python可视化输出数据_注释版
class NDimenPoint : public VirtualPoint
{
private:
int dimension; // 多维点类的维度
vector<double> xs; // 一个点的数据
public:
// 构造一个点只要维度
NDimenPoint(const int d) : dimension(d) { xs.resize(d); }
// 构造一个点维度与点数据
NDimenPoint(const int d, vector<double> l) : dimension(d), xs(l){}
// 构造一个新点用另外一个点
NDimenPoint(const NDimenPoint &p) : dimension(p.dimension), xs(p.xs) {}
~NDimenPoint(){};
bool operator==(const VirtualPoint &p) override
{
// 类型转换：父类转子类
auto pp = static_cast<const NDimenPoint &>(p);
if (dimension != pp.dimension)
return false;
for (size_t i = 0; i < xs.size(); i++)
if (xs[i] != pp.xs[i])
return false;
return true;
}
bool operator!=(const VirtualPoint &p) override
{
auto pp = static_cast<const NDimenPoint &>(p);
if (dimension != pp.dimension)
return true;
for (size_t i = 0; i < xs.size(); i++)
if (xs[i] != pp.xs[i])
return true;
return false;
}
void add(const NDimenPoint &p)
{
// 定义点的加法
if (p.dimension != dimension)
throw runtime_error("dimension mismatch");
for (size_t i = 0; i < xs.size(); i++)
xs[i] += p.xs[i];
}
NDimenPoint operator/(const int n)
{
// 定义点除以一个n的操作
if (n == 0)
throw std::runtime_error("divisor zero error!");
NDimenPoint res(dimension);
for (size_t i = 0; i < dimension; i++)
{
res.xs[i] = xs[i] / n;
}
return res;
}
double disTo(const NDimenPoint &p)
{
// 定义两点之间的欧式距离这样支持了多维数据了
double tmp = 0;
for (size_t i = 0; i < dimension; i++)
tmp += pow(xs[i] - p.xs[i], 2);
return sqrt(tmp);
}
string toString() override
{
stringstream ss;
ss << "[";
for (size_t i = 0; i < dimension; i++)
{
if (i > 0)
ss << ", ";
ss << xs[i];
}
ss << "]";
return ss.str();
}
static double calcDisToCluster(const VirtualPoint &p, const Cluster &c)
{
// 静态方法：点到簇质心的距离
auto pp = static_cast<const NDimenPoint &>(p);
auto cp = static_cast<const NDimenPoint &>(*(c.getCentroid()));
// 本质还是：点到点之间的距离
return pp.disTo(cp);
}
static sharedVPoint avgPoints(const vector<sharedVPoint> &points)
{
// 计算一堆点集合的质心
if (points.size() <= 0)
return nullptr;
NDimenPoint resPoint(static_cast<const NDimenPoint &>(*points[0]).dimension);
for (auto &&p : points)
resPoint.add(static_cast<const NDimenPoint &>(*p));
// 求和再除以n 均值
resPoint = resPoint / points.size();
// cerr << "DEBUG\t" << resPoint.toString() << ", POINTS.SIZE " << points.size() << endl;
return make_shared<NDimenPoint>(resPoint);
};
};
vector<NDimenPoint> geneData(int num, const int dimension, double maxVal = 1000)
{
std::default_random_engine generator(time(NULL));
std::uniform_real_distribution<double> distribution(0, maxVal);
vector<NDimenPoint> points;
for (size_t i = 0; i < num; i++)
{
vector<double> tmpVec;
for (size_t j = 0; j < dimension; j++)
tmpVec.push_back(distribution(generator));
points.push_back(NDimenPoint(dimension, tmpVec));
}
return points;
}
void output(const vector<Cluster> &clusters, const int dimension)
{
cout << "{"
<< ""dimension":" << dimension << "," << endl
<< ""clusters":[";
for (int i = 0; i < clusters.size(); i++)
{
if (i > 0)
cout << ", ";
std::cout << clusters[i].toString() << std::endl;
}
cout << "]}" << endl;
}
void output_json(const vector<Cluster> &clusters, const int dimension)
{
std::string file_path = "./kmeans_visualization_py.json";
std::ofstream write_out_f;
write_out_f.open(file_path);
write_out_f << "{"
<< ""dimension":" << dimension << "," << endl
<< ""clusters":[";
for (int i = 0; i < clusters.size(); i++)
{
if (i > 0)
write_out_f << ", ";
write_out_f << clusters[i].toString() << std::endl;
}
write_out_f << "]}" << endl;
write_out_f.close();
}
void kmeans_work()
{
const int maxRound = 1000;
const int pointCnt = 150; // 数据集的点数
int dimension = 1; // 点的维度
int k = 0;
cerr << "dimension, k: ";
cin >> dimension >> k;
vector<sharedVPoint> points; // 点集的共享指针
for (auto &&p : geneData(pointCnt, dimension))
points.push_back(make_shared<NDimenPoint>(p));
auto clusters = KmeansAlg::run(points, k, NDimenPoint::calcDisToCluster, NDimenPoint::avgPoints, maxRound);
output_json(clusters, dimension);
output(clusters, dimension);
}
/**
 * Program entry point.
 *
 * @return 0 on success, 1 if the k-means run failed with an exception.
 */
int main()
{
    std::cout << "kmeans算法实现！" << std::endl;
    try
    {
        kmeans_work();
    }
    catch (const std::exception &e)
    {
        // Report the failure instead of letting the exception escape main and
        // end the process through std::terminate.
        std::cerr << "kmeans failed: " << e.what() << std::endl;
        return 1;
    }
    return 0;
}

Header file used by the program above (presumably zennze/kmeans_oop.hpp, which it includes):

#include <algorithm>
#include <cmath>
#include <ctime>
#include <exception>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
using std::cerr;
using std::endl;
using std::make_shared;
using std::pow;
using std::shared_ptr;
using std::sqrt;
using std::string;
using std::stringstream;
using std::to_string;
using std::vector;
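/**
 * kmeans - points are the data, a cluster is a group of points
 * BEGIN
 *   pick k points as centers and create one cluster per center
 *   loop
 *     compute the distance between every point and every cluster
 *     add each point to its nearest cluster
 *     update the cluster centers
 *     cluster centers unchanged? exit the loop
 *   output the clusters
 * END
 *
 * Data structures
 *   point   - ==() toString()
 *   cluster - compute the center()
 *   calcDis(point, cluster)
 *   kmeans() -
 *
 * To obtain a more general structure an object-oriented design was chosen. The
 * structure is fairly complex, especially the two functions for computing
 * distances and computing the centroid.
 *   VirtualPoint - abstract point class without data members; declares the
 *                  pure virtual operators == and != (plus toString())
 *   Cluster      - cluster class; data members: a collection of VirtualPoint
 *                  and a center point (of type VirtualPoint);
 *                  member functions: set the centroid, update the centroid,
 *                  clear the points, ...
 *   KmeansAlg    - algorithm framework; run() implements the clustering. Given
 *                  the required parameters (distance function, averaging
 *                  function) it runs without rewriting the algorithm
 *   ------------------
 *   NDimenPoint  - multi-dimensional point class derived from VirtualPoint,
 *                  used to handle multi-dimensional data
 *
 * Two generic classes - virtual point and cluster; for actual use, derive from
 * VirtualPoint.
 */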
/**
 * Abstract point type the clustering code works with.
 * It holds no data; concrete point classes derive from it and supply
 * comparison and string formatting.
 */
class VirtualPoint
{
public:
    VirtualPoint() = default;
    virtual ~VirtualPoint() = default;

    // Pure virtual interface every concrete point has to implement.
    virtual bool operator==(const VirtualPoint &p) = 0;
    virtual bool operator!=(const VirtualPoint &p) = 0;
    virtual string toString() = 0;
};
// 为何用智能指针因为簇里不停的清空点集与add点，所以为了提高效率直接操作指针
typedef shared_ptr<VirtualPoint> sharedVPoint;
// 求平均点的方法也可能是任意的，因此需要作为参数传递给算法(函数指针)
typedef sharedVPoint avgPointFunc(const vector<sharedVPoint> &);
class Cluster
{
// 簇类：管理质心与该簇所有的元素
private:
vector<sharedVPoint> points; // 频繁操作点用指针提高效率
sharedVPoint centroid; // centroid质心的点
avgPointFunc *avgPoints; // 求质心的函数指针
public:
Cluster(avgPointFunc avg) { avgPoints = avg; }
~Cluster() {}
Cluster &setCentroid(sharedVPoint p)
{
centroid = p;
// 把质心放进一堆点里这是为何？对结果没有影响影响输出了
points.push_back(p);
return *this;
}
bool updateCentroid()
{
sharedVPoint tmpPoint = avgPoints(points);
// 哪种情况计算出来为 nullptr ? 如果points.size()==0吗？
if (tmpPoint == nullptr)
return false;
bool changed;
// 质心是否改变 true为改变了
if (tmpPoint != nullptr && centroid != nullptr)
changed = (*tmpPoint) != (*centroid); // 计算出来的与原来的一样才为false
else
changed = true;
centroid = tmpPoint; // 计算出来的质心更新一下
return changed;
}
void clear() { points.clear(); }
void addPoint(sharedVPoint p)
{
points.push_back(p);
}
string toString() const
{
stringstream ss;
if (centroid == nullptr || points.size() == 0){
// setCentroid()不把质心加入到点集会影响这里
return "{}";
}
// 打印质心与该簇的所有点
ss << "{"centroid": " << centroid->toString() << ","points": [";
for (int i = 0; i < points.size(); i++)
{
if (i > 0)
ss << ", ";
ss << points[i]->toString();
}
ss << "]}";
return ss.str();
}
// 得到该簇的质心
sharedVPoint getCentroid() const { return centroid; }
// 得到该簇的所有的元素
const vector<sharedVPoint> &getPoints() { return points; }
};
// Computes the distance between a VirtualPoint and the centroid of a Cluster.
// The distance measure may be arbitrary (not only Euclidean), so it is passed
// to the algorithm as a parameter (function pointer).
using calcFunc = double(const VirtualPoint &, const Cluster &);
class KmeansAlg
{
public:
KmeansAlg() {}
~KmeansAlg() {}
// 生成 k 个位于 [0, n) 中的不同的随机数, n < 100000000
static vector<int> randDiffNumbers(int n, int k)
{
// 选择随机的k个初始质心
const int maxn = 100000000;
vector<int> res;
if (n <= 0 || n >= maxn)
throw std::runtime_error("n is less than zero or greater than maxn(100,000,000)");
for (int i = 0; i < n; i++)
res.push_back(i);
random_shuffle(res.begin(), res.end());
res.resize(k);
return res;
}
static vector<Cluster> run(vector<sharedVPoint> data, int k, calcFunc calcDis, avgPointFunc avgPoints, const int maxRound = 1000)
{
if (k <= 1)
throw std::runtime_error("k is less than 1");
vector<Cluster> clusters;
for (auto &&i : randDiffNumbers(data.size(), k)){
// 从data里随机选择k个作为初始的质心
// Cluster(avgPoints)这是构造了一个簇
// clusters.size() == k
// && 是右值引用 & 是左值引用
clusters.push_back(Cluster(avgPoints).setCentroid(data[i]));
}
for (int round = 0; round < maxRound; round++)
{
// 每次迭代就需要把簇的点集清空因为都要重新计算
for (auto &&c : clusters)
c.clear();
for (size_t i = 0; i < data.size(); i++)
{
// 遍历计算所有的数据点，将其就近分配到对应的簇
double minDis = calcDis(*(data[i]), clusters[0]);
int minIndex_cluster = 0; // 离哪个簇质心距离最小的簇的索引
for (size_t j = 1; j < clusters.size(); j++)
{
// j为从1开始因为前面已经算过了
double tmpDis = calcDis(*(data[i]), clusters[j]);
if (tmpDis < minDis)
minDis = tmpDis, minIndex_cluster = j;
}
// 以上的目的是：看当前的点离哪个簇的质心最近
// 现在就知道你这个数据点属于哪个簇了
clusters[minIndex_cluster].addPoint(data[i]);
}
bool changed = false;
for (auto &&c : clusters){
// 每个簇更新各自的簇质心看是否有改变
changed = changed || c.updateCentroid();
}
std::cout << "第" << round << "轮迭代：" <<"簇质心是否有改变=" << changed << std::endl;
// 簇质心没有改变了就可以退出迭代了
if (!changed)
break;
}
return clusters;
}
};

Python visualization script (reads the kmeans_visualization_py.json file written by the C++ program):

# -*- coding: utf-8 -*-
__author__ = u'东方耀微信：dfy_88888'
__date__ = '2022/3/16 下午5:21'
__product__ = 'PyCharm'
__filename__ = '14_kmeans聚类结果的可视化_for_c++'
# Workflow:
#   1. run the k-means algorithm
#   2. write the result (serialised as JSON) to a file
#   3. read the file content with Python
#   4. visualise it with pyplot
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import json
import random
# Palette used to tell the clusters apart in the plot.
colors = [
    "#ff0000",
    "#00ff00",
    "#0000ff",
    "#404040",
    "#ff00ff",
    "#00ffff",
    "#C0ff00",
    "#ffC000",
    "#ff00C0",
    "#000070",
    "#007000",
    "#700000",
]
def paint(ax, xs, ys, color, zs=None, marker='.', s=30):
    """Draw a scatter plot on ``ax``, in 3-D when ``zs`` is given and in 2-D otherwise.

    :param ax: axes object providing ``scatter``
    :param xs: x coordinates
    :param ys: y coordinates
    :param color: colour passed to ``scatter`` as ``c``
    :param zs: z coordinates, or ``None`` for a 2-D plot
    :param marker: marker style
    :param s: marker size
    """
    # Identity test: comparing with "!=" is a value comparison, which is
    # element-wise (and not usable in an "if") for array-like coordinates.
    if zs is not None:
        ax.scatter(xs=xs, ys=ys, zs=zs, zdir='z', c=color, marker=marker, s=s)
    else:
        ax.scatter(x=xs, y=ys, c=color, marker=marker, s=s)
def readData(json_path=None):
    """Load the clustering result written by the C++ program.

    :param json_path: path of the JSON file; defaults to the build-directory
        path this script was written for
    :return: ``{"dimension": int, "clusters": [cluster, ...]}`` where every
        cluster is a dict with the keys ``"centroid"`` (coordinate list, or
        ``None`` when the raw cluster has none), ``"xss"`` (one list of values
        per dimension) and ``"color"``

    Side effect: shuffles the module-level ``colors`` list in place.
    """
    if json_path is None:
        json_path = "/home/jiang/jjj_eigen_works/my_use_eigen_demos/build/kmeans_visualization_py.json"
    random.shuffle(colors)
    with open(json_path, mode="r", encoding="utf-8") as json_file:
        data = json.load(json_file)
    dimension = data["dimension"]
    clusters = []
    for clusterCnt, tmpRawCluster in enumerate(data["clusters"]):
        tmpCluster = {
            "centroid": tmpRawCluster.get("centroid"),
            # One coordinate list per dimension, also for clusters without points.
            "xss": [[] for _ in range(dimension)],
            # Wrap around the palette when there are more clusters than colours.
            "color": colors[clusterCnt % len(colors)],
        }
        # Transpose the points: xss[j] collects the j-th coordinate of every point.
        for tmpRawPoint in tmpRawCluster.get("points", []):
            for j, value in enumerate(tmpRawPoint):
                tmpCluster["xss"][j].append(value)
        clusters.append(tmpCluster)
    return {"dimension": dimension, "clusters": clusters}
def work():
    """Read the clustering result and show it as a 2-D or 3-D scatter plot.

    Clusters without a centroid are skipped. For any dimension other than
    2 or 3 nothing is drawn and an empty figure is shown.
    """
    data = readData()
    print("读入的数据：维度=%d, 类别k=%d" % (int(data["dimension"]), len(data["clusters"])))
    fig = plt.figure()
    if data["dimension"] == 2:
        ax = fig.add_subplot(111)
        for cluster in data["clusters"]:
            if not cluster["centroid"]:
                continue
            paint(ax, cluster["xss"][0], cluster["xss"][1], cluster["color"], marker='o')
            # Draw the centroid as a large marker.
            paint(ax, [cluster["centroid"][0]], [cluster["centroid"][1]],
                  "#000000", marker='^', s=150)
    elif data["dimension"] == 3:
        ax = fig.add_subplot(111, projection='3d')
        for cluster in data["clusters"]:
            # Same guard as in the 2-D branch: a cluster without a centroid
            # has nothing to index into.
            if not cluster["centroid"]:
                continue
            paint(ax, cluster["xss"][0], cluster["xss"][1], cluster["color"],
                  cluster["xss"][2], marker='o')
            # Draw the centroid as a large marker.
            paint(ax, [cluster["centroid"][0]], [cluster["centroid"][1]],
                  "#000000", [cluster["centroid"][2]], marker='^', s=150)
    plt.show()
# Run the visualisation only when the file is executed as a script.
if __name__ == "__main__":
    work()

复制代码