21#include <unordered_map>
25using namespace Qt::StringLiterals;
29constexpr uint KMEANS_MAX_ITERATIONS = 1000;
31QString QgsKMeansClusteringAlgorithm::name()
const
33 return u
"kmeansclustering"_s;
36QString QgsKMeansClusteringAlgorithm::displayName()
const
38 return QObject::tr(
"K-means clustering" );
41QStringList QgsKMeansClusteringAlgorithm::tags()
const
43 return QObject::tr(
"clustering,clusters,kmeans,points" ).split(
',' );
46QString QgsKMeansClusteringAlgorithm::group()
const
48 return QObject::tr(
"Vector analysis" );
51QString QgsKMeansClusteringAlgorithm::groupId()
const
53 return u
"vectoranalysis"_s;
56void QgsKMeansClusteringAlgorithm::initAlgorithm(
const QVariantMap & )
61 QStringList initializationMethods;
62 initializationMethods << QObject::tr(
"Farthest points" )
63 << QObject::tr(
"K-means++" );
64 addParameter(
new QgsProcessingParameterEnum( u
"METHOD"_s, QObject::tr(
"Method" ), initializationMethods,
false, 0,
false ) );
66 auto fieldNameParam = std::make_unique<QgsProcessingParameterString>( u
"FIELD_NAME"_s, QObject::tr(
"Cluster field name" ), u
"CLUSTER_ID"_s );
68 addParameter( fieldNameParam.release() );
69 auto sizeFieldNameParam = std::make_unique<QgsProcessingParameterString>( u
"SIZE_FIELD_NAME"_s, QObject::tr(
"Cluster size field name" ), u
"CLUSTER_SIZE"_s );
71 addParameter( sizeFieldNameParam.release() );
76QString QgsKMeansClusteringAlgorithm::shortHelpString()
const
78 return QObject::tr(
"This algorithm calculates the 2D distance based k-means cluster number for each input feature.\n\n"
79 "If input geometries are lines or polygons, the clustering is based on the centroid of the feature.\n\n"
81 "Arthur, David & Vassilvitskii, Sergei. (2007). K-Means++: The Advantages of Careful Seeding. Proc. of the Annu. ACM-SIAM Symp. on Discrete Algorithms. 8.\n\n"
82 "Bhattacharya, Anup & Eube, Jan & Röglin, Heiko & Schmidt, Melanie. (2019). Noisy, Greedy and Not So Greedy k-means++" );
85QString QgsKMeansClusteringAlgorithm::shortDescription()
const
87 return QObject::tr(
"Calculates the 2D distance based k-means cluster number for each input feature." );
90QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance()
const
92 return new QgsKMeansClusteringAlgorithm();
97 std::unique_ptr<QgsProcessingFeatureSource> source( parameterAsSource( parameters, u
"INPUT"_s, context ) );
101 int k = parameterAsInt( parameters, u
"CLUSTERS"_s, context );
102 int initializationMethod = parameterAsInt( parameters, u
"METHOD"_s, context );
104 QgsFields outputFields = source->fields();
106 const QString clusterFieldName = parameterAsString( parameters, u
"FIELD_NAME"_s, context );
107 newFields.
append(
QgsField( clusterFieldName, QMetaType::Type::Int ) );
108 const QString clusterSizeFieldName = parameterAsString( parameters, u
"SIZE_FIELD_NAME"_s, context );
109 newFields.
append(
QgsField( clusterSizeFieldName, QMetaType::Type::Int ) );
113 std::unique_ptr<QgsFeatureSink> sink( parameterAsSink( parameters, u
"OUTPUT"_s, context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
118 feedback->
pushInfo( QObject::tr(
"Collecting input points" ) );
119 const double step = source->featureCount() > 0 ? 50.0 /
static_cast< double >( source->featureCount() ) : 1;
122 int featureWithGeometryCount = 0;
125 std::vector<Feature> clusterFeatures;
127 QHash<QgsFeatureId, std::size_t> idToObj;
139 featureWithGeometryCount++;
155 idToObj[feat.
id()] = clusterFeatures.size();
156 clusterFeatures.emplace_back( Feature( point ) );
161 feedback->
reportError( QObject::tr(
"Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
167 feedback->
pushInfo( QObject::tr(
"Calculating clusters" ) );
170 std::vector<QgsPointXY> centers( k );
171 switch ( initializationMethod )
174 initClustersFarthestPoints( clusterFeatures, centers, k, feedback );
177 initClustersPlusPlus( clusterFeatures, centers, k, feedback );
182 calculateKMeans( clusterFeatures, centers, k, feedback );
186 std::unordered_map<int, int> clusterSize;
187 for (
auto it = idToObj.constBegin(); it != idToObj.constEnd(); ++it )
189 clusterSize[clusterFeatures[it.value()].cluster]++;
192 features = source->getFeatures();
204 const auto obj = idToObj.find( feat.
id() );
207 attr << QVariant() << QVariant();
211 attr << 0 << featureWithGeometryCount;
215 const int cluster = clusterFeatures[*obj].cluster;
216 attr << cluster << clusterSize[cluster];
226 outputs.insert( u
"OUTPUT"_s, dest );
232void QgsKMeansClusteringAlgorithm::initClustersFarthestPoints( std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers,
const int k,
QgsProcessingFeedback *feedback )
234 const std::size_t n = points.size();
240 for (
int i = 0; i < k; i++ )
241 centers[i] = points[0].point;
245 std::size_t duplicateCount = 1;
249 double distanceP1 = 0;
250 double distanceP2 = 0;
251 double maxDistance = -1;
252 for ( std::size_t i = 1; i < n; i++ )
254 distanceP1 = points[i].point.sqrDist( points[p1].point );
255 distanceP2 = points[i].point.sqrDist( points[p2].point );
258 if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
260 maxDistance = std::max( distanceP1, distanceP2 );
261 if ( distanceP1 > distanceP2 )
272 if ( feedback && duplicateCount > 1 )
274 feedback->
pushWarning( QObject::tr(
"There are at least %n duplicate input(s), the number of output clusters may be less than was requested",
nullptr,
static_cast< int >( duplicateCount ) ) );
281 centers[0] = points[p1].point;
282 centers[1] = points[p2].point;
287 std::vector<double> distances( n );
290 for ( std::size_t j = 0; j < n; j++ )
292 distances[j] = points[j].point.sqrDist( centers[0] );
298 for (
int i = 2; i < k; i++ )
300 std::size_t candidateCenter = 0;
301 double maxDistance = std::numeric_limits<double>::lowest();
304 for ( std::size_t j = 0; j < n; j++ )
307 if ( distances[j] < 0 )
311 distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
314 if ( distances[j] > maxDistance )
317 maxDistance = distances[j];
322 Q_ASSERT( maxDistance >= 0 );
325 distances[candidateCenter] = -1;
327 centers[i] = points[candidateCenter].point;
332void QgsKMeansClusteringAlgorithm::initClustersPlusPlus( std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers,
const int k,
QgsProcessingFeedback *feedback )
334 const std::size_t n = points.size();
340 for (
int i = 0; i < k; i++ )
341 centers[i] = points[0].point;
346 std::random_device rd;
347 std::mt19937 gen( rd() );
348 std::uniform_int_distribution<size_t> distrib( 0, n - 1 );
350 std::size_t p1 = distrib( gen );
351 centers[0] = points[p1].point;
354 std::vector<double> distances( n );
355 double totalError = 0;
356 std::size_t duplicateCount = 1;
357 for (
size_t i = 0; i < n; i++ )
359 double distance = points[i].point.sqrDist( centers[0] );
360 distances[i] = distance;
361 totalError += distance;
367 if ( feedback && duplicateCount > 1 )
369 feedback->
pushWarning( QObject::tr(
"There are at least %n duplicate input(s), the number of output clusters may be less than was requested",
nullptr,
static_cast< int >( duplicateCount ) ) );
377 unsigned int numCandidateCenters = 2 +
static_cast< int >( std::floor( std::log( k ) ) );
378 std::vector<double> randomNumbers( numCandidateCenters );
379 std::vector<size_t> candidateCenters( numCandidateCenters );
381 std::uniform_real_distribution<double> dis( 0.0, 1.0 );
382 for (
int i = 1; i < k; i++ )
385 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
387 randomNumbers[j] = dis( gen ) * totalError;
391 std::vector<double> cumSum = distances;
392 for (
size_t j = 1; j < n; j++ )
394 cumSum[j] += cumSum[j - 1];
398 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
403 while ( low <= high )
405 size_t mid = low + ( high - low ) / 2;
406 if ( cumSum[mid] < randomNumbers[j] )
424 candidateCenters[j] = low;
427 std::vector<std::vector<double>> distancesCandidateCenters( numCandidateCenters, std::vector<double>( n ) );
431 double currentError = 0;
432 double lowestError = std::numeric_limits<double>::max();
433 unsigned int bestCandidateIndex = 0;
434 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
436 for (
size_t z = 0; z < n; z++ )
439 double distance = points[candidateCenters[j]].point.sqrDist( points[z].point );
441 if ( distance > distances[z] )
443 distance = distances[z];
445 distancesCandidateCenters[j][z] = distance;
446 currentError += distance;
448 if ( lowestError > currentError )
450 lowestError = currentError;
451 bestCandidateIndex = j;
456 for (
size_t j = 0; j < n; j++ )
458 distances[j] = distancesCandidateCenters[bestCandidateIndex][j];
461 centers[i] = points[candidateCenters[bestCandidateIndex]].point;
463 totalError = lowestError;
469void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> ¢ers,
int k,
QgsProcessingFeedback *feedback )
471 int converged =
false;
472 bool changed =
false;
475 std::vector<uint> weights( k );
478 for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
483 findNearest( objs, centers, k, changed );
484 updateMeans( objs, centers, weights, k );
485 converged = !changed;
488 if ( !converged && feedback )
489 feedback->
reportError( QObject::tr(
"Clustering did not converge after %n iteration(s)",
nullptr,
static_cast<int>( i ) ) );
491 feedback->
pushInfo( QObject::tr(
"Clustering converged after %n iteration(s)",
nullptr,
static_cast<int>( i ) ) );
496void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points,
const std::vector<QgsPointXY> ¢ers,
const int k,
bool &changed )
499 const std::size_t n = points.size();
500 for ( std::size_t i = 0; i < n; i++ )
502 Feature &point = points[i];
505 double currentDistance = point.point.sqrDist( centers[0] );
506 int currentCluster = 0;
509 for (
int cluster = 1; cluster < k; cluster++ )
511 const double distance = point.point.sqrDist( centers[cluster] );
512 if ( distance < currentDistance )
514 currentDistance = distance;
515 currentCluster = cluster;
520 if ( point.cluster != currentCluster )
523 point.cluster = currentCluster;
530void QgsKMeansClusteringAlgorithm::updateMeans(
const std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers, std::vector<uint> &weights,
const int k )
532 const uint n = points.size();
533 std::fill( weights.begin(), weights.end(), 0 );
534 for (
int i = 0; i < k; i++ )
536 centers[i].setX( 0.0 );
537 centers[i].setY( 0.0 );
539 for ( uint i = 0; i < n; i++ )
541 const int cluster = points[i].cluster;
542 centers[cluster] +=
QgsVector( points[i].point.x(), points[i].point.y() );
543 weights[cluster] += 1;
545 for (
int i = 0; i < k; i++ )
547 centers[i] /= weights[i];
@ VectorAnyGeometry
Any vector layer with geometry.
@ Advanced
Parameter is an advanced parameter which should be hidden from users by default.
Wrapper for iterator of features from vector data provider or vector layer.
bool nextFeature(QgsFeature &f)
Fetch next feature and stores in f, returns true on success.
Wraps a request for features to a vector layer (or directly its vector data provider).
@ FastInsert
Use faster inserts, at the cost of updating the passed features to reflect changes made at the provid...
The feature class encapsulates a single feature including its unique ID, geometry and a list of field...
void setAttributes(const QgsAttributes &attrs)
Sets the feature's attributes.
bool hasGeometry() const
Returns true if the feature has an associated geometry.
bool isCanceled() const
Tells whether the operation has been canceled already.
void setProgress(double progress)
Sets the current progress for the feedback object.
Encapsulate a field in an attribute table or data source.
Container of fields for a vector layer.
bool append(const QgsField &field, Qgis::FieldOrigin origin=Qgis::FieldOrigin::Provider, int originIndex=-1)
Appends a field.
A geometry is the spatial representation of a feature.
const QgsAbstractGeometry * constGet() const
Returns a non-modifiable (const) reference to the underlying abstract geometry primitive.
QgsGeometry centroid() const
Returns the center of mass of a geometry.
Qgis::WkbType wkbType() const
Returns type of the geometry as a WKB type (point / linestring / polygon etc.).
Contains information about the context in which a processing algorithm is executed.
Custom exception class for processing related exceptions.
Base class for providing feedback from a processing algorithm.
virtual void pushInfo(const QString &info)
Pushes a general informational message from the algorithm.
virtual void pushWarning(const QString &warning)
Pushes a warning informational message from the algorithm.
virtual void reportError(const QString &error, bool fatalError=false)
Reports that the algorithm encountered an error while executing.
An enum based parameter for processing algorithms, allowing for selection from predefined values.
A feature sink output for processing algorithms.
An input feature source (such as vector layers) parameter for processing algorithms.
A numeric parameter for processing algorithms.
static QgsFields combineFields(const QgsFields &fieldsA, const QgsFields &fieldsB, const QString &fieldsBPrefix=QString())
Combines two field lists, avoiding duplicate field names (in a case-insensitive manner).
Represent a 2-dimensional vector.
static Qgis::WkbType flatType(Qgis::WkbType type)
Returns the flat type for a WKB type.
bool qgsDoubleNear(double a, double b, double epsilon=4 *std::numeric_limits< double >::epsilon())
Compare two doubles (but allow some difference).
T qgsgeometry_cast(QgsAbstractGeometry *geom)