21#include <unordered_map>
25using namespace Qt::StringLiterals;
29constexpr uint KMEANS_MAX_ITERATIONS = 1000;
31QString QgsKMeansClusteringAlgorithm::name()
const
33 return u
"kmeansclustering"_s;
36QString QgsKMeansClusteringAlgorithm::displayName()
const
38 return QObject::tr(
"K-means clustering" );
41QStringList QgsKMeansClusteringAlgorithm::tags()
const
43 return QObject::tr(
"clustering,clusters,kmeans,points" ).split(
',' );
46QString QgsKMeansClusteringAlgorithm::group()
const
48 return QObject::tr(
"Vector analysis" );
51QString QgsKMeansClusteringAlgorithm::groupId()
const
53 return u
"vectoranalysis"_s;
56void QgsKMeansClusteringAlgorithm::initAlgorithm(
const QVariantMap & )
61 QStringList initializationMethods;
62 initializationMethods << QObject::tr(
"Farthest points" ) << QObject::tr(
"K-means++" );
63 addParameter(
new QgsProcessingParameterEnum( u
"METHOD"_s, QObject::tr(
"Method" ), initializationMethods,
false, 0,
false ) );
65 auto fieldNameParam = std::make_unique<QgsProcessingParameterString>( u
"FIELD_NAME"_s, QObject::tr(
"Cluster field name" ), u
"CLUSTER_ID"_s );
67 addParameter( fieldNameParam.release() );
68 auto sizeFieldNameParam = std::make_unique<QgsProcessingParameterString>( u
"SIZE_FIELD_NAME"_s, QObject::tr(
"Cluster size field name" ), u
"CLUSTER_SIZE"_s );
70 addParameter( sizeFieldNameParam.release() );
75QString QgsKMeansClusteringAlgorithm::shortHelpString()
const
78 "This algorithm calculates the 2D distance based k-means cluster number for each input feature.\n\n"
79 "If input geometries are lines or polygons, the clustering is based on the centroid of the feature.\n\n"
81 "Arthur, David & Vassilvitskii, Sergei. (2007). K-Means++: The Advantages of Careful Seeding. Proc. of the Annu. ACM-SIAM Symp. on Discrete Algorithms. 8.\n\n"
82 "Bhattacharya, Anup & Eube, Jan & Röglin, Heiko & Schmidt, Melanie. (2019). Noisy, Greedy and Not So Greedy k-means++"
86QString QgsKMeansClusteringAlgorithm::shortDescription()
const
88 return QObject::tr(
"Calculates the 2D distance based k-means cluster number for each input feature." );
91QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance()
const
93 return new QgsKMeansClusteringAlgorithm();
98 std::unique_ptr<QgsProcessingFeatureSource> source( parameterAsSource( parameters, u
"INPUT"_s, context ) );
102 int k = parameterAsInt( parameters, u
"CLUSTERS"_s, context );
103 int initializationMethod = parameterAsInt( parameters, u
"METHOD"_s, context );
105 QgsFields outputFields = source->fields();
107 const QString clusterFieldName = parameterAsString( parameters, u
"FIELD_NAME"_s, context );
108 newFields.
append(
QgsField( clusterFieldName, QMetaType::Type::Int ) );
109 const QString clusterSizeFieldName = parameterAsString( parameters, u
"SIZE_FIELD_NAME"_s, context );
110 newFields.
append(
QgsField( clusterSizeFieldName, QMetaType::Type::Int ) );
114 std::unique_ptr<QgsFeatureSink> sink( parameterAsSink( parameters, u
"OUTPUT"_s, context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
119 feedback->
pushInfo( QObject::tr(
"Collecting input points" ) );
120 const double step = source->featureCount() > 0 ? 50.0 /
static_cast< double >( source->featureCount() ) : 1;
123 int featureWithGeometryCount = 0;
126 std::vector<Feature> clusterFeatures;
128 QHash<QgsFeatureId, std::size_t> idToObj;
140 featureWithGeometryCount++;
156 idToObj[feat.
id()] = clusterFeatures.size();
157 clusterFeatures.emplace_back( Feature( point ) );
162 feedback->
reportError( QObject::tr(
"Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
168 feedback->
pushInfo( QObject::tr(
"Calculating clusters" ) );
171 std::vector<QgsPointXY> centers( k );
172 switch ( initializationMethod )
175 initClustersFarthestPoints( clusterFeatures, centers, k, feedback );
178 initClustersPlusPlus( clusterFeatures, centers, k, feedback );
183 calculateKMeans( clusterFeatures, centers, k, feedback );
187 std::unordered_map<int, int> clusterSize;
188 for (
auto it = idToObj.constBegin(); it != idToObj.constEnd(); ++it )
190 clusterSize[clusterFeatures[it.value()].cluster]++;
193 features = source->getFeatures();
205 const auto obj = idToObj.find( feat.
id() );
208 attr << QVariant() << QVariant();
212 attr << 0 << featureWithGeometryCount;
216 const int cluster = clusterFeatures[*obj].cluster;
217 attr << cluster << clusterSize[cluster];
230 outputs.insert( u
"OUTPUT"_s, dest );
236void QgsKMeansClusteringAlgorithm::initClustersFarthestPoints( std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers,
const int k,
QgsProcessingFeedback *feedback )
238 const std::size_t n = points.size();
244 for (
int i = 0; i < k; i++ )
245 centers[i] = points[0].point;
249 std::size_t duplicateCount = 1;
253 double distanceP1 = 0;
254 double distanceP2 = 0;
255 double maxDistance = -1;
256 for ( std::size_t i = 1; i < n; i++ )
258 distanceP1 = points[i].point.sqrDist( points[p1].point );
259 distanceP2 = points[i].point.sqrDist( points[p2].point );
262 if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
264 maxDistance = std::max( distanceP1, distanceP2 );
265 if ( distanceP1 > distanceP2 )
276 if ( feedback && duplicateCount > 1 )
278 feedback->
pushWarning( QObject::tr(
"There are at least %n duplicate input(s), the number of output clusters may be less than was requested",
nullptr,
static_cast< int >( duplicateCount ) ) );
285 centers[0] = points[p1].point;
286 centers[1] = points[p2].point;
291 std::vector<double> distances( n );
294 for ( std::size_t j = 0; j < n; j++ )
296 distances[j] = points[j].point.sqrDist( centers[0] );
302 for (
int i = 2; i < k; i++ )
304 std::size_t candidateCenter = 0;
305 double maxDistance = std::numeric_limits<double>::lowest();
308 for ( std::size_t j = 0; j < n; j++ )
311 if ( distances[j] < 0 )
315 distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
318 if ( distances[j] > maxDistance )
321 maxDistance = distances[j];
326 Q_ASSERT( maxDistance >= 0 );
329 distances[candidateCenter] = -1;
331 centers[i] = points[candidateCenter].point;
336void QgsKMeansClusteringAlgorithm::initClustersPlusPlus( std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers,
const int k,
QgsProcessingFeedback *feedback )
338 const std::size_t n = points.size();
344 for (
int i = 0; i < k; i++ )
345 centers[i] = points[0].point;
350 std::random_device rd;
351 std::mt19937 gen( rd() );
352 std::uniform_int_distribution<size_t> distrib( 0, n - 1 );
354 std::size_t p1 = distrib( gen );
355 centers[0] = points[p1].point;
358 std::vector<double> distances( n );
359 double totalError = 0;
360 std::size_t duplicateCount = 1;
361 for (
size_t i = 0; i < n; i++ )
363 double distance = points[i].point.sqrDist( centers[0] );
364 distances[i] = distance;
365 totalError += distance;
371 if ( feedback && duplicateCount > 1 )
373 feedback->
pushWarning( QObject::tr(
"There are at least %n duplicate input(s), the number of output clusters may be less than was requested",
nullptr,
static_cast< int >( duplicateCount ) ) );
381 unsigned int numCandidateCenters = 2 +
static_cast< int >( std::floor( std::log( k ) ) );
382 std::vector<double> randomNumbers( numCandidateCenters );
383 std::vector<size_t> candidateCenters( numCandidateCenters );
385 std::uniform_real_distribution<double> dis( 0.0, 1.0 );
386 for (
int i = 1; i < k; i++ )
389 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
391 randomNumbers[j] = dis( gen ) * totalError;
395 std::vector<double> cumSum = distances;
396 for (
size_t j = 1; j < n; j++ )
398 cumSum[j] += cumSum[j - 1];
402 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
407 while ( low <= high )
409 size_t mid = low + ( high - low ) / 2;
410 if ( cumSum[mid] < randomNumbers[j] )
428 candidateCenters[j] = low;
431 std::vector<std::vector<double>> distancesCandidateCenters( numCandidateCenters, std::vector<double>( n ) );
435 double currentError = 0;
436 double lowestError = std::numeric_limits<double>::max();
437 unsigned int bestCandidateIndex = 0;
438 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
440 for (
size_t z = 0; z < n; z++ )
443 double distance = points[candidateCenters[j]].point.sqrDist( points[z].point );
445 if ( distance > distances[z] )
447 distance = distances[z];
449 distancesCandidateCenters[j][z] = distance;
450 currentError += distance;
452 if ( lowestError > currentError )
454 lowestError = currentError;
455 bestCandidateIndex = j;
460 for (
size_t j = 0; j < n; j++ )
462 distances[j] = distancesCandidateCenters[bestCandidateIndex][j];
465 centers[i] = points[candidateCenters[bestCandidateIndex]].point;
467 totalError = lowestError;
473void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> ¢ers,
int k,
QgsProcessingFeedback *feedback )
475 int converged =
false;
476 bool changed =
false;
479 std::vector<uint> weights( k );
482 for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
487 findNearest( objs, centers, k, changed );
488 updateMeans( objs, centers, weights, k );
489 converged = !changed;
492 if ( !converged && feedback )
493 feedback->
reportError( QObject::tr(
"Clustering did not converge after %n iteration(s)",
nullptr,
static_cast<int>( i ) ) );
495 feedback->
pushInfo( QObject::tr(
"Clustering converged after %n iteration(s)",
nullptr,
static_cast<int>( i ) ) );
500void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points,
const std::vector<QgsPointXY> ¢ers,
const int k,
bool &changed )
503 const std::size_t n = points.size();
504 for ( std::size_t i = 0; i < n; i++ )
506 Feature &point = points[i];
509 double currentDistance = point.point.sqrDist( centers[0] );
510 int currentCluster = 0;
513 for (
int cluster = 1; cluster < k; cluster++ )
515 const double distance = point.point.sqrDist( centers[cluster] );
516 if ( distance < currentDistance )
518 currentDistance = distance;
519 currentCluster = cluster;
524 if ( point.cluster != currentCluster )
527 point.cluster = currentCluster;
534void QgsKMeansClusteringAlgorithm::updateMeans(
const std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers, std::vector<uint> &weights,
const int k )
536 const uint n = points.size();
537 std::fill( weights.begin(), weights.end(), 0 );
538 for (
int i = 0; i < k; i++ )
540 centers[i].setX( 0.0 );
541 centers[i].setY( 0.0 );
543 for ( uint i = 0; i < n; i++ )
545 const int cluster = points[i].cluster;
546 centers[cluster] +=
QgsVector( points[i].point.x(), points[i].point.y() );
547 weights[cluster] += 1;
549 for (
int i = 0; i < k; i++ )
551 centers[i] /= weights[i];
@ VectorAnyGeometry
Any vector layer with geometry.
@ Advanced
Parameter is an advanced parameter which should be hidden from users by default.
Wrapper for iterator of features from vector data provider or vector layer.
bool nextFeature(QgsFeature &f)
Fetch next feature and stores in f, returns true on success.
Wraps a request for features to a vector layer (or directly its vector data provider).
@ FastInsert
Use faster inserts, at the cost of updating the passed features to reflect changes made at the provid...
The feature class encapsulates a single feature including its unique ID, geometry and a list of field...
void setAttributes(const QgsAttributes &attrs)
Sets the feature's attributes.
bool hasGeometry() const
Returns true if the feature has an associated geometry.
bool isCanceled() const
Tells whether the operation has been canceled already.
void setProgress(double progress)
Sets the current progress for the feedback object.
Encapsulate a field in an attribute table or data source.
Container of fields for a vector layer.
bool append(const QgsField &field, Qgis::FieldOrigin origin=Qgis::FieldOrigin::Provider, int originIndex=-1)
Appends a field.
A geometry is the spatial representation of a feature.
const QgsAbstractGeometry * constGet() const
Returns a non-modifiable (const) reference to the underlying abstract geometry primitive.
QgsGeometry centroid() const
Returns the center of mass of a geometry.
Qgis::WkbType wkbType() const
Returns type of the geometry as a WKB type (point / linestring / polygon etc.).
Contains information about the context in which a processing algorithm is executed.
Custom exception class for processing related exceptions.
Base class for providing feedback from a processing algorithm.
void featureAddedToSink(const QString &output)
Reports that a feature was added to the the sink associated with the specified algorithm output.
virtual void pushInfo(const QString &info)
Pushes a general informational message from the algorithm.
virtual void pushWarning(const QString &warning)
Pushes a warning informational message from the algorithm.
void featureSinkFinalized(const QString &output)
Reports that a feature sink has been finalized.
virtual void reportError(const QString &error, bool fatalError=false)
Reports that the algorithm encountered an error while executing.
An enum based parameter for processing algorithms, allowing for selection from predefined values.
A feature sink output for processing algorithms.
An input feature source (such as vector layers) parameter for processing algorithms.
A numeric parameter for processing algorithms.
static QgsFields combineFields(const QgsFields &fieldsA, const QgsFields &fieldsB, const QString &fieldsBPrefix=QString())
Combines two field lists, avoiding duplicate field names (in a case-insensitive manner).
Represent a 2-dimensional vector.
static Qgis::WkbType flatType(Qgis::WkbType type)
Returns the flat type for a WKB type.
bool qgsDoubleNear(double a, double b, double epsilon=4 *std::numeric_limits< double >::epsilon())
Compare two doubles (but allow some difference).
T qgsgeometry_cast(QgsAbstractGeometry *geom)