21#include <unordered_map>
25using namespace Qt::StringLiterals;
29constexpr uint KMEANS_MAX_ITERATIONS = 1000;
31QString QgsKMeansClusteringAlgorithm::name()
const
33 return u
"kmeansclustering"_s;
36QString QgsKMeansClusteringAlgorithm::displayName()
const
38 return QObject::tr(
"K-means clustering" );
41QStringList QgsKMeansClusteringAlgorithm::tags()
const
43 return QObject::tr(
"clustering,clusters,kmeans,points" ).split(
',' );
46QString QgsKMeansClusteringAlgorithm::group()
const
48 return QObject::tr(
"Vector analysis" );
51QString QgsKMeansClusteringAlgorithm::groupId()
const
53 return u
"vectoranalysis"_s;
56void QgsKMeansClusteringAlgorithm::initAlgorithm(
const QVariantMap & )
61 QStringList initializationMethods;
62 initializationMethods << QObject::tr(
"Farthest points" ) << QObject::tr(
"K-means++" );
63 addParameter(
new QgsProcessingParameterEnum( u
"METHOD"_s, QObject::tr(
"Method" ), initializationMethods,
false, 0,
false ) );
65 auto fieldNameParam = std::make_unique<QgsProcessingParameterString>( u
"FIELD_NAME"_s, QObject::tr(
"Cluster field name" ), u
"CLUSTER_ID"_s );
67 addParameter( fieldNameParam.release() );
68 auto sizeFieldNameParam = std::make_unique<QgsProcessingParameterString>( u
"SIZE_FIELD_NAME"_s, QObject::tr(
"Cluster size field name" ), u
"CLUSTER_SIZE"_s );
70 addParameter( sizeFieldNameParam.release() );
75QString QgsKMeansClusteringAlgorithm::shortHelpString()
const
78 "This algorithm calculates the 2D distance based k-means cluster number for each input feature.\n\n"
79 "If input geometries are lines or polygons, the clustering is based on the centroid of the feature.\n\n"
81 "Arthur, David & Vassilvitskii, Sergei. (2007). K-Means++: The Advantages of Careful Seeding. Proc. of the Annu. ACM-SIAM Symp. on Discrete Algorithms. 8.\n\n"
82 "Bhattacharya, Anup & Eube, Jan & Röglin, Heiko & Schmidt, Melanie. (2019). Noisy, Greedy and Not So Greedy k-means++"
86QString QgsKMeansClusteringAlgorithm::shortDescription()
const
88 return QObject::tr(
"Calculates the 2D distance based k-means cluster number for each input feature." );
91QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance()
const
93 return new QgsKMeansClusteringAlgorithm();
98 std::unique_ptr<QgsProcessingFeatureSource> source( parameterAsSource( parameters, u
"INPUT"_s, context ) );
102 int k = parameterAsInt( parameters, u
"CLUSTERS"_s, context );
103 int initializationMethod = parameterAsInt( parameters, u
"METHOD"_s, context );
105 QgsFields outputFields = source->fields();
107 const QString clusterFieldName = parameterAsString( parameters, u
"FIELD_NAME"_s, context );
108 newFields.
append(
QgsField( clusterFieldName, QMetaType::Type::Int ) );
109 const QString clusterSizeFieldName = parameterAsString( parameters, u
"SIZE_FIELD_NAME"_s, context );
110 newFields.
append(
QgsField( clusterSizeFieldName, QMetaType::Type::Int ) );
114 std::unique_ptr<QgsFeatureSink> sink( parameterAsSink( parameters, u
"OUTPUT"_s, context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
119 feedback->
pushInfo( QObject::tr(
"Collecting input points" ) );
120 const double step = source->featureCount() > 0 ? 50.0 /
static_cast< double >( source->featureCount() ) : 1;
123 int featureWithGeometryCount = 0;
126 std::vector<Feature> clusterFeatures;
128 QHash<QgsFeatureId, std::size_t> idToObj;
140 featureWithGeometryCount++;
156 idToObj[feat.
id()] = clusterFeatures.size();
157 clusterFeatures.emplace_back( Feature( point ) );
162 feedback->
reportError( QObject::tr(
"Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
168 feedback->
pushInfo( QObject::tr(
"Calculating clusters" ) );
171 std::vector<QgsPointXY> centers( k );
172 switch ( initializationMethod )
175 initClustersFarthestPoints( clusterFeatures, centers, k, feedback );
178 initClustersPlusPlus( clusterFeatures, centers, k, feedback );
183 calculateKMeans( clusterFeatures, centers, k, feedback );
187 std::unordered_map<int, int> clusterSize;
188 for (
auto it = idToObj.constBegin(); it != idToObj.constEnd(); ++it )
190 clusterSize[clusterFeatures[it.value()].cluster]++;
193 features = source->getFeatures();
205 const auto obj = idToObj.find( feat.
id() );
208 attr << QVariant() << QVariant();
212 attr << 0 << featureWithGeometryCount;
216 const int cluster = clusterFeatures[*obj].cluster;
217 attr << cluster << clusterSize[cluster];
227 outputs.insert( u
"OUTPUT"_s, dest );
233void QgsKMeansClusteringAlgorithm::initClustersFarthestPoints( std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers,
const int k,
QgsProcessingFeedback *feedback )
235 const std::size_t n = points.size();
241 for (
int i = 0; i < k; i++ )
242 centers[i] = points[0].point;
246 std::size_t duplicateCount = 1;
250 double distanceP1 = 0;
251 double distanceP2 = 0;
252 double maxDistance = -1;
253 for ( std::size_t i = 1; i < n; i++ )
255 distanceP1 = points[i].point.sqrDist( points[p1].point );
256 distanceP2 = points[i].point.sqrDist( points[p2].point );
259 if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
261 maxDistance = std::max( distanceP1, distanceP2 );
262 if ( distanceP1 > distanceP2 )
273 if ( feedback && duplicateCount > 1 )
275 feedback->
pushWarning( QObject::tr(
"There are at least %n duplicate input(s), the number of output clusters may be less than was requested",
nullptr,
static_cast< int >( duplicateCount ) ) );
282 centers[0] = points[p1].point;
283 centers[1] = points[p2].point;
288 std::vector<double> distances( n );
291 for ( std::size_t j = 0; j < n; j++ )
293 distances[j] = points[j].point.sqrDist( centers[0] );
299 for (
int i = 2; i < k; i++ )
301 std::size_t candidateCenter = 0;
302 double maxDistance = std::numeric_limits<double>::lowest();
305 for ( std::size_t j = 0; j < n; j++ )
308 if ( distances[j] < 0 )
312 distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
315 if ( distances[j] > maxDistance )
318 maxDistance = distances[j];
323 Q_ASSERT( maxDistance >= 0 );
326 distances[candidateCenter] = -1;
328 centers[i] = points[candidateCenter].point;
333void QgsKMeansClusteringAlgorithm::initClustersPlusPlus( std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers,
const int k,
QgsProcessingFeedback *feedback )
335 const std::size_t n = points.size();
341 for (
int i = 0; i < k; i++ )
342 centers[i] = points[0].point;
347 std::random_device rd;
348 std::mt19937 gen( rd() );
349 std::uniform_int_distribution<size_t> distrib( 0, n - 1 );
351 std::size_t p1 = distrib( gen );
352 centers[0] = points[p1].point;
355 std::vector<double> distances( n );
356 double totalError = 0;
357 std::size_t duplicateCount = 1;
358 for (
size_t i = 0; i < n; i++ )
360 double distance = points[i].point.sqrDist( centers[0] );
361 distances[i] = distance;
362 totalError += distance;
368 if ( feedback && duplicateCount > 1 )
370 feedback->
pushWarning( QObject::tr(
"There are at least %n duplicate input(s), the number of output clusters may be less than was requested",
nullptr,
static_cast< int >( duplicateCount ) ) );
378 unsigned int numCandidateCenters = 2 +
static_cast< int >( std::floor( std::log( k ) ) );
379 std::vector<double> randomNumbers( numCandidateCenters );
380 std::vector<size_t> candidateCenters( numCandidateCenters );
382 std::uniform_real_distribution<double> dis( 0.0, 1.0 );
383 for (
int i = 1; i < k; i++ )
386 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
388 randomNumbers[j] = dis( gen ) * totalError;
392 std::vector<double> cumSum = distances;
393 for (
size_t j = 1; j < n; j++ )
395 cumSum[j] += cumSum[j - 1];
399 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
404 while ( low <= high )
406 size_t mid = low + ( high - low ) / 2;
407 if ( cumSum[mid] < randomNumbers[j] )
425 candidateCenters[j] = low;
428 std::vector<std::vector<double>> distancesCandidateCenters( numCandidateCenters, std::vector<double>( n ) );
432 double currentError = 0;
433 double lowestError = std::numeric_limits<double>::max();
434 unsigned int bestCandidateIndex = 0;
435 for (
unsigned int j = 0; j < numCandidateCenters; j++ )
437 for (
size_t z = 0; z < n; z++ )
440 double distance = points[candidateCenters[j]].point.sqrDist( points[z].point );
442 if ( distance > distances[z] )
444 distance = distances[z];
446 distancesCandidateCenters[j][z] = distance;
447 currentError += distance;
449 if ( lowestError > currentError )
451 lowestError = currentError;
452 bestCandidateIndex = j;
457 for (
size_t j = 0; j < n; j++ )
459 distances[j] = distancesCandidateCenters[bestCandidateIndex][j];
462 centers[i] = points[candidateCenters[bestCandidateIndex]].point;
464 totalError = lowestError;
470void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> ¢ers,
int k,
QgsProcessingFeedback *feedback )
472 int converged =
false;
473 bool changed =
false;
476 std::vector<uint> weights( k );
479 for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
484 findNearest( objs, centers, k, changed );
485 updateMeans( objs, centers, weights, k );
486 converged = !changed;
489 if ( !converged && feedback )
490 feedback->
reportError( QObject::tr(
"Clustering did not converge after %n iteration(s)",
nullptr,
static_cast<int>( i ) ) );
492 feedback->
pushInfo( QObject::tr(
"Clustering converged after %n iteration(s)",
nullptr,
static_cast<int>( i ) ) );
497void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points,
const std::vector<QgsPointXY> ¢ers,
const int k,
bool &changed )
500 const std::size_t n = points.size();
501 for ( std::size_t i = 0; i < n; i++ )
503 Feature &point = points[i];
506 double currentDistance = point.point.sqrDist( centers[0] );
507 int currentCluster = 0;
510 for (
int cluster = 1; cluster < k; cluster++ )
512 const double distance = point.point.sqrDist( centers[cluster] );
513 if ( distance < currentDistance )
515 currentDistance = distance;
516 currentCluster = cluster;
521 if ( point.cluster != currentCluster )
524 point.cluster = currentCluster;
531void QgsKMeansClusteringAlgorithm::updateMeans(
const std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers, std::vector<uint> &weights,
const int k )
533 const uint n = points.size();
534 std::fill( weights.begin(), weights.end(), 0 );
535 for (
int i = 0; i < k; i++ )
537 centers[i].setX( 0.0 );
538 centers[i].setY( 0.0 );
540 for ( uint i = 0; i < n; i++ )
542 const int cluster = points[i].cluster;
543 centers[cluster] +=
QgsVector( points[i].point.x(), points[i].point.y() );
544 weights[cluster] += 1;
546 for (
int i = 0; i < k; i++ )
548 centers[i] /= weights[i];
@ VectorAnyGeometry
Any vector layer with geometry.
@ Advanced
Parameter is an advanced parameter which should be hidden from users by default.
Wrapper for iterator of features from vector data provider or vector layer.
bool nextFeature(QgsFeature &f)
Fetch next feature and stores in f, returns true on success.
Wraps a request for features to a vector layer (or directly its vector data provider).
@ FastInsert
Use faster inserts, at the cost of updating the passed features to reflect changes made at the provid...
The feature class encapsulates a single feature including its unique ID, geometry and a list of field...
void setAttributes(const QgsAttributes &attrs)
Sets the feature's attributes.
bool hasGeometry() const
Returns true if the feature has an associated geometry.
bool isCanceled() const
Tells whether the operation has been canceled already.
void setProgress(double progress)
Sets the current progress for the feedback object.
Encapsulate a field in an attribute table or data source.
Container of fields for a vector layer.
bool append(const QgsField &field, Qgis::FieldOrigin origin=Qgis::FieldOrigin::Provider, int originIndex=-1)
Appends a field.
A geometry is the spatial representation of a feature.
const QgsAbstractGeometry * constGet() const
Returns a non-modifiable (const) reference to the underlying abstract geometry primitive.
QgsGeometry centroid() const
Returns the center of mass of a geometry.
Qgis::WkbType wkbType() const
Returns type of the geometry as a WKB type (point / linestring / polygon etc.).
Contains information about the context in which a processing algorithm is executed.
Custom exception class for processing related exceptions.
Base class for providing feedback from a processing algorithm.
virtual void pushInfo(const QString &info)
Pushes a general informational message from the algorithm.
virtual void pushWarning(const QString &warning)
Pushes a warning informational message from the algorithm.
virtual void reportError(const QString &error, bool fatalError=false)
Reports that the algorithm encountered an error while executing.
An enum based parameter for processing algorithms, allowing for selection from predefined values.
A feature sink output for processing algorithms.
An input feature source (such as vector layers) parameter for processing algorithms.
A numeric parameter for processing algorithms.
static QgsFields combineFields(const QgsFields &fieldsA, const QgsFields &fieldsB, const QString &fieldsBPrefix=QString())
Combines two field lists, avoiding duplicate field names (in a case-insensitive manner).
Represent a 2-dimensional vector.
static Qgis::WkbType flatType(Qgis::WkbType type)
Returns the flat type for a WKB type.
bool qgsDoubleNear(double a, double b, double epsilon=4 *std::numeric_limits< double >::epsilon())
Compare two doubles (but allow some difference).
T qgsgeometry_cast(QgsAbstractGeometry *geom)