19 #include <unordered_map>
23 const int KMEANS_MAX_ITERATIONS = 1000;
25 QString QgsKMeansClusteringAlgorithm::name()
const
27 return QStringLiteral(
"kmeansclustering" );
30 QString QgsKMeansClusteringAlgorithm::displayName()
const
32 return QObject::tr(
"K-means clustering" );
35 QStringList QgsKMeansClusteringAlgorithm::tags()
const
37 return QObject::tr(
"clustering,clusters,kmeans,points" ).split(
',' );
40 QString QgsKMeansClusteringAlgorithm::group()
const
42 return QObject::tr(
"Vector analysis" );
45 QString QgsKMeansClusteringAlgorithm::groupId()
const
47 return QStringLiteral(
"vectoranalysis" );
50 void QgsKMeansClusteringAlgorithm::initAlgorithm(
const QVariantMap & )
57 auto fieldNameParam = qgis::make_unique<QgsProcessingParameterString>( QStringLiteral(
"FIELD_NAME" ),
58 QObject::tr(
"Cluster field name" ), QStringLiteral(
"CLUSTER_ID" ) );
60 addParameter( fieldNameParam.release() );
61 auto sizeFieldNameParam = qgis::make_unique<QgsProcessingParameterString>( QStringLiteral(
"SIZE_FIELD_NAME" ),
62 QObject::tr(
"Cluster size field name" ), QStringLiteral(
"CLUSTER_SIZE" ) );
64 addParameter( sizeFieldNameParam.release() );
69 QString QgsKMeansClusteringAlgorithm::shortHelpString()
const
71 return QObject::tr(
"Calculates the 2D distance based k-means cluster number for each input feature.\n\n"
72 "If input geometries are lines or polygons, the clustering is based on the centroid of the feature." );
75 QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance()
const
77 return new QgsKMeansClusteringAlgorithm();
82 std::unique_ptr< QgsProcessingFeatureSource > source( parameterAsSource( parameters, QStringLiteral(
"INPUT" ), context ) );
86 int k = parameterAsInt( parameters, QStringLiteral(
"CLUSTERS" ), context );
88 QgsFields outputFields = source->fields();
90 const QString clusterFieldName = parameterAsString( parameters, QStringLiteral(
"FIELD_NAME" ), context );
92 const QString clusterSizeFieldName = parameterAsString( parameters, QStringLiteral(
"SIZE_FIELD_NAME" ), context );
93 newFields.
append(
QgsField( clusterSizeFieldName, QVariant::Int ) );
97 std::unique_ptr< QgsFeatureSink > sink( parameterAsSink( parameters, QStringLiteral(
"OUTPUT" ), context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
102 feedback->
pushInfo( QObject::tr(
"Collecting input points" ) );
103 double step = source->featureCount() > 0 ? 50.0 / source->featureCount() : 1;
106 int featureWithGeometryCount = 0;
109 std::vector< Feature > clusterFeatures;
111 QHash< QgsFeatureId, int > idToObj;
123 featureWithGeometryCount++;
139 idToObj[ feat.
id() ] = clusterFeatures.size();
140 clusterFeatures.emplace_back( Feature( point ) );
145 feedback->
reportError( QObject::tr(
"Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
151 feedback->
pushInfo( QObject::tr(
"Calculating clusters" ) );
154 std::vector< QgsPointXY > centers( k );
156 initClusters( clusterFeatures, centers, k, feedback );
157 calculateKMeans( clusterFeatures, centers, k, feedback );
161 std::unordered_map< int, int> clusterSize;
162 for (
int obj : idToObj )
163 clusterSize[ clusterFeatures[ obj ].cluster ]++;
165 features = source->getFeatures();
177 auto obj = idToObj.find( feat.
id() );
180 attr << QVariant() << QVariant();
184 attr << 0 << featureWithGeometryCount;
188 int cluster = clusterFeatures[ *obj ].cluster;
189 attr << cluster << clusterSize[ cluster ];
196 outputs.insert( QStringLiteral(
"OUTPUT" ), dest );
202 void QgsKMeansClusteringAlgorithm::initClusters( std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers,
const int k,
QgsProcessingFeedback *feedback )
204 std::size_t n = points.size();
210 for (
int i = 0; i < k; i++ )
211 centers[ i ] = points[ 0 ].point;
215 long duplicateCount = 1;
219 double distanceP1 = 0;
220 double distanceP2 = 0;
221 double maxDistance = -1;
222 for ( std::size_t i = 1; i < n; i++ )
224 distanceP1 = points[i].point.sqrDist( points[p1].point );
225 distanceP2 = points[i].point.sqrDist( points[p2].point );
228 if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
230 maxDistance = std::max( distanceP1, distanceP2 );
231 if ( distanceP1 > distanceP2 )
242 if ( feedback && duplicateCount > 1 )
244 feedback->
pushInfo( QObject::tr(
"There are at least %1 duplicate inputs, the number of output clusters may be less than was requested" ).arg( duplicateCount ) );
251 centers[0] = points[p1].point;
252 centers[1] = points[p2].point;
257 std::vector< double > distances( n );
260 for ( std::size_t j = 0; j < n; j++ )
262 distances[j] = points[j].point.sqrDist( centers[0] );
268 for (
int i = 2; i < k; i++ )
270 std::size_t candidateCenter = 0;
271 double maxDistance = std::numeric_limits<double>::lowest();
274 for ( std::size_t j = 0; j < n; j++ )
277 if ( distances[j] < 0 )
281 distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
284 if ( distances[j] > maxDistance )
287 maxDistance = distances[j];
292 Q_ASSERT( maxDistance >= 0 );
295 distances[candidateCenter] = -1;
297 centers[i] = points[candidateCenter].point;
304 void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> ¢ers,
int k,
QgsProcessingFeedback *feedback )
306 int converged =
false;
307 bool changed =
false;
310 std::vector< uint > weights( k );
313 for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
318 findNearest( objs, centers, k, changed );
319 updateMeans( objs, centers, weights, k );
320 converged = !changed;
323 if ( !converged && feedback )
324 feedback->
reportError( QObject::tr(
"Clustering did not converge after %1 iterations" ).arg( i ) );
326 feedback->
pushInfo( QObject::tr(
"Clustering converged after %1 iterations" ).arg( i ) );
331 void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points,
const std::vector<QgsPointXY> ¢ers,
const int k,
bool &changed )
334 std::size_t n = points.size();
335 for ( std::size_t i = 0; i < n; i++ )
337 Feature &point = points[i];
340 double currentDistance = point.point.sqrDist( centers[0] );
341 int currentCluster = 0;
344 for (
int cluster = 1; cluster < k; cluster++ )
346 const double distance = point.point.sqrDist( centers[cluster] );
347 if ( distance < currentDistance )
349 currentDistance = distance;
350 currentCluster = cluster;
355 if ( point.cluster != currentCluster )
358 point.cluster = currentCluster;
365 void QgsKMeansClusteringAlgorithm::updateMeans(
const std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers, std::vector<uint> &weights,
const int k )
367 uint n = points.size();
368 std::fill( weights.begin(), weights.end(), 0 );
369 for (
int i = 0; i < k; i++ )
371 centers[i].setX( 0.0 );
372 centers[i].setY( 0.0 );
374 for ( uint i = 0; i < n; i++ )
376 int cluster = points[i].cluster;
377 centers[cluster] +=
QgsVector( points[i].point.x(),
378 points[i].point.y() );
379 weights[cluster] += 1;
381 for (
int i = 0; i < k; i++ )
383 centers[i] /= weights[i];