19 #include <unordered_map>
23 const int KMEANS_MAX_ITERATIONS = 1000;
25 QString QgsKMeansClusteringAlgorithm::name()
const
27 return QStringLiteral(
"kmeansclustering" );
30 QString QgsKMeansClusteringAlgorithm::displayName()
const
32 return QObject::tr(
"K-means clustering" );
35 QStringList QgsKMeansClusteringAlgorithm::tags()
const
37 return QObject::tr(
"clustering,clusters,kmeans,points" ).split(
',' );
40 QString QgsKMeansClusteringAlgorithm::group()
const
42 return QObject::tr(
"Vector analysis" );
45 QString QgsKMeansClusteringAlgorithm::groupId()
const
47 return QStringLiteral(
"vectoranalysis" );
50 void QgsKMeansClusteringAlgorithm::initAlgorithm(
const QVariantMap & )
57 auto fieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral(
"FIELD_NAME" ),
58 QObject::tr(
"Cluster field name" ), QStringLiteral(
"CLUSTER_ID" ) );
60 addParameter( fieldNameParam.release() );
61 auto sizeFieldNameParam = std::make_unique<QgsProcessingParameterString>( QStringLiteral(
"SIZE_FIELD_NAME" ),
62 QObject::tr(
"Cluster size field name" ), QStringLiteral(
"CLUSTER_SIZE" ) );
64 addParameter( sizeFieldNameParam.release() );
69 QString QgsKMeansClusteringAlgorithm::shortHelpString()
const
71 return QObject::tr(
"Calculates the 2D distance based k-means cluster number for each input feature.\n\n"
72 "If input geometries are lines or polygons, the clustering is based on the centroid of the feature." );
75 QgsKMeansClusteringAlgorithm *QgsKMeansClusteringAlgorithm::createInstance()
const
77 return new QgsKMeansClusteringAlgorithm();
82 std::unique_ptr< QgsProcessingFeatureSource > source( parameterAsSource( parameters, QStringLiteral(
"INPUT" ), context ) );
86 int k = parameterAsInt( parameters, QStringLiteral(
"CLUSTERS" ), context );
88 QgsFields outputFields = source->fields();
90 const QString clusterFieldName = parameterAsString( parameters, QStringLiteral(
"FIELD_NAME" ), context );
92 const QString clusterSizeFieldName = parameterAsString( parameters, QStringLiteral(
"SIZE_FIELD_NAME" ), context );
93 newFields.
append(
QgsField( clusterSizeFieldName, QVariant::Int ) );
97 std::unique_ptr< QgsFeatureSink > sink( parameterAsSink( parameters, QStringLiteral(
"OUTPUT" ), context, dest, outputFields, source->wkbType(), source->sourceCrs() ) );
102 feedback->
pushInfo( QObject::tr(
"Collecting input points" ) );
103 const double step = source->featureCount() > 0 ? 50.0 / source->featureCount() : 1;
106 int featureWithGeometryCount = 0;
109 std::vector< Feature > clusterFeatures;
111 QHash< QgsFeatureId, int > idToObj;
123 featureWithGeometryCount++;
139 idToObj[ feat.
id() ] = clusterFeatures.size();
140 clusterFeatures.emplace_back( Feature( point ) );
145 feedback->
reportError( QObject::tr(
"Number of geometries is less than the number of clusters requested, not all clusters will get data" ) );
151 feedback->
pushInfo( QObject::tr(
"Calculating clusters" ) );
154 std::vector< QgsPointXY > centers( k );
156 initClusters( clusterFeatures, centers, k, feedback );
157 calculateKMeans( clusterFeatures, centers, k, feedback );
161 std::unordered_map< int, int> clusterSize;
162 for (
const int obj : idToObj )
163 clusterSize[ clusterFeatures[ obj ].cluster ]++;
165 features = source->getFeatures();
177 const auto obj = idToObj.find( feat.
id() );
180 attr << QVariant() << QVariant();
184 attr << 0 << featureWithGeometryCount;
188 const int cluster = clusterFeatures[ *obj ].cluster;
189 attr << cluster << clusterSize[ cluster ];
197 outputs.insert( QStringLiteral(
"OUTPUT" ), dest );
203 void QgsKMeansClusteringAlgorithm::initClusters( std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers,
const int k,
QgsProcessingFeedback *feedback )
205 const std::size_t n = points.size();
211 for (
int i = 0; i < k; i++ )
212 centers[ i ] = points[ 0 ].point;
216 long duplicateCount = 1;
220 double distanceP1 = 0;
221 double distanceP2 = 0;
222 double maxDistance = -1;
223 for ( std::size_t i = 1; i < n; i++ )
225 distanceP1 = points[i].point.sqrDist( points[p1].point );
226 distanceP2 = points[i].point.sqrDist( points[p2].point );
229 if ( ( distanceP1 > maxDistance ) || ( distanceP2 > maxDistance ) )
231 maxDistance = std::max( distanceP1, distanceP2 );
232 if ( distanceP1 > distanceP2 )
243 if ( feedback && duplicateCount > 1 )
245 feedback->
pushInfo( QObject::tr(
"There are at least %n duplicate input(s), the number of output clusters may be less than was requested",
nullptr, duplicateCount ) );
252 centers[0] = points[p1].point;
253 centers[1] = points[p2].point;
258 std::vector< double > distances( n );
261 for ( std::size_t j = 0; j < n; j++ )
263 distances[j] = points[j].point.sqrDist( centers[0] );
269 for (
int i = 2; i < k; i++ )
271 std::size_t candidateCenter = 0;
272 double maxDistance = std::numeric_limits<double>::lowest();
275 for ( std::size_t j = 0; j < n; j++ )
278 if ( distances[j] < 0 )
282 distances[j] = std::min( points[j].point.sqrDist( centers[i - 1] ), distances[j] );
285 if ( distances[j] > maxDistance )
288 maxDistance = distances[j];
293 Q_ASSERT( maxDistance >= 0 );
296 distances[candidateCenter] = -1;
298 centers[i] = points[candidateCenter].point;
305 void QgsKMeansClusteringAlgorithm::calculateKMeans( std::vector<QgsKMeansClusteringAlgorithm::Feature> &objs, std::vector<QgsPointXY> ¢ers,
int k,
QgsProcessingFeedback *feedback )
307 int converged =
false;
308 bool changed =
false;
311 std::vector< uint > weights( k );
314 for ( i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++ )
319 findNearest( objs, centers, k, changed );
320 updateMeans( objs, centers, weights, k );
321 converged = !changed;
324 if ( !converged && feedback )
325 feedback->
reportError( QObject::tr(
"Clustering did not converge after %n iteration(s)",
nullptr, i ) );
327 feedback->
pushInfo( QObject::tr(
"Clustering converged after %n iteration(s)",
nullptr, i ) );
332 void QgsKMeansClusteringAlgorithm::findNearest( std::vector<QgsKMeansClusteringAlgorithm::Feature> &points,
const std::vector<QgsPointXY> ¢ers,
const int k,
bool &changed )
335 const std::size_t n = points.size();
336 for ( std::size_t i = 0; i < n; i++ )
338 Feature &point = points[i];
341 double currentDistance = point.point.sqrDist( centers[0] );
342 int currentCluster = 0;
345 for (
int cluster = 1; cluster < k; cluster++ )
347 const double distance = point.point.sqrDist( centers[cluster] );
348 if ( distance < currentDistance )
350 currentDistance = distance;
351 currentCluster = cluster;
356 if ( point.cluster != currentCluster )
359 point.cluster = currentCluster;
366 void QgsKMeansClusteringAlgorithm::updateMeans(
const std::vector<Feature> &points, std::vector<QgsPointXY> ¢ers, std::vector<uint> &weights,
const int k )
368 const uint n = points.size();
369 std::fill( weights.begin(), weights.end(), 0 );
370 for (
int i = 0; i < k; i++ )
372 centers[i].setX( 0.0 );
373 centers[i].setY( 0.0 );
375 for ( uint i = 0; i < n; i++ )
377 const int cluster = points[i].cluster;
378 centers[cluster] +=
QgsVector( points[i].point.x(),
379 points[i].point.y() );
380 weights[cluster] += 1;
382 for (
int i = 0; i < k; i++ )
384 centers[i] /= weights[i];