QGIS API Documentation 3.99.0-Master (d270888f95f)
Loading...
Searching...
No Matches
qgsstringutils.cpp
Go to the documentation of this file.
1/***************************************************************************
2 qgsstringutils.cpp
3 ------------------
4 begin : June 2015
5 copyright : (C) 2015 by Nyall Dawson
6 email : nyall dot dawson at gmail dot com
7 ***************************************************************************
8 * *
9 * This program is free software; you can redistribute it and/or modify *
10 * it under the terms of the GNU General Public License as published by *
11 * the Free Software Foundation; either version 2 of the License, or *
12 * (at your option) any later version. *
13 * *
14 ***************************************************************************/
15
16#include "qgsstringutils.h"
17
18#include <cstdlib>
19
20#include "qgslogger.h"
21
22#include <QRegularExpression>
23#include <QString>
24#include <QStringList>
25#include <QTextBoundaryFinder>
26#include <QVector>
27
28using namespace Qt::StringLiterals;
29
31
32QString QgsStringUtils::unaccent( const QString &input )
33{
34 // Normalize input to NFC so that Unicode characters composed of base +
35 // combining marks are converted to their canonical composed form.
36 // This ensures lookups match the keys in UNACCENT_MAP, which are stored
37 // in NFC (e.g. "e" + U+0301 becomes "é", as PostgreSQL does it.).
38 const QString in = input.normalized( QString::NormalizationForm_C );
39 QString out;
40 out.reserve( in.size() );
41
42 qsizetype i = 0;
43 const qsizetype n = in.size();
44
45 while ( i < n )
46 {
47 const QChar c = in.at( i );
48 int len = 1;
49
50 // Detect surrogate pair (non-BMP)
51 if ( c.isHighSurrogate() && i + 1 < n )
52 {
53 const QChar c2 = in.at( i + 1 );
54 if ( c2.isLowSurrogate() )
55 len = 2;
56 }
57
58 const QString key = in.mid( i, len ).normalized( QString::NormalizationForm_C );
59
60 auto it = UNACCENT_MAP.constFind( key );
61 if ( it != UNACCENT_MAP.constEnd() )
62 out.append( it.value() );
63 else
64 out.append( key );
65
66 i += len;
67 }
68
69 return out;
70}
71
72QString QgsStringUtils::capitalize( const QString &string, Qgis::Capitalization capitalization )
73{
74 if ( string.isEmpty() )
75 return QString();
76
77 switch ( capitalization )
78 {
81 return string;
82
84 return string.toUpper();
85
88 return string.toLower();
89
91 {
92 QString temp = string;
93
94 QTextBoundaryFinder wordSplitter( QTextBoundaryFinder::Word, string.constData(), string.length(), nullptr, 0 );
95 QTextBoundaryFinder letterSplitter( QTextBoundaryFinder::Grapheme, string.constData(), string.length(), nullptr, 0 );
96
97 wordSplitter.setPosition( 0 );
98 bool first = true;
99 while ( ( first && wordSplitter.boundaryReasons() & QTextBoundaryFinder::StartOfItem )
100 || wordSplitter.toNextBoundary() >= 0 )
101 {
102 first = false;
103 letterSplitter.setPosition( wordSplitter.position() );
104 ( void )letterSplitter.toNextBoundary();
105 QString substr = string.mid( wordSplitter.position(), letterSplitter.position() - wordSplitter.position() );
106 temp.replace( wordSplitter.position(), substr.length(), substr.toUpper() );
107 }
108 return temp;
109 }
110
112 {
113 // yes, this is MASSIVELY simplifying the problem!!
114
115 static QStringList smallWords;
116 static QStringList newPhraseSeparators;
117 static QRegularExpression splitWords;
118 if ( smallWords.empty() )
119 {
120 smallWords = QObject::tr( "a|an|and|as|at|but|by|en|for|if|in|nor|of|on|or|per|s|the|to|vs.|vs|via" ).split( '|' );
121 newPhraseSeparators = QObject::tr( ".|:" ).split( '|' );
122 splitWords = QRegularExpression( u"\\b"_s, QRegularExpression::UseUnicodePropertiesOption );
123 }
124
125 const bool allSameCase = string.toLower() == string || string.toUpper() == string;
126 const QStringList parts = ( allSameCase ? string.toLower() : string ).split( splitWords, Qt::SkipEmptyParts );
127 QString result;
128 bool firstWord = true;
129 int i = 0;
130 int lastWord = parts.count() - 1;
131 for ( const QString &word : std::as_const( parts ) )
132 {
133 if ( newPhraseSeparators.contains( word.trimmed() ) )
134 {
135 firstWord = true;
136 result += word;
137 }
138 else if ( firstWord || ( i == lastWord ) || !smallWords.contains( word ) )
139 {
140 result += word.at( 0 ).toUpper() + word.mid( 1 );
141 firstWord = false;
142 }
143 else
144 {
145 result += word;
146 }
147 i++;
148 }
149 return result;
150 }
151
153 QString result = QgsStringUtils::capitalize( string.toLower(), Qgis::Capitalization::ForceFirstLetterToCapital ).simplified();
154 result.remove( ' ' );
155 return result;
156 }
157 // no warnings
158 return string;
159}
160
161// original code from http://www.qtcentre.org/threads/52456-HTML-Unicode-ampersand-encoding
162QString QgsStringUtils::ampersandEncode( const QString &string )
163{
164 QString encoded;
165 for ( int i = 0; i < string.size(); ++i )
166 {
167 QChar ch = string.at( i );
168 if ( ch.unicode() > 160 )
169 encoded += u"&#%1;"_s.arg( static_cast< int >( ch.unicode() ) );
170 else if ( ch.unicode() == 38 )
171 encoded += "&amp;"_L1;
172 else if ( ch.unicode() == 60 )
173 encoded += "&lt;"_L1;
174 else if ( ch.unicode() == 62 )
175 encoded += "&gt;"_L1;
176 else
177 encoded += ch;
178 }
179 return encoded;
180}
181
182int QgsStringUtils::levenshteinDistance( const QString &string1, const QString &string2, bool caseSensitive )
183{
184 int length1 = string1.length();
185 int length2 = string2.length();
186
187 //empty strings? solution is trivial...
188 if ( string1.isEmpty() )
189 {
190 return length2;
191 }
192 else if ( string2.isEmpty() )
193 {
194 return length1;
195 }
196
197 //handle case sensitive flag (or not)
198 QString s1( caseSensitive ? string1 : string1.toLower() );
199 QString s2( caseSensitive ? string2 : string2.toLower() );
200
201 const QChar *s1Char = s1.constData();
202 const QChar *s2Char = s2.constData();
203
204 //strip out any common prefix
205 int commonPrefixLen = 0;
206 while ( length1 > 0 && length2 > 0 && *s1Char == *s2Char )
207 {
208 commonPrefixLen++;
209 length1--;
210 length2--;
211 s1Char++;
212 s2Char++;
213 }
214
215 //strip out any common suffix
216 while ( length1 > 0 && length2 > 0 && s1.at( commonPrefixLen + length1 - 1 ) == s2.at( commonPrefixLen + length2 - 1 ) )
217 {
218 length1--;
219 length2--;
220 }
221
222 //fully checked either string? if so, the answer is easy...
223 if ( length1 == 0 )
224 {
225 return length2;
226 }
227 else if ( length2 == 0 )
228 {
229 return length1;
230 }
231
232 //ensure the inner loop is longer
233 if ( length1 > length2 )
234 {
235 std::swap( s1, s2 );
236 std::swap( length1, length2 );
237 }
238
239 //levenshtein algorithm begins here
240 std::vector< int > col( length2 + 1, 0 );
241 std::vector< int > prevCol;
242 prevCol.reserve( length2 + 1 );
243 for ( int i = 0; i < length2 + 1; ++i )
244 {
245 prevCol.emplace_back( i );
246 }
247 const QChar *s2start = s2Char;
248 for ( int i = 0; i < length1; ++i )
249 {
250 col[0] = i + 1;
251 s2Char = s2start;
252 for ( int j = 0; j < length2; ++j )
253 {
254 col[j + 1] = std::min( std::min( 1 + col[j], 1 + prevCol[1 + j] ), prevCol[j] + ( ( *s1Char == *s2Char ) ? 0 : 1 ) );
255 s2Char++;
256 }
257 col.swap( prevCol );
258 s1Char++;
259 }
260 return prevCol[length2];
261}
262
263QString QgsStringUtils::longestCommonSubstring( const QString &string1, const QString &string2, bool caseSensitive )
264{
265 if ( string1.isEmpty() || string2.isEmpty() )
266 {
267 //empty strings, solution is trivial...
268 return QString();
269 }
270
271 //handle case sensitive flag (or not)
272 QString s1( caseSensitive ? string1 : string1.toLower() );
273 QString s2( caseSensitive ? string2 : string2.toLower() );
274
275 if ( s1 == s2 )
276 {
277 //another trivial case, identical strings
278 return s1;
279 }
280
281 int *currentScores = new int [ s2.length()];
282 int *previousScores = new int [ s2.length()];
283 int maxCommonLength = 0;
284 int lastMaxBeginIndex = 0;
285
286 const QChar *s1Char = s1.constData();
287 const QChar *s2Char = s2.constData();
288 const QChar *s2Start = s2Char;
289
290 for ( int i = 0; i < s1.length(); ++i )
291 {
292 for ( int j = 0; j < s2.length(); ++j )
293 {
294 if ( *s1Char != *s2Char )
295 {
296 currentScores[j] = 0;
297 }
298 else
299 {
300 if ( i == 0 || j == 0 )
301 {
302 currentScores[j] = 1;
303 }
304 else
305 {
306 currentScores[j] = 1 + previousScores[j - 1];
307 }
308
309 if ( maxCommonLength < currentScores[j] )
310 {
311 maxCommonLength = currentScores[j];
312 lastMaxBeginIndex = i;
313 }
314 }
315 s2Char++;
316 }
317 std::swap( currentScores, previousScores );
318 s1Char++;
319 s2Char = s2Start;
320 }
321 delete [] currentScores;
322 delete [] previousScores;
323 return string1.mid( lastMaxBeginIndex - maxCommonLength + 1, maxCommonLength );
324}
325
326int QgsStringUtils::hammingDistance( const QString &string1, const QString &string2, bool caseSensitive )
327{
328 if ( string1.isEmpty() && string2.isEmpty() )
329 {
330 //empty strings, solution is trivial...
331 return 0;
332 }
333
334 if ( string1.length() != string2.length() )
335 {
336 //invalid inputs
337 return -1;
338 }
339
340 //handle case sensitive flag (or not)
341 QString s1( caseSensitive ? string1 : string1.toLower() );
342 QString s2( caseSensitive ? string2 : string2.toLower() );
343
344 if ( s1 == s2 )
345 {
346 //another trivial case, identical strings
347 return 0;
348 }
349
350 int distance = 0;
351 const QChar *s1Char = s1.constData();
352 const QChar *s2Char = s2.constData();
353
354 for ( int i = 0; i < string1.length(); ++i )
355 {
356 if ( *s1Char != *s2Char )
357 distance++;
358 s1Char++;
359 s2Char++;
360 }
361
362 return distance;
363}
364
365QString QgsStringUtils::soundex( const QString &string )
366{
367 if ( string.isEmpty() )
368 return QString();
369
370 QString tmp = string.toUpper();
371
372 //strip non character codes, and vowel like characters after the first character
373 QChar *char1 = tmp.data();
374 QChar *char2 = tmp.data();
375 int outLen = 0;
376 for ( int i = 0; i < tmp.length(); ++i, ++char2 )
377 {
378 if ( ( *char2 ).unicode() >= 0x41 && ( *char2 ).unicode() <= 0x5A && ( i == 0 || ( ( *char2 ).unicode() != 0x41 && ( *char2 ).unicode() != 0x45
379 && ( *char2 ).unicode() != 0x48 && ( *char2 ).unicode() != 0x49
380 && ( *char2 ).unicode() != 0x4F && ( *char2 ).unicode() != 0x55
381 && ( *char2 ).unicode() != 0x57 && ( *char2 ).unicode() != 0x59 ) ) )
382 {
383 *char1 = *char2;
384 char1++;
385 outLen++;
386 }
387 }
388 tmp.truncate( outLen );
389
390 QChar *tmpChar = tmp.data();
391 tmpChar++;
392 for ( int i = 1; i < tmp.length(); ++i, ++tmpChar )
393 {
394 switch ( ( *tmpChar ).unicode() )
395 {
396 case 0x42:
397 case 0x46:
398 case 0x50:
399 case 0x56:
400 tmp.replace( i, 1, QChar( 0x31 ) );
401 break;
402
403 case 0x43:
404 case 0x47:
405 case 0x4A:
406 case 0x4B:
407 case 0x51:
408 case 0x53:
409 case 0x58:
410 case 0x5A:
411 tmp.replace( i, 1, QChar( 0x32 ) );
412 break;
413
414 case 0x44:
415 case 0x54:
416 tmp.replace( i, 1, QChar( 0x33 ) );
417 break;
418
419 case 0x4C:
420 tmp.replace( i, 1, QChar( 0x34 ) );
421 break;
422
423 case 0x4D:
424 case 0x4E:
425 tmp.replace( i, 1, QChar( 0x35 ) );
426 break;
427
428 case 0x52:
429 tmp.replace( i, 1, QChar( 0x36 ) );
430 break;
431 }
432 }
433
434 //remove adjacent duplicates
435 char1 = tmp.data();
436 char2 = tmp.data();
437 char2++;
438 outLen = 1;
439 for ( int i = 1; i < tmp.length(); ++i, ++char2 )
440 {
441 if ( *char2 != *char1 )
442 {
443 char1++;
444 *char1 = *char2;
445 outLen++;
446 if ( outLen == 4 )
447 break;
448 }
449 }
450 tmp.truncate( outLen );
451 if ( tmp.length() < 4 )
452 {
453 tmp.append( "000" );
454 tmp.truncate( 4 );
455 }
456
457 return tmp;
458}
459
460
461double QgsStringUtils::fuzzyScore( const QString &candidate, const QString &search )
462{
463 QString candidateNormalized = candidate.simplified().normalized( QString:: NormalizationForm_C ).toLower();
464 QString searchNormalized = search.simplified().normalized( QString:: NormalizationForm_C ).toLower();
465
466 int candidateLength = candidateNormalized.length();
467 int searchLength = searchNormalized.length();
468 int score = 0;
469
470 // if the candidate or the search term is empty, no other option than 0 score
471 if ( candidateLength == 0 || searchLength == 0 )
472 return score;
473
474 int candidateIdx = 0;
475 int searchIdx = 0;
476 // there is always at least one word
477 int maxScore = FUZZY_SCORE_WORD_MATCH;
478
479 bool isPreviousIndexMatching = false;
480 bool isWordOpen = true;
481
482 // loop through each candidate char and calculate the potential max score
483 while ( candidateIdx < candidateLength )
484 {
485 QChar candidateChar = candidateNormalized[ candidateIdx++ ];
486 bool isCandidateCharWordEnd = candidateChar == ' ' || candidateChar.isPunct();
487
488 // the first char is always the default score
489 if ( candidateIdx == 1 )
490 maxScore += FUZZY_SCORE_NEW_MATCH;
491 // every space or punctuation character is an opportunity for a new word
492 else if ( isCandidateCharWordEnd )
493 maxScore += FUZZY_SCORE_WORD_MATCH;
494 // potentially we can match every other character
495 else
497
498 // we looped through all the characters
499 if ( searchIdx >= searchLength )
500 continue;
501
502 QChar searchChar = searchNormalized[ searchIdx ];
503 bool isSearchCharWordEnd = searchChar == ' ' || searchChar.isPunct();
504
505 // match!
506 if ( candidateChar == searchChar || ( isCandidateCharWordEnd && isSearchCharWordEnd ) )
507 {
508 searchIdx++;
509
510 // if we have just successfully finished a word, give higher score
511 if ( isSearchCharWordEnd )
512 {
513 if ( isWordOpen )
514 score += FUZZY_SCORE_WORD_MATCH;
515 else if ( isPreviousIndexMatching )
517 else
518 score += FUZZY_SCORE_NEW_MATCH;
519
520 isWordOpen = true;
521 }
522 // if we have consecutive characters matching, give higher score
523 else if ( isPreviousIndexMatching )
524 {
526 }
527 // normal score for new independent character that matches
528 else
529 {
530 score += FUZZY_SCORE_NEW_MATCH;
531 }
532
533 isPreviousIndexMatching = true;
534 }
535 // if the current character does NOT match, we are sure we cannot build a word for now
536 else
537 {
538 isPreviousIndexMatching = false;
539 isWordOpen = false;
540 }
541
542 // if the search string is covered, check if the last match is end of word
543 if ( searchIdx >= searchLength )
544 {
545 bool isEndOfWord = ( candidateIdx >= candidateLength )
546 ? true
547 : candidateNormalized[candidateIdx] == ' ' || candidateNormalized[candidateIdx].isPunct();
548
549 if ( isEndOfWord )
550 score += FUZZY_SCORE_WORD_MATCH;
551 }
552
553 // QgsLogger::debug( u"TMP: %1 | %2 | %3 | %4 | %5"_s.arg( candidateChar, searchChar, QString::number(score), QString::number(isCandidateCharWordEnd), QString::number(isSearchCharWordEnd) ) + QStringLiteral( __FILE__ ) );
554 }
555
556 // QgsLogger::debug( u"RES: %1 | %2"_s.arg( QString::number(maxScore), QString::number(score) ) + QStringLiteral( __FILE__ ) );
557 // we didn't loop through all the search chars, it means, that they are not present in the current candidate
558 if ( searchIdx < searchLength )
559 score = 0;
560
561 return static_cast<float>( std::max( score, 0 ) ) / std::max( maxScore, 1 );
562}
563
564
565QString QgsStringUtils::insertLinks( const QString &string, bool *foundLinks )
566{
567 QString converted = string;
568
569 // http://alanstorm.com/url_regex_explained
570 // note - there's more robust implementations available
571 const thread_local QRegularExpression urlRegEx( u"((?:(?:http|https|ftp|file)://[^\\s]+[^\\s,.]+)|(?:\\b(([\\w-]+://?|www[.])[^\\s()<>]+(?:\\([\\w\\d]+\\)|([^!\"#$%&'()*+,\\-./:;<=>?@[\\\\\\]^_`{|}~\\s]|/)))))"_s );
572 const thread_local QRegularExpression protoRegEx( u"^(?:f|ht)tps?://|file://"_s );
573 const thread_local QRegularExpression emailRegEx( u"([\\w._%+-]+@[\\w.-]+\\.[A-Za-z]+)"_s );
574
575 int offset = 0;
576 bool found = false;
577 QRegularExpressionMatch match = urlRegEx.match( converted );
578 while ( match.hasMatch() )
579 {
580 found = true;
581 QString url = match.captured( 1 );
582 QString protoUrl = url;
583 if ( !protoRegEx.match( protoUrl ).hasMatch() )
584 {
585 protoUrl.prepend( "http://" );
586 }
587 QString anchor = u"<a href=\"%1\">%2</a>"_s.arg( protoUrl.toHtmlEscaped(), url.toHtmlEscaped() );
588 converted.replace( match.capturedStart( 1 ), url.length(), anchor );
589 offset = match.capturedStart( 1 ) + anchor.length();
590 match = urlRegEx.match( converted, offset );
591 }
592
593 offset = 0;
594 match = emailRegEx.match( converted );
595 while ( match.hasMatch() )
596 {
597 found = true;
598 QString email = match.captured( 1 );
599 QString anchor = u"<a href=\"mailto:%1\">%1</a>"_s.arg( email.toHtmlEscaped() );
600 converted.replace( match.capturedStart( 1 ), email.length(), anchor );
601 offset = match.capturedStart( 1 ) + anchor.length();
602 match = emailRegEx.match( converted, offset );
603 }
604
605 if ( foundLinks )
606 *foundLinks = found;
607
608 return converted;
609}
610
611bool QgsStringUtils::isUrl( const QString &string )
612{
613 const thread_local QRegularExpression rxUrl( u"^(http|https|ftp|file)://\\S+$"_s );
614 return rxUrl.match( string ).hasMatch();
615}
616
617QString QgsStringUtils::htmlToMarkdown( const QString &html )
618{
619 // Any changes in this function must be copied to qgscrashreport.cpp too
620 QString converted = html;
621 converted.replace( "<br>"_L1, "\n"_L1 );
622 converted.replace( "<b>"_L1, "**"_L1 );
623 converted.replace( "</b>"_L1, "**"_L1 );
624 converted.replace( "<pre>"_L1, "\n```\n"_L1 );
625 converted.replace( "</pre>"_L1, "```\n"_L1 );
626
627 const thread_local QRegularExpression hrefRegEx( u"<a\\s+href\\s*=\\s*([^<>]*)\\s*>([^<>]*)</a>"_s );
628
629 int offset = 0;
630 QRegularExpressionMatch match = hrefRegEx.match( converted );
631 while ( match.hasMatch() )
632 {
633 QString url = match.captured( 1 ).replace( "\""_L1, QString() );
634 url.replace( '\'', QString() );
635 QString name = match.captured( 2 );
636 QString anchor = u"[%1](%2)"_s.arg( name, url );
637 converted.replace( match.capturedStart(), match.capturedLength(), anchor );
638 offset = match.capturedStart() + anchor.length();
639 match = hrefRegEx.match( converted, offset );
640 }
641
642 return converted;
643}
644
645QString QgsStringUtils::wordWrap( const QString &string, const int length, const bool useMaxLineLength, const QString &customDelimiter )
646{
647 if ( string.isEmpty() || length == 0 )
648 return string;
649
650 QString newstr;
651 QRegularExpression rx;
652 int delimiterLength = 0;
653
654 if ( !customDelimiter.isEmpty() )
655 {
656 rx.setPattern( QRegularExpression::escape( customDelimiter ) );
657 delimiterLength = customDelimiter.length();
658 }
659 else
660 {
661 // \x{200B} is a ZERO-WIDTH SPACE, needed for worwrap to support a number of complex scripts (Indic, Arabic, etc.)
662 rx.setPattern( u"[\\x{200B}\\s]"_s );
663 delimiterLength = 1;
664 }
665
666 const QStringList lines = string.split( '\n' );
667 int strLength, strCurrent, strHit, lastHit;
668
669 for ( int i = 0; i < lines.size(); i++ )
670 {
671 const QString line = lines.at( i );
672 strLength = line.length();
673 if ( strLength <= length )
674 {
675 // shortcut, no wrapping required
676 newstr.append( line );
677 if ( i < lines.size() - 1 )
678 newstr.append( '\n' );
679 continue;
680 }
681 strCurrent = 0;
682 strHit = 0;
683 lastHit = 0;
684
685 while ( strCurrent < strLength )
686 {
687 // positive wrap value = desired maximum line width to wrap
688 // negative wrap value = desired minimum line width before wrap
689 if ( useMaxLineLength )
690 {
691 //first try to locate delimiter backwards
692 strHit = ( strCurrent + length >= strLength ) ? -1 : line.lastIndexOf( rx, strCurrent + length );
693 if ( strHit == lastHit || strHit == -1 )
694 {
695 //if no new backward delimiter found, try to locate forward
696 strHit = ( strCurrent + std::abs( length ) >= strLength ) ? -1 : line.indexOf( rx, strCurrent + std::abs( length ) );
697 }
698 lastHit = strHit;
699 }
700 else
701 {
702 strHit = ( strCurrent + std::abs( length ) >= strLength ) ? -1 : line.indexOf( rx, strCurrent + std::abs( length ) );
703 }
704 if ( strHit > -1 )
705 {
706 newstr.append( QStringView {line} .mid( strCurrent, strHit - strCurrent ) );
707 newstr.append( '\n' );
708 strCurrent = strHit + delimiterLength;
709 }
710 else
711 {
712 newstr.append( QStringView {line} .mid( strCurrent ) );
713 strCurrent = strLength;
714 }
715 }
716 if ( i < lines.size() - 1 )
717 newstr.append( '\n' );
718 }
719
720 return newstr;
721}
722
724{
725 string = string.replace( ',', QChar( 65040 ) ).replace( QChar( 8229 ), QChar( 65072 ) ); // comma & two-dot leader
726 string = string.replace( QChar( 12289 ), QChar( 65041 ) ).replace( QChar( 12290 ), QChar( 65042 ) ); // ideographic comma & full stop
727 string = string.replace( ':', QChar( 65043 ) ).replace( ';', QChar( 65044 ) );
728 string = string.replace( '!', QChar( 65045 ) ).replace( '?', QChar( 65046 ) );
729 string = string.replace( QChar( 12310 ), QChar( 65047 ) ).replace( QChar( 12311 ), QChar( 65048 ) ); // white lenticular brackets
730 string = string.replace( QChar( 8230 ), QChar( 65049 ) ); // three-dot ellipse
731 string = string.replace( QChar( 8212 ), QChar( 65073 ) ).replace( QChar( 8211 ), QChar( 65074 ) ); // em & en dash
732 string = string.replace( '_', QChar( 65075 ) ).replace( QChar( 65103 ), QChar( 65076 ) ); // low line & wavy low line
733 string = string.replace( '(', QChar( 65077 ) ).replace( ')', QChar( 65078 ) );
734 string = string.replace( '{', QChar( 65079 ) ).replace( '}', QChar( 65080 ) );
735 string = string.replace( '<', QChar( 65087 ) ).replace( '>', QChar( 65088 ) );
736 string = string.replace( '[', QChar( 65095 ) ).replace( ']', QChar( 65096 ) );
737 string = string.replace( QChar( 12308 ), QChar( 65081 ) ).replace( QChar( 12309 ), QChar( 65082 ) ); // tortoise shell brackets
738 string = string.replace( QChar( 12304 ), QChar( 65083 ) ).replace( QChar( 12305 ), QChar( 65084 ) ); // black lenticular brackets
739 string = string.replace( QChar( 12298 ), QChar( 65085 ) ).replace( QChar( 12299 ), QChar( 65086 ) ); // double angle brackets
740 string = string.replace( QChar( 12300 ), QChar( 65089 ) ).replace( QChar( 12301 ), QChar( 65090 ) ); // corner brackets
741 string = string.replace( QChar( 12302 ), QChar( 65091 ) ).replace( QChar( 12303 ), QChar( 65092 ) ); // white corner brackets
742 return string;
743}
744
745QString QgsStringUtils::qRegExpEscape( const QString &string )
746{
747 // code and logic taken from the Qt source code
748 const QLatin1Char backslash( '\\' );
749 const int count = string.count();
750
751 QString escaped;
752 escaped.reserve( count * 2 );
753 for ( int i = 0; i < count; i++ )
754 {
755 switch ( string.at( i ).toLatin1() )
756 {
757 case '$':
758 case '(':
759 case ')':
760 case '*':
761 case '+':
762 case '.':
763 case '?':
764 case '[':
765 case '\\':
766 case ']':
767 case '^':
768 case '{':
769 case '|':
770 case '}':
771 escaped.append( backslash );
772 }
773 escaped.append( string.at( i ) );
774 }
775 return escaped;
776}
777
778QString QgsStringUtils::truncateMiddleOfString( const QString &string, int maxLength )
779{
780 const int charactersToTruncate = string.length() - maxLength;
781 if ( charactersToTruncate <= 0 )
782 return string;
783
784 // note we actually truncate an extra character, as we'll be replacing it with the ... character
785 const int truncateFrom = string.length() / 2 - ( charactersToTruncate + 1 ) / 2;
786 if ( truncateFrom <= 0 )
787 return QChar( 0x2026 );
788
789 return QStringView( string ).first( truncateFrom ) + QString( QChar( 0x2026 ) ) + QStringView( string ).sliced( truncateFrom + charactersToTruncate + 1 );
790}
791
792bool QgsStringUtils::containsByWord( const QString &candidate, const QString &words, Qt::CaseSensitivity sensitivity )
793{
794 if ( candidate.trimmed().isEmpty() )
795 return false;
796
797 const thread_local QRegularExpression rxWhitespace( u"\\s+"_s );
798 const QStringList parts = words.split( rxWhitespace, Qt::SkipEmptyParts );
799 if ( parts.empty() )
800 return false;
801 for ( const QString &word : parts )
802 {
803 if ( !candidate.contains( word, sensitivity ) )
804 return false;
805 }
806 return true;
807}
808
810 : mMatch( match )
811 , mReplacement( replacement )
812 , mCaseSensitive( caseSensitive )
813 , mWholeWordOnly( wholeWordOnly )
814{
815 if ( mWholeWordOnly )
816 {
817 mRx.setPattern( u"\\b%1\\b"_s.arg( mMatch ) );
818 mRx.setPatternOptions( mCaseSensitive ? QRegularExpression::NoPatternOption : QRegularExpression::CaseInsensitiveOption );
819 }
820}
821
822QString QgsStringReplacement::process( const QString &input ) const
823{
824 QString result = input;
825 if ( !mWholeWordOnly )
826 {
827 return result.replace( mMatch, mReplacement, mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
828 }
829 else
830 {
831 return result.replace( mRx, mReplacement );
832 }
833}
834
836{
837 QgsStringMap map;
838 map.insert( u"match"_s, mMatch );
839 map.insert( u"replace"_s, mReplacement );
840 map.insert( u"caseSensitive"_s, mCaseSensitive ? u"1"_s : u"0"_s );
841 map.insert( u"wholeWord"_s, mWholeWordOnly ? u"1"_s : u"0"_s );
842 return map;
843}
844
846{
847 return QgsStringReplacement( properties.value( u"match"_s ),
848 properties.value( u"replace"_s ),
849 properties.value( u"caseSensitive"_s, u"0"_s ) == "1"_L1,
850 properties.value( u"wholeWord"_s, u"0"_s ) == "1"_L1 );
851}
852
853QString QgsStringReplacementCollection::process( const QString &input ) const
854{
855 QString result = input;
856 for ( const QgsStringReplacement &r : mReplacements )
857 {
858 result = r.process( result );
859 }
860 return result;
861}
862
863void QgsStringReplacementCollection::writeXml( QDomElement &elem, QDomDocument &doc ) const
864{
865 for ( const QgsStringReplacement &r : mReplacements )
866 {
867 QgsStringMap props = r.properties();
868 QDomElement propEl = doc.createElement( u"replacement"_s );
869 QgsStringMap::const_iterator it = props.constBegin();
870 for ( ; it != props.constEnd(); ++it )
871 {
872 propEl.setAttribute( it.key(), it.value() );
873 }
874 elem.appendChild( propEl );
875 }
876}
877
878void QgsStringReplacementCollection::readXml( const QDomElement &elem )
879{
880 mReplacements.clear();
881 QDomNodeList nodelist = elem.elementsByTagName( u"replacement"_s );
882 for ( int i = 0; i < nodelist.count(); i++ )
883 {
884 QDomElement replacementElem = nodelist.at( i ).toElement();
885 QDomNamedNodeMap nodeMap = replacementElem.attributes();
886
887 QgsStringMap props;
888 for ( int j = 0; j < nodeMap.count(); ++j )
889 {
890 props.insert( nodeMap.item( j ).nodeName(), nodeMap.item( j ).nodeValue() );
891 }
892 mReplacements << QgsStringReplacement::fromProperties( props );
893 }
894
895}
Capitalization
String capitalization options.
Definition qgis.h:3448
@ AllSmallCaps
Force all characters to small caps.
Definition qgis.h:3456
@ MixedCase
Mixed case, ie no change.
Definition qgis.h:3449
@ UpperCamelCase
Convert the string to upper camel case. Note that this method does not unaccent characters.
Definition qgis.h:3455
@ AllLowercase
Convert all characters to lowercase.
Definition qgis.h:3451
@ TitleCase
Simple title case conversion - does not fully grammatically parse the text and uses simple rules only...
Definition qgis.h:3454
@ SmallCaps
Mixed case small caps.
Definition qgis.h:3453
@ ForceFirstLetterToCapital
Convert just the first letter of each word to uppercase, leave the rest untouched.
Definition qgis.h:3452
@ AllUppercase
Convert all characters to uppercase.
Definition qgis.h:3450
void readXml(const QDomElement &elem)
Reads the collection state from an XML element.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made using QgsStringR...
void writeXml(QDomElement &elem, QDomDocument &doc) const
Writes the collection state to an XML element.
A representation of a single string replacement.
static QgsStringReplacement fromProperties(const QgsStringMap &properties)
Creates a new QgsStringReplacement from an encoded properties map.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made.
bool wholeWordOnly() const
Returns true if match only applies to whole words, or false if partial word matches are permitted.
QString replacement() const
Returns the string to replace matches with.
bool caseSensitive() const
Returns true if match is case sensitive.
QgsStringReplacement(const QString &match, const QString &replacement, bool caseSensitive=false, bool wholeWordOnly=false)
Constructor for QgsStringReplacement.
QString match() const
Returns the string matched by this object.
QgsStringMap properties() const
Returns a map of the replacement properties.
static int hammingDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Hamming distance between two strings.
static QString soundex(const QString &string)
Returns the Soundex representation of a string.
static QHash< QString, QString > UNACCENT_MAP
Lookup table used by unaccent().
static int levenshteinDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Levenshtein edit distance between two strings.
static QString htmlToMarkdown(const QString &html)
Convert simple HTML to markdown.
static QString longestCommonSubstring(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the longest common substring between two strings.
static QString capitalize(const QString &string, Qgis::Capitalization capitalization)
Converts a string by applying capitalization rules to the string.
static QString substituteVerticalCharacters(QString string)
Returns a string with characters having vertical representation form substituted.
static QString unaccent(const QString &input)
Removes accents and other diacritical marks from a string, replacing accented characters with their u...
static bool containsByWord(const QString &candidate, const QString &words, Qt::CaseSensitivity sensitivity=Qt::CaseInsensitive)
Given a candidate string, returns true if the candidate contains all the individual words from anothe...
static QString insertLinks(const QString &string, bool *foundLinks=nullptr)
Returns a string with any URL (e.g., http(s)/ftp) and mailto: text converted to valid HTML <a ....
static double fuzzyScore(const QString &candidate, const QString &search)
Tests a candidate string to see how likely it is a match for a specified search string.
static QString qRegExpEscape(const QString &string)
Returns an escaped string matching the behavior of QRegExp::escape.
static QString ampersandEncode(const QString &string)
Makes a raw string safe for inclusion as a HTML/XML string literal.
static QString wordWrap(const QString &string, int length, bool useMaxLineLength=true, const QString &customDelimiter=QString())
Automatically wraps a string by inserting new line characters at appropriate locations in the string.
static bool isUrl(const QString &string)
Returns whether the string is a URL (http,https,ftp,file).
static QString truncateMiddleOfString(const QString &string, int maxLength)
Truncates a string to the specified maximum character length.
static QHash< QString, QString > createUnaccentMap()
Generates the unaccent mapping table (auto-generated by script at build time).
As part of the API refactoring and improvements which landed in QGIS 3.0, the Processing API was substantially reworked from the 2.x version. This was done in order to allow much of the underlying Processing framework to be ported into C++.
QMap< QString, QString > QgsStringMap
Definition qgis.h:7413
#define FUZZY_SCORE_CONSECUTIVE_MATCH
#define FUZZY_SCORE_WORD_MATCH
#define FUZZY_SCORE_NEW_MATCH