QGIS API Documentation 3.99.0-Master (a8882ad4560)
Loading...
Searching...
No Matches
qgsstringutils.cpp
Go to the documentation of this file.
1/***************************************************************************
2 qgsstringutils.cpp
3 ------------------
4 begin : June 2015
5 copyright : (C) 2015 by Nyall Dawson
6 email : nyall dot dawson at gmail dot com
7 ***************************************************************************
8 * *
9 * This program is free software; you can redistribute it and/or modify *
10 * it under the terms of the GNU General Public License as published by *
11 * the Free Software Foundation; either version 2 of the License, or *
12 * (at your option) any later version. *
13 * *
14 ***************************************************************************/
15
16#include "qgsstringutils.h"
17
18#include <cstdlib>
19
20#include "qgslogger.h"
21
22#include <QRegularExpression>
23#include <QStringList>
24#include <QTextBoundaryFinder>
25#include <QVector>
26
28
29QString QgsStringUtils::unaccent( const QString &input )
30{
31 // Normalize input to NFC so that Unicode characters composed of base +
32 // combining marks are converted to their canonical composed form.
33 // This ensures lookups match the keys in UNACCENT_MAP, which are stored
34 // in NFC (e.g. "e" + U+0301 becomes "é", as PostgreSQL does it.).
35 const QString in = input.normalized( QString::NormalizationForm_C );
36 QString out;
37 out.reserve( in.size() );
38
39 qsizetype i = 0;
40 const qsizetype n = in.size();
41
42 while ( i < n )
43 {
44 const QChar c = in.at( i );
45 int len = 1;
46
47 // Detect surrogate pair (non-BMP)
48 if ( c.isHighSurrogate() && i + 1 < n )
49 {
50 const QChar c2 = in.at( i + 1 );
51 if ( c2.isLowSurrogate() )
52 len = 2;
53 }
54
55 const QString key = in.mid( i, len ).normalized( QString::NormalizationForm_C );
56
57 auto it = UNACCENT_MAP.constFind( key );
58 if ( it != UNACCENT_MAP.constEnd() )
59 out.append( it.value() );
60 else
61 out.append( key );
62
63 i += len;
64 }
65
66 return out;
67}
68
69QString QgsStringUtils::capitalize( const QString &string, Qgis::Capitalization capitalization )
70{
71 if ( string.isEmpty() )
72 return QString();
73
74 switch ( capitalization )
75 {
78 return string;
79
81 return string.toUpper();
82
85 return string.toLower();
86
88 {
89 QString temp = string;
90
91 QTextBoundaryFinder wordSplitter( QTextBoundaryFinder::Word, string.constData(), string.length(), nullptr, 0 );
92 QTextBoundaryFinder letterSplitter( QTextBoundaryFinder::Grapheme, string.constData(), string.length(), nullptr, 0 );
93
94 wordSplitter.setPosition( 0 );
95 bool first = true;
96 while ( ( first && wordSplitter.boundaryReasons() & QTextBoundaryFinder::StartOfItem )
97 || wordSplitter.toNextBoundary() >= 0 )
98 {
99 first = false;
100 letterSplitter.setPosition( wordSplitter.position() );
101 ( void )letterSplitter.toNextBoundary();
102 QString substr = string.mid( wordSplitter.position(), letterSplitter.position() - wordSplitter.position() );
103 temp.replace( wordSplitter.position(), substr.length(), substr.toUpper() );
104 }
105 return temp;
106 }
107
109 {
110 // yes, this is MASSIVELY simplifying the problem!!
111
112 static QStringList smallWords;
113 static QStringList newPhraseSeparators;
114 static QRegularExpression splitWords;
115 if ( smallWords.empty() )
116 {
117 smallWords = QObject::tr( "a|an|and|as|at|but|by|en|for|if|in|nor|of|on|or|per|s|the|to|vs.|vs|via" ).split( '|' );
118 newPhraseSeparators = QObject::tr( ".|:" ).split( '|' );
119 splitWords = QRegularExpression( u"\\b"_s, QRegularExpression::UseUnicodePropertiesOption );
120 }
121
122 const bool allSameCase = string.toLower() == string || string.toUpper() == string;
123 const QStringList parts = ( allSameCase ? string.toLower() : string ).split( splitWords, Qt::SkipEmptyParts );
124 QString result;
125 bool firstWord = true;
126 int i = 0;
127 int lastWord = parts.count() - 1;
128 for ( const QString &word : std::as_const( parts ) )
129 {
130 if ( newPhraseSeparators.contains( word.trimmed() ) )
131 {
132 firstWord = true;
133 result += word;
134 }
135 else if ( firstWord || ( i == lastWord ) || !smallWords.contains( word ) )
136 {
137 result += word.at( 0 ).toUpper() + word.mid( 1 );
138 firstWord = false;
139 }
140 else
141 {
142 result += word;
143 }
144 i++;
145 }
146 return result;
147 }
148
150 QString result = QgsStringUtils::capitalize( string.toLower(), Qgis::Capitalization::ForceFirstLetterToCapital ).simplified();
151 result.remove( ' ' );
152 return result;
153 }
154 // no warnings
155 return string;
156}
157
158// original code from http://www.qtcentre.org/threads/52456-HTML-Unicode-ampersand-encoding
159QString QgsStringUtils::ampersandEncode( const QString &string )
160{
161 QString encoded;
162 for ( int i = 0; i < string.size(); ++i )
163 {
164 QChar ch = string.at( i );
165 if ( ch.unicode() > 160 )
166 encoded += u"&#%1;"_s.arg( static_cast< int >( ch.unicode() ) );
167 else if ( ch.unicode() == 38 )
168 encoded += "&amp;"_L1;
169 else if ( ch.unicode() == 60 )
170 encoded += "&lt;"_L1;
171 else if ( ch.unicode() == 62 )
172 encoded += "&gt;"_L1;
173 else
174 encoded += ch;
175 }
176 return encoded;
177}
178
179int QgsStringUtils::levenshteinDistance( const QString &string1, const QString &string2, bool caseSensitive )
180{
181 int length1 = string1.length();
182 int length2 = string2.length();
183
184 //empty strings? solution is trivial...
185 if ( string1.isEmpty() )
186 {
187 return length2;
188 }
189 else if ( string2.isEmpty() )
190 {
191 return length1;
192 }
193
194 //handle case sensitive flag (or not)
195 QString s1( caseSensitive ? string1 : string1.toLower() );
196 QString s2( caseSensitive ? string2 : string2.toLower() );
197
198 const QChar *s1Char = s1.constData();
199 const QChar *s2Char = s2.constData();
200
201 //strip out any common prefix
202 int commonPrefixLen = 0;
203 while ( length1 > 0 && length2 > 0 && *s1Char == *s2Char )
204 {
205 commonPrefixLen++;
206 length1--;
207 length2--;
208 s1Char++;
209 s2Char++;
210 }
211
212 //strip out any common suffix
213 while ( length1 > 0 && length2 > 0 && s1.at( commonPrefixLen + length1 - 1 ) == s2.at( commonPrefixLen + length2 - 1 ) )
214 {
215 length1--;
216 length2--;
217 }
218
219 //fully checked either string? if so, the answer is easy...
220 if ( length1 == 0 )
221 {
222 return length2;
223 }
224 else if ( length2 == 0 )
225 {
226 return length1;
227 }
228
229 //ensure the inner loop is longer
230 if ( length1 > length2 )
231 {
232 std::swap( s1, s2 );
233 std::swap( length1, length2 );
234 }
235
236 //levenshtein algorithm begins here
237 std::vector< int > col( length2 + 1, 0 );
238 std::vector< int > prevCol;
239 prevCol.reserve( length2 + 1 );
240 for ( int i = 0; i < length2 + 1; ++i )
241 {
242 prevCol.emplace_back( i );
243 }
244 const QChar *s2start = s2Char;
245 for ( int i = 0; i < length1; ++i )
246 {
247 col[0] = i + 1;
248 s2Char = s2start;
249 for ( int j = 0; j < length2; ++j )
250 {
251 col[j + 1] = std::min( std::min( 1 + col[j], 1 + prevCol[1 + j] ), prevCol[j] + ( ( *s1Char == *s2Char ) ? 0 : 1 ) );
252 s2Char++;
253 }
254 col.swap( prevCol );
255 s1Char++;
256 }
257 return prevCol[length2];
258}
259
260QString QgsStringUtils::longestCommonSubstring( const QString &string1, const QString &string2, bool caseSensitive )
261{
262 if ( string1.isEmpty() || string2.isEmpty() )
263 {
264 //empty strings, solution is trivial...
265 return QString();
266 }
267
268 //handle case sensitive flag (or not)
269 QString s1( caseSensitive ? string1 : string1.toLower() );
270 QString s2( caseSensitive ? string2 : string2.toLower() );
271
272 if ( s1 == s2 )
273 {
274 //another trivial case, identical strings
275 return s1;
276 }
277
278 int *currentScores = new int [ s2.length()];
279 int *previousScores = new int [ s2.length()];
280 int maxCommonLength = 0;
281 int lastMaxBeginIndex = 0;
282
283 const QChar *s1Char = s1.constData();
284 const QChar *s2Char = s2.constData();
285 const QChar *s2Start = s2Char;
286
287 for ( int i = 0; i < s1.length(); ++i )
288 {
289 for ( int j = 0; j < s2.length(); ++j )
290 {
291 if ( *s1Char != *s2Char )
292 {
293 currentScores[j] = 0;
294 }
295 else
296 {
297 if ( i == 0 || j == 0 )
298 {
299 currentScores[j] = 1;
300 }
301 else
302 {
303 currentScores[j] = 1 + previousScores[j - 1];
304 }
305
306 if ( maxCommonLength < currentScores[j] )
307 {
308 maxCommonLength = currentScores[j];
309 lastMaxBeginIndex = i;
310 }
311 }
312 s2Char++;
313 }
314 std::swap( currentScores, previousScores );
315 s1Char++;
316 s2Char = s2Start;
317 }
318 delete [] currentScores;
319 delete [] previousScores;
320 return string1.mid( lastMaxBeginIndex - maxCommonLength + 1, maxCommonLength );
321}
322
323int QgsStringUtils::hammingDistance( const QString &string1, const QString &string2, bool caseSensitive )
324{
325 if ( string1.isEmpty() && string2.isEmpty() )
326 {
327 //empty strings, solution is trivial...
328 return 0;
329 }
330
331 if ( string1.length() != string2.length() )
332 {
333 //invalid inputs
334 return -1;
335 }
336
337 //handle case sensitive flag (or not)
338 QString s1( caseSensitive ? string1 : string1.toLower() );
339 QString s2( caseSensitive ? string2 : string2.toLower() );
340
341 if ( s1 == s2 )
342 {
343 //another trivial case, identical strings
344 return 0;
345 }
346
347 int distance = 0;
348 const QChar *s1Char = s1.constData();
349 const QChar *s2Char = s2.constData();
350
351 for ( int i = 0; i < string1.length(); ++i )
352 {
353 if ( *s1Char != *s2Char )
354 distance++;
355 s1Char++;
356 s2Char++;
357 }
358
359 return distance;
360}
361
362QString QgsStringUtils::soundex( const QString &string )
363{
364 if ( string.isEmpty() )
365 return QString();
366
367 QString tmp = string.toUpper();
368
369 //strip non character codes, and vowel like characters after the first character
370 QChar *char1 = tmp.data();
371 QChar *char2 = tmp.data();
372 int outLen = 0;
373 for ( int i = 0; i < tmp.length(); ++i, ++char2 )
374 {
375 if ( ( *char2 ).unicode() >= 0x41 && ( *char2 ).unicode() <= 0x5A && ( i == 0 || ( ( *char2 ).unicode() != 0x41 && ( *char2 ).unicode() != 0x45
376 && ( *char2 ).unicode() != 0x48 && ( *char2 ).unicode() != 0x49
377 && ( *char2 ).unicode() != 0x4F && ( *char2 ).unicode() != 0x55
378 && ( *char2 ).unicode() != 0x57 && ( *char2 ).unicode() != 0x59 ) ) )
379 {
380 *char1 = *char2;
381 char1++;
382 outLen++;
383 }
384 }
385 tmp.truncate( outLen );
386
387 QChar *tmpChar = tmp.data();
388 tmpChar++;
389 for ( int i = 1; i < tmp.length(); ++i, ++tmpChar )
390 {
391 switch ( ( *tmpChar ).unicode() )
392 {
393 case 0x42:
394 case 0x46:
395 case 0x50:
396 case 0x56:
397 tmp.replace( i, 1, QChar( 0x31 ) );
398 break;
399
400 case 0x43:
401 case 0x47:
402 case 0x4A:
403 case 0x4B:
404 case 0x51:
405 case 0x53:
406 case 0x58:
407 case 0x5A:
408 tmp.replace( i, 1, QChar( 0x32 ) );
409 break;
410
411 case 0x44:
412 case 0x54:
413 tmp.replace( i, 1, QChar( 0x33 ) );
414 break;
415
416 case 0x4C:
417 tmp.replace( i, 1, QChar( 0x34 ) );
418 break;
419
420 case 0x4D:
421 case 0x4E:
422 tmp.replace( i, 1, QChar( 0x35 ) );
423 break;
424
425 case 0x52:
426 tmp.replace( i, 1, QChar( 0x36 ) );
427 break;
428 }
429 }
430
431 //remove adjacent duplicates
432 char1 = tmp.data();
433 char2 = tmp.data();
434 char2++;
435 outLen = 1;
436 for ( int i = 1; i < tmp.length(); ++i, ++char2 )
437 {
438 if ( *char2 != *char1 )
439 {
440 char1++;
441 *char1 = *char2;
442 outLen++;
443 if ( outLen == 4 )
444 break;
445 }
446 }
447 tmp.truncate( outLen );
448 if ( tmp.length() < 4 )
449 {
450 tmp.append( "000" );
451 tmp.truncate( 4 );
452 }
453
454 return tmp;
455}
456
457
458double QgsStringUtils::fuzzyScore( const QString &candidate, const QString &search )
459{
460 QString candidateNormalized = candidate.simplified().normalized( QString:: NormalizationForm_C ).toLower();
461 QString searchNormalized = search.simplified().normalized( QString:: NormalizationForm_C ).toLower();
462
463 int candidateLength = candidateNormalized.length();
464 int searchLength = searchNormalized.length();
465 int score = 0;
466
467 // if the candidate or the search term is empty, no other option than 0 score
468 if ( candidateLength == 0 || searchLength == 0 )
469 return score;
470
471 int candidateIdx = 0;
472 int searchIdx = 0;
473 // there is always at least one word
474 int maxScore = FUZZY_SCORE_WORD_MATCH;
475
476 bool isPreviousIndexMatching = false;
477 bool isWordOpen = true;
478
479 // loop through each candidate char and calculate the potential max score
480 while ( candidateIdx < candidateLength )
481 {
482 QChar candidateChar = candidateNormalized[ candidateIdx++ ];
483 bool isCandidateCharWordEnd = candidateChar == ' ' || candidateChar.isPunct();
484
485 // the first char is always the default score
486 if ( candidateIdx == 1 )
487 maxScore += FUZZY_SCORE_NEW_MATCH;
488 // every space or punctuation character is an opportunity for a new word
489 else if ( isCandidateCharWordEnd )
490 maxScore += FUZZY_SCORE_WORD_MATCH;
491 // potentially we can match every other character
492 else
494
495 // we looped through all the characters
496 if ( searchIdx >= searchLength )
497 continue;
498
499 QChar searchChar = searchNormalized[ searchIdx ];
500 bool isSearchCharWordEnd = searchChar == ' ' || searchChar.isPunct();
501
502 // match!
503 if ( candidateChar == searchChar || ( isCandidateCharWordEnd && isSearchCharWordEnd ) )
504 {
505 searchIdx++;
506
507 // if we have just successfully finished a word, give higher score
508 if ( isSearchCharWordEnd )
509 {
510 if ( isWordOpen )
511 score += FUZZY_SCORE_WORD_MATCH;
512 else if ( isPreviousIndexMatching )
514 else
515 score += FUZZY_SCORE_NEW_MATCH;
516
517 isWordOpen = true;
518 }
519 // if we have consecutive characters matching, give higher score
520 else if ( isPreviousIndexMatching )
521 {
523 }
524 // normal score for new independent character that matches
525 else
526 {
527 score += FUZZY_SCORE_NEW_MATCH;
528 }
529
530 isPreviousIndexMatching = true;
531 }
532 // if the current character does NOT match, we are sure we cannot build a word for now
533 else
534 {
535 isPreviousIndexMatching = false;
536 isWordOpen = false;
537 }
538
539 // if the search string is covered, check if the last match is end of word
540 if ( searchIdx >= searchLength )
541 {
542 bool isEndOfWord = ( candidateIdx >= candidateLength )
543 ? true
544 : candidateNormalized[candidateIdx] == ' ' || candidateNormalized[candidateIdx].isPunct();
545
546 if ( isEndOfWord )
547 score += FUZZY_SCORE_WORD_MATCH;
548 }
549
550 // QgsLogger::debug( u"TMP: %1 | %2 | %3 | %4 | %5"_s.arg( candidateChar, searchChar, QString::number(score), QString::number(isCandidateCharWordEnd), QString::number(isSearchCharWordEnd) ) + QStringLiteral( __FILE__ ) );
551 }
552
553 // QgsLogger::debug( u"RES: %1 | %2"_s.arg( QString::number(maxScore), QString::number(score) ) + QStringLiteral( __FILE__ ) );
554 // we didn't loop through all the search chars, it means, that they are not present in the current candidate
555 if ( searchIdx < searchLength )
556 score = 0;
557
558 return static_cast<float>( std::max( score, 0 ) ) / std::max( maxScore, 1 );
559}
560
561
562QString QgsStringUtils::insertLinks( const QString &string, bool *foundLinks )
563{
564 QString converted = string;
565
566 // http://alanstorm.com/url_regex_explained
567 // note - there's more robust implementations available
568 const thread_local QRegularExpression urlRegEx( u"((?:(?:http|https|ftp|file)://[^\\s]+[^\\s,.]+)|(?:\\b(([\\w-]+://?|www[.])[^\\s()<>]+(?:\\([\\w\\d]+\\)|([^!\"#$%&'()*+,\\-./:;<=>?@[\\\\\\]^_`{|}~\\s]|/)))))"_s );
569 const thread_local QRegularExpression protoRegEx( u"^(?:f|ht)tps?://|file://"_s );
570 const thread_local QRegularExpression emailRegEx( u"([\\w._%+-]+@[\\w.-]+\\.[A-Za-z]+)"_s );
571
572 int offset = 0;
573 bool found = false;
574 QRegularExpressionMatch match = urlRegEx.match( converted );
575 while ( match.hasMatch() )
576 {
577 found = true;
578 QString url = match.captured( 1 );
579 QString protoUrl = url;
580 if ( !protoRegEx.match( protoUrl ).hasMatch() )
581 {
582 protoUrl.prepend( "http://" );
583 }
584 QString anchor = u"<a href=\"%1\">%2</a>"_s.arg( protoUrl.toHtmlEscaped(), url.toHtmlEscaped() );
585 converted.replace( match.capturedStart( 1 ), url.length(), anchor );
586 offset = match.capturedStart( 1 ) + anchor.length();
587 match = urlRegEx.match( converted, offset );
588 }
589
590 offset = 0;
591 match = emailRegEx.match( converted );
592 while ( match.hasMatch() )
593 {
594 found = true;
595 QString email = match.captured( 1 );
596 QString anchor = u"<a href=\"mailto:%1\">%1</a>"_s.arg( email.toHtmlEscaped() );
597 converted.replace( match.capturedStart( 1 ), email.length(), anchor );
598 offset = match.capturedStart( 1 ) + anchor.length();
599 match = emailRegEx.match( converted, offset );
600 }
601
602 if ( foundLinks )
603 *foundLinks = found;
604
605 return converted;
606}
607
608bool QgsStringUtils::isUrl( const QString &string )
609{
610 const thread_local QRegularExpression rxUrl( u"^(http|https|ftp|file)://\\S+$"_s );
611 return rxUrl.match( string ).hasMatch();
612}
613
614QString QgsStringUtils::htmlToMarkdown( const QString &html )
615{
616 // Any changes in this function must be copied to qgscrashreport.cpp too
617 QString converted = html;
618 converted.replace( "<br>"_L1, "\n"_L1 );
619 converted.replace( "<b>"_L1, "**"_L1 );
620 converted.replace( "</b>"_L1, "**"_L1 );
621 converted.replace( "<pre>"_L1, "\n```\n"_L1 );
622 converted.replace( "</pre>"_L1, "```\n"_L1 );
623
624 const thread_local QRegularExpression hrefRegEx( u"<a\\s+href\\s*=\\s*([^<>]*)\\s*>([^<>]*)</a>"_s );
625
626 int offset = 0;
627 QRegularExpressionMatch match = hrefRegEx.match( converted );
628 while ( match.hasMatch() )
629 {
630 QString url = match.captured( 1 ).replace( "\""_L1, QString() );
631 url.replace( '\'', QString() );
632 QString name = match.captured( 2 );
633 QString anchor = u"[%1](%2)"_s.arg( name, url );
634 converted.replace( match.capturedStart(), match.capturedLength(), anchor );
635 offset = match.capturedStart() + anchor.length();
636 match = hrefRegEx.match( converted, offset );
637 }
638
639 return converted;
640}
641
642QString QgsStringUtils::wordWrap( const QString &string, const int length, const bool useMaxLineLength, const QString &customDelimiter )
643{
644 if ( string.isEmpty() || length == 0 )
645 return string;
646
647 QString newstr;
648 QRegularExpression rx;
649 int delimiterLength = 0;
650
651 if ( !customDelimiter.isEmpty() )
652 {
653 rx.setPattern( QRegularExpression::escape( customDelimiter ) );
654 delimiterLength = customDelimiter.length();
655 }
656 else
657 {
658 // \x{200B} is a ZERO-WIDTH SPACE, needed for worwrap to support a number of complex scripts (Indic, Arabic, etc.)
659 rx.setPattern( u"[\\x{200B}\\s]"_s );
660 delimiterLength = 1;
661 }
662
663 const QStringList lines = string.split( '\n' );
664 int strLength, strCurrent, strHit, lastHit;
665
666 for ( int i = 0; i < lines.size(); i++ )
667 {
668 const QString line = lines.at( i );
669 strLength = line.length();
670 if ( strLength <= length )
671 {
672 // shortcut, no wrapping required
673 newstr.append( line );
674 if ( i < lines.size() - 1 )
675 newstr.append( '\n' );
676 continue;
677 }
678 strCurrent = 0;
679 strHit = 0;
680 lastHit = 0;
681
682 while ( strCurrent < strLength )
683 {
684 // positive wrap value = desired maximum line width to wrap
685 // negative wrap value = desired minimum line width before wrap
686 if ( useMaxLineLength )
687 {
688 //first try to locate delimiter backwards
689 strHit = ( strCurrent + length >= strLength ) ? -1 : line.lastIndexOf( rx, strCurrent + length );
690 if ( strHit == lastHit || strHit == -1 )
691 {
692 //if no new backward delimiter found, try to locate forward
693 strHit = ( strCurrent + std::abs( length ) >= strLength ) ? -1 : line.indexOf( rx, strCurrent + std::abs( length ) );
694 }
695 lastHit = strHit;
696 }
697 else
698 {
699 strHit = ( strCurrent + std::abs( length ) >= strLength ) ? -1 : line.indexOf( rx, strCurrent + std::abs( length ) );
700 }
701 if ( strHit > -1 )
702 {
703 newstr.append( QStringView {line} .mid( strCurrent, strHit - strCurrent ) );
704 newstr.append( '\n' );
705 strCurrent = strHit + delimiterLength;
706 }
707 else
708 {
709 newstr.append( QStringView {line} .mid( strCurrent ) );
710 strCurrent = strLength;
711 }
712 }
713 if ( i < lines.size() - 1 )
714 newstr.append( '\n' );
715 }
716
717 return newstr;
718}
719
721{
722 string = string.replace( ',', QChar( 65040 ) ).replace( QChar( 8229 ), QChar( 65072 ) ); // comma & two-dot leader
723 string = string.replace( QChar( 12289 ), QChar( 65041 ) ).replace( QChar( 12290 ), QChar( 65042 ) ); // ideographic comma & full stop
724 string = string.replace( ':', QChar( 65043 ) ).replace( ';', QChar( 65044 ) );
725 string = string.replace( '!', QChar( 65045 ) ).replace( '?', QChar( 65046 ) );
726 string = string.replace( QChar( 12310 ), QChar( 65047 ) ).replace( QChar( 12311 ), QChar( 65048 ) ); // white lenticular brackets
727 string = string.replace( QChar( 8230 ), QChar( 65049 ) ); // three-dot ellipse
728 string = string.replace( QChar( 8212 ), QChar( 65073 ) ).replace( QChar( 8211 ), QChar( 65074 ) ); // em & en dash
729 string = string.replace( '_', QChar( 65075 ) ).replace( QChar( 65103 ), QChar( 65076 ) ); // low line & wavy low line
730 string = string.replace( '(', QChar( 65077 ) ).replace( ')', QChar( 65078 ) );
731 string = string.replace( '{', QChar( 65079 ) ).replace( '}', QChar( 65080 ) );
732 string = string.replace( '<', QChar( 65087 ) ).replace( '>', QChar( 65088 ) );
733 string = string.replace( '[', QChar( 65095 ) ).replace( ']', QChar( 65096 ) );
734 string = string.replace( QChar( 12308 ), QChar( 65081 ) ).replace( QChar( 12309 ), QChar( 65082 ) ); // tortoise shell brackets
735 string = string.replace( QChar( 12304 ), QChar( 65083 ) ).replace( QChar( 12305 ), QChar( 65084 ) ); // black lenticular brackets
736 string = string.replace( QChar( 12298 ), QChar( 65085 ) ).replace( QChar( 12299 ), QChar( 65086 ) ); // double angle brackets
737 string = string.replace( QChar( 12300 ), QChar( 65089 ) ).replace( QChar( 12301 ), QChar( 65090 ) ); // corner brackets
738 string = string.replace( QChar( 12302 ), QChar( 65091 ) ).replace( QChar( 12303 ), QChar( 65092 ) ); // white corner brackets
739 return string;
740}
741
742QString QgsStringUtils::qRegExpEscape( const QString &string )
743{
744 // code and logic taken from the Qt source code
745 const QLatin1Char backslash( '\\' );
746 const int count = string.count();
747
748 QString escaped;
749 escaped.reserve( count * 2 );
750 for ( int i = 0; i < count; i++ )
751 {
752 switch ( string.at( i ).toLatin1() )
753 {
754 case '$':
755 case '(':
756 case ')':
757 case '*':
758 case '+':
759 case '.':
760 case '?':
761 case '[':
762 case '\\':
763 case ']':
764 case '^':
765 case '{':
766 case '|':
767 case '}':
768 escaped.append( backslash );
769 }
770 escaped.append( string.at( i ) );
771 }
772 return escaped;
773}
774
775QString QgsStringUtils::truncateMiddleOfString( const QString &string, int maxLength )
776{
777 const int charactersToTruncate = string.length() - maxLength;
778 if ( charactersToTruncate <= 0 )
779 return string;
780
781 // note we actually truncate an extra character, as we'll be replacing it with the ... character
782 const int truncateFrom = string.length() / 2 - ( charactersToTruncate + 1 ) / 2;
783 if ( truncateFrom <= 0 )
784 return QChar( 0x2026 );
785
786 return QStringView( string ).first( truncateFrom ) + QString( QChar( 0x2026 ) ) + QStringView( string ).sliced( truncateFrom + charactersToTruncate + 1 );
787}
788
789bool QgsStringUtils::containsByWord( const QString &candidate, const QString &words, Qt::CaseSensitivity sensitivity )
790{
791 if ( candidate.trimmed().isEmpty() )
792 return false;
793
794 const thread_local QRegularExpression rxWhitespace( u"\\s+"_s );
795 const QStringList parts = words.split( rxWhitespace, Qt::SkipEmptyParts );
796 if ( parts.empty() )
797 return false;
798 for ( const QString &word : parts )
799 {
800 if ( !candidate.contains( word, sensitivity ) )
801 return false;
802 }
803 return true;
804}
805
807 : mMatch( match )
808 , mReplacement( replacement )
809 , mCaseSensitive( caseSensitive )
810 , mWholeWordOnly( wholeWordOnly )
811{
812 if ( mWholeWordOnly )
813 {
814 mRx.setPattern( u"\\b%1\\b"_s.arg( mMatch ) );
815 mRx.setPatternOptions( mCaseSensitive ? QRegularExpression::NoPatternOption : QRegularExpression::CaseInsensitiveOption );
816 }
817}
818
819QString QgsStringReplacement::process( const QString &input ) const
820{
821 QString result = input;
822 if ( !mWholeWordOnly )
823 {
824 return result.replace( mMatch, mReplacement, mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
825 }
826 else
827 {
828 return result.replace( mRx, mReplacement );
829 }
830}
831
833{
834 QgsStringMap map;
835 map.insert( u"match"_s, mMatch );
836 map.insert( u"replace"_s, mReplacement );
837 map.insert( u"caseSensitive"_s, mCaseSensitive ? u"1"_s : u"0"_s );
838 map.insert( u"wholeWord"_s, mWholeWordOnly ? u"1"_s : u"0"_s );
839 return map;
840}
841
843{
844 return QgsStringReplacement( properties.value( u"match"_s ),
845 properties.value( u"replace"_s ),
846 properties.value( u"caseSensitive"_s, u"0"_s ) == "1"_L1,
847 properties.value( u"wholeWord"_s, u"0"_s ) == "1"_L1 );
848}
849
850QString QgsStringReplacementCollection::process( const QString &input ) const
851{
852 QString result = input;
853 for ( const QgsStringReplacement &r : mReplacements )
854 {
855 result = r.process( result );
856 }
857 return result;
858}
859
860void QgsStringReplacementCollection::writeXml( QDomElement &elem, QDomDocument &doc ) const
861{
862 for ( const QgsStringReplacement &r : mReplacements )
863 {
864 QgsStringMap props = r.properties();
865 QDomElement propEl = doc.createElement( u"replacement"_s );
866 QgsStringMap::const_iterator it = props.constBegin();
867 for ( ; it != props.constEnd(); ++it )
868 {
869 propEl.setAttribute( it.key(), it.value() );
870 }
871 elem.appendChild( propEl );
872 }
873}
874
875void QgsStringReplacementCollection::readXml( const QDomElement &elem )
876{
877 mReplacements.clear();
878 QDomNodeList nodelist = elem.elementsByTagName( u"replacement"_s );
879 for ( int i = 0; i < nodelist.count(); i++ )
880 {
881 QDomElement replacementElem = nodelist.at( i ).toElement();
882 QDomNamedNodeMap nodeMap = replacementElem.attributes();
883
884 QgsStringMap props;
885 for ( int j = 0; j < nodeMap.count(); ++j )
886 {
887 props.insert( nodeMap.item( j ).nodeName(), nodeMap.item( j ).nodeValue() );
888 }
889 mReplacements << QgsStringReplacement::fromProperties( props );
890 }
891
892}
Capitalization
String capitalization options.
Definition qgis.h:3436
@ AllSmallCaps
Force all characters to small caps.
Definition qgis.h:3444
@ MixedCase
Mixed case, ie no change.
Definition qgis.h:3437
@ UpperCamelCase
Convert the string to upper camel case. Note that this method does not unaccent characters.
Definition qgis.h:3443
@ AllLowercase
Convert all characters to lowercase.
Definition qgis.h:3439
@ TitleCase
Simple title case conversion - does not fully grammatically parse the text and uses simple rules only...
Definition qgis.h:3442
@ SmallCaps
Mixed case small caps.
Definition qgis.h:3441
@ ForceFirstLetterToCapital
Convert just the first letter of each word to uppercase, leave the rest untouched.
Definition qgis.h:3440
@ AllUppercase
Convert all characters to uppercase.
Definition qgis.h:3438
void readXml(const QDomElement &elem)
Reads the collection state from an XML element.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made using QgsStringR...
void writeXml(QDomElement &elem, QDomDocument &doc) const
Writes the collection state to an XML element.
A representation of a single string replacement.
static QgsStringReplacement fromProperties(const QgsStringMap &properties)
Creates a new QgsStringReplacement from an encoded properties map.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made.
bool wholeWordOnly() const
Returns true if match only applies to whole words, or false if partial word matches are permitted.
QString replacement() const
Returns the string to replace matches with.
bool caseSensitive() const
Returns true if match is case sensitive.
QgsStringReplacement(const QString &match, const QString &replacement, bool caseSensitive=false, bool wholeWordOnly=false)
Constructor for QgsStringReplacement.
QString match() const
Returns the string matched by this object.
QgsStringMap properties() const
Returns a map of the replacement properties.
static int hammingDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Hamming distance between two strings.
static QString soundex(const QString &string)
Returns the Soundex representation of a string.
static QHash< QString, QString > UNACCENT_MAP
Lookup table used by unaccent().
static int levenshteinDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Levenshtein edit distance between two strings.
static QString htmlToMarkdown(const QString &html)
Convert simple HTML to markdown.
static QString longestCommonSubstring(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the longest common substring between two strings.
static QString capitalize(const QString &string, Qgis::Capitalization capitalization)
Converts a string by applying capitalization rules to the string.
static QString substituteVerticalCharacters(QString string)
Returns a string with characters having vertical representation form substituted.
static QString unaccent(const QString &input)
Removes accents and other diacritical marks from a string, replacing accented characters with their u...
static bool containsByWord(const QString &candidate, const QString &words, Qt::CaseSensitivity sensitivity=Qt::CaseInsensitive)
Given a candidate string, returns true if the candidate contains all the individual words from anothe...
static QString insertLinks(const QString &string, bool *foundLinks=nullptr)
Returns a string with any URL (e.g., http(s)/ftp) and mailto: text converted to valid HTML <a ....
static double fuzzyScore(const QString &candidate, const QString &search)
Tests a candidate string to see how likely it is a match for a specified search string.
static QString qRegExpEscape(const QString &string)
Returns an escaped string matching the behavior of QRegExp::escape.
static QString ampersandEncode(const QString &string)
Makes a raw string safe for inclusion as a HTML/XML string literal.
static QString wordWrap(const QString &string, int length, bool useMaxLineLength=true, const QString &customDelimiter=QString())
Automatically wraps a string by inserting new line characters at appropriate locations in the string.
static bool isUrl(const QString &string)
Returns whether the string is a URL (http,https,ftp,file).
static QString truncateMiddleOfString(const QString &string, int maxLength)
Truncates a string to the specified maximum character length.
static QHash< QString, QString > createUnaccentMap()
Generates the unaccent mapping table (auto-generated by script at build time).
As part of the API refactoring and improvements which landed in QGIS 3.0, the Processing API was substantially reworked from the 2.x version. This was done in order to allow much of the underlying Processing framework to be ported into C++.
QMap< QString, QString > QgsStringMap
Definition qgis.h:7401
#define FUZZY_SCORE_CONSECUTIVE_MATCH
#define FUZZY_SCORE_WORD_MATCH
#define FUZZY_SCORE_NEW_MATCH