QGIS API Documentation 4.1.0-Master (467af3bbe65)
Loading...
Searching...
No Matches
qgsstringutils.cpp
Go to the documentation of this file.
1/***************************************************************************
2 qgsstringutils.cpp
3 ------------------
4 begin : June 2015
5 copyright : (C) 2015 by Nyall Dawson
6 email : nyall dot dawson at gmail dot com
7 ***************************************************************************
8 * *
9 * This program is free software; you can redistribute it and/or modify *
10 * it under the terms of the GNU General Public License as published by *
11 * the Free Software Foundation; either version 2 of the License, or *
12 * (at your option) any later version. *
13 * *
14 ***************************************************************************/
15
16#include "qgsstringutils.h"
17
18#include <cstdlib>
19
20#include "qgslogger.h"
21
22#include <QRegularExpression>
23#include <QString>
24#include <QStringList>
25#include <QTextBoundaryFinder>
26#include <QVector>
27
28using namespace Qt::StringLiterals;
29
31
32QString QgsStringUtils::unaccent( const QString &input )
33{
34 // Normalize input to NFC so that Unicode characters composed of base +
35 // combining marks are converted to their canonical composed form.
36 // This ensures lookups match the keys in UNACCENT_MAP, which are stored
37 // in NFC (e.g. "e" + U+0301 becomes "é", as PostgreSQL does it.).
38 const QString in = input.normalized( QString::NormalizationForm_C );
39 QString out;
40 out.reserve( in.size() );
41
42 qsizetype i = 0;
43 const qsizetype n = in.size();
44
45 while ( i < n )
46 {
47 const QChar c = in.at( i );
48 int len = 1;
49
50 // Detect surrogate pair (non-BMP)
51 if ( c.isHighSurrogate() && i + 1 < n )
52 {
53 const QChar c2 = in.at( i + 1 );
54 if ( c2.isLowSurrogate() )
55 len = 2;
56 }
57
58 const QString key = in.mid( i, len ).normalized( QString::NormalizationForm_C );
59
60 auto it = UNACCENT_MAP.constFind( key );
61 if ( it != UNACCENT_MAP.constEnd() )
62 out.append( it.value() );
63 else
64 out.append( key );
65
66 i += len;
67 }
68
69 return out;
70}
71
72QString QgsStringUtils::capitalize( const QString &string, Qgis::Capitalization capitalization )
73{
74 if ( string.isEmpty() )
75 return QString();
76
77 switch ( capitalization )
78 {
81 return string;
82
84 return string.toUpper();
85
88 return string.toLower();
89
91 {
92 QString temp = string;
93
94 QTextBoundaryFinder wordSplitter( QTextBoundaryFinder::Word, string.constData(), string.length(), nullptr, 0 );
95 QTextBoundaryFinder letterSplitter( QTextBoundaryFinder::Grapheme, string.constData(), string.length(), nullptr, 0 );
96
97 wordSplitter.setPosition( 0 );
98 bool first = true;
99 while ( ( first && wordSplitter.boundaryReasons() & QTextBoundaryFinder::StartOfItem ) || wordSplitter.toNextBoundary() >= 0 )
100 {
101 first = false;
102 letterSplitter.setPosition( wordSplitter.position() );
103 ( void ) letterSplitter.toNextBoundary();
104 QString substr = string.mid( wordSplitter.position(), letterSplitter.position() - wordSplitter.position() );
105 temp.replace( wordSplitter.position(), substr.length(), substr.toUpper() );
106 }
107 return temp;
108 }
109
111 {
112 // yes, this is MASSIVELY simplifying the problem!!
113
114 static QStringList smallWords;
115 static QStringList newPhraseSeparators;
116 static QRegularExpression splitWords;
117 if ( smallWords.empty() )
118 {
119 smallWords = QObject::tr( "a|an|and|as|at|but|by|en|for|if|in|nor|of|on|or|per|s|the|to|vs.|vs|via" ).split( '|' );
120 newPhraseSeparators = QObject::tr( ".|:" ).split( '|' );
121 splitWords = QRegularExpression( u"\\b"_s, QRegularExpression::UseUnicodePropertiesOption );
122 }
123
124 const bool allSameCase = string.toLower() == string || string.toUpper() == string;
125 const QStringList parts = ( allSameCase ? string.toLower() : string ).split( splitWords, Qt::SkipEmptyParts );
126 QString result;
127 bool firstWord = true;
128 int i = 0;
129 int lastWord = parts.count() - 1;
130 for ( const QString &word : std::as_const( parts ) )
131 {
132 if ( newPhraseSeparators.contains( word.trimmed() ) )
133 {
134 firstWord = true;
135 result += word;
136 }
137 else if ( firstWord || ( i == lastWord ) || !smallWords.contains( word ) )
138 {
139 result += word.at( 0 ).toUpper() + word.mid( 1 );
140 firstWord = false;
141 }
142 else
143 {
144 result += word;
145 }
146 i++;
147 }
148 return result;
149 }
150
152 QString result = QgsStringUtils::capitalize( string.toLower(), Qgis::Capitalization::ForceFirstLetterToCapital ).simplified();
153 result.remove( ' ' );
154 return result;
155 }
156 // no warnings
157 return string;
158}
159
160// original code from http://www.qtcentre.org/threads/52456-HTML-Unicode-ampersand-encoding
161QString QgsStringUtils::ampersandEncode( const QString &string )
162{
163 QString encoded;
164 for ( int i = 0; i < string.size(); ++i )
165 {
166 QChar ch = string.at( i );
167 if ( ch.unicode() > 160 )
168 encoded += u"&#%1;"_s.arg( static_cast< int >( ch.unicode() ) );
169 else if ( ch.unicode() == 38 )
170 encoded += "&amp;"_L1;
171 else if ( ch.unicode() == 60 )
172 encoded += "&lt;"_L1;
173 else if ( ch.unicode() == 62 )
174 encoded += "&gt;"_L1;
175 else
176 encoded += ch;
177 }
178 return encoded;
179}
180
181int QgsStringUtils::levenshteinDistance( const QString &string1, const QString &string2, bool caseSensitive )
182{
183 int length1 = string1.length();
184 int length2 = string2.length();
185
186 //empty strings? solution is trivial...
187 if ( string1.isEmpty() )
188 {
189 return length2;
190 }
191 else if ( string2.isEmpty() )
192 {
193 return length1;
194 }
195
196 //handle case sensitive flag (or not)
197 QString s1( caseSensitive ? string1 : string1.toLower() );
198 QString s2( caseSensitive ? string2 : string2.toLower() );
199
200 const QChar *s1Char = s1.constData();
201 const QChar *s2Char = s2.constData();
202
203 //strip out any common prefix
204 int commonPrefixLen = 0;
205 while ( length1 > 0 && length2 > 0 && *s1Char == *s2Char )
206 {
207 commonPrefixLen++;
208 length1--;
209 length2--;
210 s1Char++;
211 s2Char++;
212 }
213
214 //strip out any common suffix
215 while ( length1 > 0 && length2 > 0 && s1.at( commonPrefixLen + length1 - 1 ) == s2.at( commonPrefixLen + length2 - 1 ) )
216 {
217 length1--;
218 length2--;
219 }
220
221 //fully checked either string? if so, the answer is easy...
222 if ( length1 == 0 )
223 {
224 return length2;
225 }
226 else if ( length2 == 0 )
227 {
228 return length1;
229 }
230
231 //ensure the inner loop is longer
232 if ( length1 > length2 )
233 {
234 std::swap( s1, s2 );
235 std::swap( length1, length2 );
236 }
237
238 //levenshtein algorithm begins here
239 std::vector< int > col( length2 + 1, 0 );
240 std::vector< int > prevCol;
241 prevCol.reserve( length2 + 1 );
242 for ( int i = 0; i < length2 + 1; ++i )
243 {
244 prevCol.emplace_back( i );
245 }
246 const QChar *s2start = s2Char;
247 for ( int i = 0; i < length1; ++i )
248 {
249 col[0] = i + 1;
250 s2Char = s2start;
251 for ( int j = 0; j < length2; ++j )
252 {
253 col[j + 1] = std::min( std::min( 1 + col[j], 1 + prevCol[1 + j] ), prevCol[j] + ( ( *s1Char == *s2Char ) ? 0 : 1 ) );
254 s2Char++;
255 }
256 col.swap( prevCol );
257 s1Char++;
258 }
259 return prevCol[length2];
260}
261
262QString QgsStringUtils::longestCommonSubstring( const QString &string1, const QString &string2, bool caseSensitive )
263{
264 if ( string1.isEmpty() || string2.isEmpty() )
265 {
266 //empty strings, solution is trivial...
267 return QString();
268 }
269
270 //handle case sensitive flag (or not)
271 QString s1( caseSensitive ? string1 : string1.toLower() );
272 QString s2( caseSensitive ? string2 : string2.toLower() );
273
274 if ( s1 == s2 )
275 {
276 //another trivial case, identical strings
277 return s1;
278 }
279
280 int *currentScores = new int[s2.length()];
281 int *previousScores = new int[s2.length()];
282 int maxCommonLength = 0;
283 int lastMaxBeginIndex = 0;
284
285 const QChar *s1Char = s1.constData();
286 const QChar *s2Char = s2.constData();
287 const QChar *s2Start = s2Char;
288
289 for ( int i = 0; i < s1.length(); ++i )
290 {
291 for ( int j = 0; j < s2.length(); ++j )
292 {
293 if ( *s1Char != *s2Char )
294 {
295 currentScores[j] = 0;
296 }
297 else
298 {
299 if ( i == 0 || j == 0 )
300 {
301 currentScores[j] = 1;
302 }
303 else
304 {
305 currentScores[j] = 1 + previousScores[j - 1];
306 }
307
308 if ( maxCommonLength < currentScores[j] )
309 {
310 maxCommonLength = currentScores[j];
311 lastMaxBeginIndex = i;
312 }
313 }
314 s2Char++;
315 }
316 std::swap( currentScores, previousScores );
317 s1Char++;
318 s2Char = s2Start;
319 }
320 delete[] currentScores;
321 delete[] previousScores;
322 return string1.mid( lastMaxBeginIndex - maxCommonLength + 1, maxCommonLength );
323}
324
325int QgsStringUtils::hammingDistance( const QString &string1, const QString &string2, bool caseSensitive )
326{
327 if ( string1.isEmpty() && string2.isEmpty() )
328 {
329 //empty strings, solution is trivial...
330 return 0;
331 }
332
333 if ( string1.length() != string2.length() )
334 {
335 //invalid inputs
336 return -1;
337 }
338
339 //handle case sensitive flag (or not)
340 QString s1( caseSensitive ? string1 : string1.toLower() );
341 QString s2( caseSensitive ? string2 : string2.toLower() );
342
343 if ( s1 == s2 )
344 {
345 //another trivial case, identical strings
346 return 0;
347 }
348
349 int distance = 0;
350 const QChar *s1Char = s1.constData();
351 const QChar *s2Char = s2.constData();
352
353 for ( int i = 0; i < string1.length(); ++i )
354 {
355 if ( *s1Char != *s2Char )
356 distance++;
357 s1Char++;
358 s2Char++;
359 }
360
361 return distance;
362}
363
364QString QgsStringUtils::soundex( const QString &string )
365{
366 if ( string.isEmpty() )
367 return QString();
368
369 QString tmp = string.toUpper();
370
371 //strip non character codes, and vowel like characters after the first character
372 QChar *char1 = tmp.data();
373 QChar *char2 = tmp.data();
374 int outLen = 0;
375 for ( int i = 0; i < tmp.length(); ++i, ++char2 )
376 {
377 if ( ( *char2 ).unicode() >= 0x41 && ( *char2 ).unicode() <= 0x5A && ( i == 0 || ( ( *char2 ).unicode() != 0x41 && ( *char2 ).unicode() != 0x45
378 && ( *char2 ).unicode() != 0x48 && ( *char2 ).unicode() != 0x49
379 && ( *char2 ).unicode() != 0x4F && ( *char2 ).unicode() != 0x55
380 && ( *char2 ).unicode() != 0x57 && ( *char2 ).unicode() != 0x59 ) ) )
381 {
382 *char1 = *char2;
383 char1++;
384 outLen++;
385 }
386 }
387 tmp.truncate( outLen );
388
389 QChar *tmpChar = tmp.data();
390 tmpChar++;
391 for ( int i = 1; i < tmp.length(); ++i, ++tmpChar )
392 {
393 switch ( ( *tmpChar ).unicode() )
394 {
395 case 0x42:
396 case 0x46:
397 case 0x50:
398 case 0x56:
399 tmp.replace( i, 1, QChar( 0x31 ) );
400 break;
401
402 case 0x43:
403 case 0x47:
404 case 0x4A:
405 case 0x4B:
406 case 0x51:
407 case 0x53:
408 case 0x58:
409 case 0x5A:
410 tmp.replace( i, 1, QChar( 0x32 ) );
411 break;
412
413 case 0x44:
414 case 0x54:
415 tmp.replace( i, 1, QChar( 0x33 ) );
416 break;
417
418 case 0x4C:
419 tmp.replace( i, 1, QChar( 0x34 ) );
420 break;
421
422 case 0x4D:
423 case 0x4E:
424 tmp.replace( i, 1, QChar( 0x35 ) );
425 break;
426
427 case 0x52:
428 tmp.replace( i, 1, QChar( 0x36 ) );
429 break;
430 }
431 }
432
433 //remove adjacent duplicates
434 char1 = tmp.data();
435 char2 = tmp.data();
436 char2++;
437 outLen = 1;
438 for ( int i = 1; i < tmp.length(); ++i, ++char2 )
439 {
440 if ( *char2 != *char1 )
441 {
442 char1++;
443 *char1 = *char2;
444 outLen++;
445 if ( outLen == 4 )
446 break;
447 }
448 }
449 tmp.truncate( outLen );
450 if ( tmp.length() < 4 )
451 {
452 tmp.append( "000" );
453 tmp.truncate( 4 );
454 }
455
456 return tmp;
457}
458
459
460double QgsStringUtils::fuzzyScore( const QString &candidate, const QString &search )
461{
462 QString candidateNormalized = candidate.simplified().normalized( QString::NormalizationForm_C ).toLower();
463 QString searchNormalized = search.simplified().normalized( QString::NormalizationForm_C ).toLower();
464
465 int candidateLength = candidateNormalized.length();
466 int searchLength = searchNormalized.length();
467 int score = 0;
468
469 // if the candidate and the search term are empty, no other option than 0 score
470 if ( candidateLength == 0 || searchLength == 0 )
471 return score;
472
473 int candidateIdx = 0;
474 int searchIdx = 0;
475 // there is always at least one word
476 int maxScore = FUZZY_SCORE_WORD_MATCH;
477
478 bool isPreviousIndexMatching = false;
479 bool isWordOpen = true;
480
481 // loop trough each candidate char and calculate the potential max score
482 while ( candidateIdx < candidateLength )
483 {
484 QChar candidateChar = candidateNormalized[candidateIdx++];
485 bool isCandidateCharWordEnd = candidateChar == ' ' || candidateChar.isPunct();
486
487 // the first char is always the default score
488 if ( candidateIdx == 1 )
489 maxScore += FUZZY_SCORE_NEW_MATCH;
490 // every space character or underscore is a opportunity for a new word
491 else if ( isCandidateCharWordEnd )
492 maxScore += FUZZY_SCORE_WORD_MATCH;
493 // potentially we can match every other character
494 else
496
497 // we looped through all the characters
498 if ( searchIdx >= searchLength )
499 continue;
500
501 QChar searchChar = searchNormalized[searchIdx];
502 bool isSearchCharWordEnd = searchChar == ' ' || searchChar.isPunct();
503
504 // match!
505 if ( candidateChar == searchChar || ( isCandidateCharWordEnd && isSearchCharWordEnd ) )
506 {
507 searchIdx++;
508
509 // if we have just successfully finished a word, give higher score
510 if ( isSearchCharWordEnd )
511 {
512 if ( isWordOpen )
513 score += FUZZY_SCORE_WORD_MATCH;
514 else if ( isPreviousIndexMatching )
516 else
517 score += FUZZY_SCORE_NEW_MATCH;
518
519 isWordOpen = true;
520 }
521 // if we have consecutive characters matching, give higher score
522 else if ( isPreviousIndexMatching )
523 {
525 }
526 // normal score for new independent character that matches
527 else
528 {
529 score += FUZZY_SCORE_NEW_MATCH;
530 }
531
532 isPreviousIndexMatching = true;
533 }
534 // if the current character does NOT match, we are sure we cannot build a word for now
535 else
536 {
537 isPreviousIndexMatching = false;
538 isWordOpen = false;
539 }
540
541 // if the search string is covered, check if the last match is end of word
542 if ( searchIdx >= searchLength )
543 {
544 bool isEndOfWord = ( candidateIdx >= candidateLength ) ? true : candidateNormalized[candidateIdx] == ' ' || candidateNormalized[candidateIdx].isPunct();
545
546 if ( isEndOfWord )
547 score += FUZZY_SCORE_WORD_MATCH;
548 }
549
550 // QgsLogger::debug( u"TMP: %1 | %2 | %3 | %4 | %5"_s.arg( candidateChar, searchChar, QString::number(score), QString::number(isCandidateCharWordEnd), QString::number(isSearchCharWordEnd) ) + QStringLiteral( __FILE__ ) );
551 }
552
553 // QgsLogger::debug( u"RES: %1 | %2"_s.arg( QString::number(maxScore), QString::number(score) ) + QStringLiteral( __FILE__ ) );
554 // we didn't loop through all the search chars, it means, that they are not present in the current candidate
555 if ( searchIdx < searchLength )
556 score = 0;
557
558 return static_cast<float>( std::max( score, 0 ) ) / std::max( maxScore, 1 );
559}
560
561
562QString QgsStringUtils::insertLinks( const QString &string, bool *foundLinks )
563{
564 QString converted = string;
565
566 // http://alanstorm.com/url_regex_explained
567 // note - there's more robust implementations available
568 const thread_local QRegularExpression urlRegEx(
569 u"((?:(?:['\"\\(]?http|https|ftp|file)://[^\\s]+[^\\s,.]+)|(?:\\b(([\\w-]+://?|www[.])[^\\s()<>]+(?:\\([\\w\\d]+\\)|([^!\"#$%&'()*+,\\-./:;<=>?@[\\\\\\]^_`{|}~\\s]|/)))))"_s
570 );
571 const thread_local QRegularExpression groupedStringRegEx( u"^(['\"\\(]+)(.*?)(?:['\")]+)"_s );
572 const thread_local QRegularExpression protoRegEx( u"^(?:f|ht)tps?://|file://"_s );
573 const thread_local QRegularExpression emailRegEx( u"([\\w._%+-]+@[\\w.-]+\\.[A-Za-z]+)"_s );
574
575 std::size_t offset = 0;
576 bool found = false;
577 QRegularExpressionMatch match = urlRegEx.match( converted );
578 while ( match.hasMatch() )
579 {
580 found = true;
581 QString url = match.captured( 1 );
582 std::size_t urlStart = match.capturedStart( 1 );
583
584 QString protoUrl = url;
585 const QRegularExpressionMatch groupedStringMatch = groupedStringRegEx.match( protoUrl );
586 if ( groupedStringMatch.hasMatch() )
587 {
588 url = groupedStringMatch.captured( 2 );
589 protoUrl = url;
590 urlStart += groupedStringMatch.capturedLength( 1 );
591 }
592 if ( !protoRegEx.match( protoUrl ).hasMatch() )
593 {
594 protoUrl.prepend( "http://" );
595 }
596 QString anchor = u"<a href=\"%1\">%2</a>"_s.arg( protoUrl.toHtmlEscaped(), url.toHtmlEscaped() );
597 converted.replace( urlStart, url.length(), anchor );
598 offset = urlStart + anchor.length();
599 match = urlRegEx.match( converted, offset );
600 }
601
602 match = emailRegEx.match( converted );
603 while ( match.hasMatch() )
604 {
605 found = true;
606 QString email = match.captured( 1 );
607 QString anchor = u"<a href=\"mailto:%1\">%1</a>"_s.arg( email.toHtmlEscaped() );
608 converted.replace( match.capturedStart( 1 ), email.length(), anchor );
609 offset = match.capturedStart( 1 ) + anchor.length();
610 match = emailRegEx.match( converted, offset );
611 }
612
613 if ( foundLinks )
614 *foundLinks = found;
615
616 return converted;
617}
618
619bool QgsStringUtils::isUrl( const QString &string )
620{
621 const thread_local QRegularExpression rxUrl( u"^(http|https|ftp|file)://\\S+$"_s );
622 return rxUrl.match( string ).hasMatch();
623}
624
625QString QgsStringUtils::htmlToMarkdown( const QString &html )
626{
627 // Any changes in this function must be copied to qgscrashreport.cpp too
628 QString converted = html;
629 converted.replace( "<br>"_L1, "\n"_L1 );
630 converted.replace( "<b>"_L1, "**"_L1 );
631 converted.replace( "</b>"_L1, "**"_L1 );
632 converted.replace( "<pre>"_L1, "\n```\n"_L1 );
633 converted.replace( "</pre>"_L1, "```\n"_L1 );
634
635 const thread_local QRegularExpression hrefRegEx( u"<a\\s+href\\s*=\\s*([^<>]*)\\s*>([^<>]*)</a>"_s );
636
637 int offset = 0;
638 QRegularExpressionMatch match = hrefRegEx.match( converted );
639 while ( match.hasMatch() )
640 {
641 QString url = match.captured( 1 ).replace( "\""_L1, QString() );
642 url.replace( '\'', QString() );
643 QString name = match.captured( 2 );
644 QString anchor = u"[%1](%2)"_s.arg( name, url );
645 converted.replace( match.capturedStart(), match.capturedLength(), anchor );
646 offset = match.capturedStart() + anchor.length();
647 match = hrefRegEx.match( converted, offset );
648 }
649
650 return converted;
651}
652
653QString QgsStringUtils::wordWrap( const QString &string, const int length, const bool useMaxLineLength, const QString &customDelimiter )
654{
655 if ( string.isEmpty() || length == 0 )
656 return string;
657
658 QString newstr;
659 QRegularExpression rx;
660 int delimiterLength = 0;
661
662 if ( !customDelimiter.isEmpty() )
663 {
664 rx.setPattern( QRegularExpression::escape( customDelimiter ) );
665 delimiterLength = customDelimiter.length();
666 }
667 else
668 {
669 // \x{200B} is a ZERO-WIDTH SPACE, needed for worwrap to support a number of complex scripts (Indic, Arabic, etc.)
670 rx.setPattern( u"[\\x{200B}\\s]"_s );
671 delimiterLength = 1;
672 }
673
674 const QStringList lines = string.split( '\n' );
675 int strLength, strCurrent, strHit, lastHit;
676
677 for ( int i = 0; i < lines.size(); i++ )
678 {
679 const QString line = lines.at( i );
680 strLength = line.length();
681 if ( strLength <= length )
682 {
683 // shortcut, no wrapping required
684 newstr.append( line );
685 if ( i < lines.size() - 1 )
686 newstr.append( '\n' );
687 continue;
688 }
689 strCurrent = 0;
690 strHit = 0;
691 lastHit = 0;
692
693 while ( strCurrent < strLength )
694 {
695 // positive wrap value = desired maximum line width to wrap
696 // negative wrap value = desired minimum line width before wrap
697 if ( useMaxLineLength )
698 {
699 //first try to locate delimiter backwards
700 strHit = ( strCurrent + length >= strLength ) ? -1 : line.lastIndexOf( rx, strCurrent + length );
701 if ( strHit == lastHit || strHit == -1 )
702 {
703 //if no new backward delimiter found, try to locate forward
704 strHit = ( strCurrent + std::abs( length ) >= strLength ) ? -1 : line.indexOf( rx, strCurrent + std::abs( length ) );
705 }
706 lastHit = strHit;
707 }
708 else
709 {
710 strHit = ( strCurrent + std::abs( length ) >= strLength ) ? -1 : line.indexOf( rx, strCurrent + std::abs( length ) );
711 }
712 if ( strHit > -1 )
713 {
714 newstr.append( QStringView { line }.mid( strCurrent, strHit - strCurrent ) );
715 newstr.append( '\n' );
716 strCurrent = strHit + delimiterLength;
717 }
718 else
719 {
720 newstr.append( QStringView { line }.mid( strCurrent ) );
721 strCurrent = strLength;
722 }
723 }
724 if ( i < lines.size() - 1 )
725 newstr.append( '\n' );
726 }
727
728 return newstr;
729}
730
732{
733 string = string.replace( ',', QChar( 65040 ) ).replace( QChar( 8229 ), QChar( 65072 ) ); // comma & two-dot leader
734 string = string.replace( QChar( 12289 ), QChar( 65041 ) ).replace( QChar( 12290 ), QChar( 65042 ) ); // ideographic comma & full stop
735 string = string.replace( ':', QChar( 65043 ) ).replace( ';', QChar( 65044 ) );
736 string = string.replace( '!', QChar( 65045 ) ).replace( '?', QChar( 65046 ) );
737 string = string.replace( QChar( 12310 ), QChar( 65047 ) ).replace( QChar( 12311 ), QChar( 65048 ) ); // white lenticular brackets
738 string = string.replace( QChar( 8230 ), QChar( 65049 ) ); // three-dot ellipse
739 string = string.replace( QChar( 8212 ), QChar( 65073 ) ).replace( QChar( 8211 ), QChar( 65074 ) ); // em & en dash
740 string = string.replace( '_', QChar( 65075 ) ).replace( QChar( 65103 ), QChar( 65076 ) ); // low line & wavy low line
741 string = string.replace( '(', QChar( 65077 ) ).replace( ')', QChar( 65078 ) );
742 string = string.replace( '{', QChar( 65079 ) ).replace( '}', QChar( 65080 ) );
743 string = string.replace( '<', QChar( 65087 ) ).replace( '>', QChar( 65088 ) );
744 string = string.replace( '[', QChar( 65095 ) ).replace( ']', QChar( 65096 ) );
745 string = string.replace( QChar( 12308 ), QChar( 65081 ) ).replace( QChar( 12309 ), QChar( 65082 ) ); // tortoise shell brackets
746 string = string.replace( QChar( 12304 ), QChar( 65083 ) ).replace( QChar( 12305 ), QChar( 65084 ) ); // black lenticular brackets
747 string = string.replace( QChar( 12298 ), QChar( 65085 ) ).replace( QChar( 12299 ), QChar( 65086 ) ); // double angle brackets
748 string = string.replace( QChar( 12300 ), QChar( 65089 ) ).replace( QChar( 12301 ), QChar( 65090 ) ); // corner brackets
749 string = string.replace( QChar( 12302 ), QChar( 65091 ) ).replace( QChar( 12303 ), QChar( 65092 ) ); // white corner brackets
750 return string;
751}
752
753QString QgsStringUtils::qRegExpEscape( const QString &string )
754{
755 // code and logic taken from the Qt source code
756 const QLatin1Char backslash( '\\' );
757 const int count = string.count();
758
759 QString escaped;
760 escaped.reserve( count * 2 );
761 for ( int i = 0; i < count; i++ )
762 {
763 switch ( string.at( i ).toLatin1() )
764 {
765 case '$':
766 case '(':
767 case ')':
768 case '*':
769 case '+':
770 case '.':
771 case '?':
772 case '[':
773 case '\\':
774 case ']':
775 case '^':
776 case '{':
777 case '|':
778 case '}':
779 escaped.append( backslash );
780 }
781 escaped.append( string.at( i ) );
782 }
783 return escaped;
784}
785
786QString QgsStringUtils::truncateMiddleOfString( const QString &string, int maxLength )
787{
788 const int charactersToTruncate = string.length() - maxLength;
789 if ( charactersToTruncate <= 0 )
790 return string;
791
792 // note we actually truncate an extra character, as we'll be replacing it with the ... character
793 const int truncateFrom = string.length() / 2 - ( charactersToTruncate + 1 ) / 2;
794 if ( truncateFrom <= 0 )
795 return QChar( 0x2026 );
796
797 return QStringView( string ).first( truncateFrom ) + QString( QChar( 0x2026 ) ) + QStringView( string ).sliced( truncateFrom + charactersToTruncate + 1 );
798}
799
800bool QgsStringUtils::containsByWord( const QString &candidate, const QString &words, Qt::CaseSensitivity sensitivity )
801{
802 if ( candidate.trimmed().isEmpty() )
803 return false;
804
805 const thread_local QRegularExpression rxWhitespace( u"\\s+"_s );
806 const QStringList parts = words.split( rxWhitespace, Qt::SkipEmptyParts );
807 if ( parts.empty() )
808 return false;
809 for ( const QString &word : parts )
810 {
811 if ( !candidate.contains( word, sensitivity ) )
812 return false;
813 }
814 return true;
815}
816
818 : mMatch( match )
819 , mReplacement( replacement )
820 , mCaseSensitive( caseSensitive )
821 , mWholeWordOnly( wholeWordOnly )
822{
823 if ( mWholeWordOnly )
824 {
825 mRx.setPattern( u"\\b%1\\b"_s.arg( mMatch ) );
826 mRx.setPatternOptions( mCaseSensitive ? QRegularExpression::NoPatternOption : QRegularExpression::CaseInsensitiveOption );
827 }
828}
829
830QString QgsStringReplacement::process( const QString &input ) const
831{
832 QString result = input;
833 if ( !mWholeWordOnly )
834 {
835 return result.replace( mMatch, mReplacement, mCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive );
836 }
837 else
838 {
839 return result.replace( mRx, mReplacement );
840 }
841}
842
844{
845 QgsStringMap map;
846 map.insert( u"match"_s, mMatch );
847 map.insert( u"replace"_s, mReplacement );
848 map.insert( u"caseSensitive"_s, mCaseSensitive ? u"1"_s : u"0"_s );
849 map.insert( u"wholeWord"_s, mWholeWordOnly ? u"1"_s : u"0"_s );
850 return map;
851}
852
854{
855 return QgsStringReplacement( properties.value( u"match"_s ), properties.value( u"replace"_s ), properties.value( u"caseSensitive"_s, u"0"_s ) == "1"_L1, properties.value( u"wholeWord"_s, u"0"_s ) == "1"_L1 );
856}
857
858QString QgsStringReplacementCollection::process( const QString &input ) const
859{
860 QString result = input;
861 for ( const QgsStringReplacement &r : mReplacements )
862 {
863 result = r.process( result );
864 }
865 return result;
866}
867
868void QgsStringReplacementCollection::writeXml( QDomElement &elem, QDomDocument &doc ) const
869{
870 for ( const QgsStringReplacement &r : mReplacements )
871 {
872 QgsStringMap props = r.properties();
873 QDomElement propEl = doc.createElement( u"replacement"_s );
874 QgsStringMap::const_iterator it = props.constBegin();
875 for ( ; it != props.constEnd(); ++it )
876 {
877 propEl.setAttribute( it.key(), it.value() );
878 }
879 elem.appendChild( propEl );
880 }
881}
882
883void QgsStringReplacementCollection::readXml( const QDomElement &elem )
884{
885 mReplacements.clear();
886 QDomNodeList nodelist = elem.elementsByTagName( u"replacement"_s );
887 for ( int i = 0; i < nodelist.count(); i++ )
888 {
889 QDomElement replacementElem = nodelist.at( i ).toElement();
890 QDomNamedNodeMap nodeMap = replacementElem.attributes();
891
892 QgsStringMap props;
893 for ( int j = 0; j < nodeMap.count(); ++j )
894 {
895 props.insert( nodeMap.item( j ).nodeName(), nodeMap.item( j ).nodeValue() );
896 }
897 mReplacements << QgsStringReplacement::fromProperties( props );
898 }
899}
Capitalization
String capitalization options.
Definition qgis.h:3555
@ AllSmallCaps
Force all characters to small caps.
Definition qgis.h:3563
@ MixedCase
Mixed case, ie no change.
Definition qgis.h:3556
@ UpperCamelCase
Convert the string to upper camel case. Note that this method does not unaccent characters.
Definition qgis.h:3562
@ AllLowercase
Convert all characters to lowercase.
Definition qgis.h:3558
@ TitleCase
Simple title case conversion - does not fully grammatically parse the text and uses simple rules only...
Definition qgis.h:3561
@ SmallCaps
Mixed case small caps.
Definition qgis.h:3560
@ ForceFirstLetterToCapital
Convert just the first letter of each word to uppercase, leave the rest untouched.
Definition qgis.h:3559
@ AllUppercase
Convert all characters to uppercase.
Definition qgis.h:3557
void readXml(const QDomElement &elem)
Reads the collection state from an XML element.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made using QgsStringR...
void writeXml(QDomElement &elem, QDomDocument &doc) const
Writes the collection state to an XML element.
A representation of a single string replacement.
static QgsStringReplacement fromProperties(const QgsStringMap &properties)
Creates a new QgsStringReplacement from an encoded properties map.
QString process(const QString &input) const
Processes a given input string, applying any valid replacements which should be made.
bool wholeWordOnly() const
Returns true if match only applies to whole words, or false if partial word matches are permitted.
QString replacement() const
Returns the string to replace matches with.
bool caseSensitive() const
Returns true if match is case sensitive.
QgsStringReplacement(const QString &match, const QString &replacement, bool caseSensitive=false, bool wholeWordOnly=false)
Constructor for QgsStringReplacement.
QString match() const
Returns the string matched by this object.
QgsStringMap properties() const
Returns a map of the replacement properties.
static int hammingDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Hamming distance between two strings.
static QString soundex(const QString &string)
Returns the Soundex representation of a string.
static QHash< QString, QString > UNACCENT_MAP
Lookup table used by unaccent().
static int levenshteinDistance(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the Levenshtein edit distance between two strings.
static QString htmlToMarkdown(const QString &html)
Convert simple HTML to markdown.
static QString longestCommonSubstring(const QString &string1, const QString &string2, bool caseSensitive=false)
Returns the longest common substring between two strings.
static QString capitalize(const QString &string, Qgis::Capitalization capitalization)
Converts a string by applying capitalization rules to the string.
static QString substituteVerticalCharacters(QString string)
Returns a string with characters having vertical representation form substituted.
static QString unaccent(const QString &input)
Removes accents and other diacritical marks from a string, replacing accented characters with their u...
static bool containsByWord(const QString &candidate, const QString &words, Qt::CaseSensitivity sensitivity=Qt::CaseInsensitive)
Given a candidate string, returns true if the candidate contains all the individual words from anothe...
static QString insertLinks(const QString &string, bool *foundLinks=nullptr)
Returns a string with any URL (e.g., http(s)/ftp) and mailto: text converted to valid HTML <a ....
static double fuzzyScore(const QString &candidate, const QString &search)
Tests a candidate string to see how likely it is a match for a specified search string.
static QString qRegExpEscape(const QString &string)
Returns an escaped string matching the behavior of QRegExp::escape.
static QString ampersandEncode(const QString &string)
Makes a raw string safe for inclusion as a HTML/XML string literal.
static QString wordWrap(const QString &string, int length, bool useMaxLineLength=true, const QString &customDelimiter=QString())
Automatically wraps a string by inserting new line characters at appropriate locations in the string.
static bool isUrl(const QString &string)
Returns whether the string is a URL (http,https,ftp,file).
static QString truncateMiddleOfString(const QString &string, int maxLength)
Truncates a string to the specified maximum character length.
static QHash< QString, QString > createUnaccentMap()
Generates the unaccent mapping table (auto-generated by script at build time).
As part of the API refactoring and improvements which landed in QGIS 3.0, the Processing API was substantially reworked from the 2.x version. This was done in order to allow much of the underlying Processing framework to be ported into C++.
QMap< QString, QString > QgsStringMap
Definition qgis.h:7592
#define FUZZY_SCORE_CONSECUTIVE_MATCH
#define FUZZY_SCORE_WORD_MATCH
#define FUZZY_SCORE_NEW_MATCH