Loading media/libmedia/CharacterEncodingDetector.cpp +96 −19 Original line number Diff line number Diff line Loading @@ -90,6 +90,7 @@ void CharacterEncodingDetector::detectAndConvert() { char buf[1024]; buf[0] = 0; int idx; bool allprintable = true; for (int i = 0; i < size; i++) { const char *name = mNames.getEntry(i); const char *value = mValues.getEntry(i); Loading @@ -103,19 +104,61 @@ void CharacterEncodingDetector::detectAndConvert() { strlcat(buf, value, sizeof(buf)); // separate tags by space so ICU's ngram detector can do its job strlcat(buf, " ", sizeof(buf)); allprintable = false; } } ucsdet_setText(csd, buf, strlen(buf), &status); const char *combinedenc = "UTF-8"; if (allprintable) { // since 'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so // no need to even call it ALOGV("all tags are printable, assuming ascii (%d)", strlen(buf)); } else { ucsdet_setText(csd, buf, strlen(buf), &status); int32_t matches; const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status); const char *combinedenc = "???"; const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), ucma, matches); bool goodmatch = true; const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), ucma, matches, &goodmatch); if (!goodmatch && strlen(buf) < 20) { ALOGV("not a good match, trying with more data"); // This string might be too short for ICU to do anything useful with. // (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because // the ISO detector reports a confidence of 0, while the GB18030 detector reports // a confidence of 10 with no invalid characters) // Append artist, album and title if they were previously omitted because they // were printable ascii. bool added = false; for (int i = 0; i < size; i++) { const char *name = mNames.getEntry(i); const char *value = mValues.getEntry(i); if (isPrintableAscii(value, strlen(value)) && ( !strcmp(name, "artist") || !strcmp(name, "album") || !strcmp(name, "title"))) { strlcat(buf, value, sizeof(buf)); strlcat(buf, " ", sizeof(buf)); added = true; } } if (added) { ucsdet_setText(csd, buf, strlen(buf), &status); ucma = ucsdet_detectAll(csd, &matches, &status); bestCombinedMatch = getPreferred(buf, strlen(buf), ucma, matches, &goodmatch); if (!goodmatch) { ALOGV("still not a good match after adding printable tags"); } } else { ALOGV("no printable tags to add"); } } if (bestCombinedMatch != NULL) { combinedenc = ucsdet_getName(bestCombinedMatch, &status); } } for (int i = 0; i < size; i++) { const char *name = mNames.getEntry(i); Loading @@ -128,7 +171,7 @@ void CharacterEncodingDetector::detectAndConvert() { int32_t inputLength = strlen(s); const char *enc; if (!strcmp(name, "artist") || if (!allprintable && !strcmp(name, "artist") || !strcmp(name, "albumartist") || !strcmp(name, "composer") || !strcmp(name, "genre") || Loading @@ -136,6 +179,10 @@ void CharacterEncodingDetector::detectAndConvert() { !strcmp(name, "title")) { // use encoding determined from the combination of artist/album/title etc. enc = combinedenc; } else { if (isPrintableAscii(s, inputLength)) { enc = "UTF-8"; ALOGV("@@@@ %s is ascii", mNames.getEntry(i)); } else { ucsdet_setText(csd, s, inputLength, &status); ucm = ucsdet_detect(csd, &status); Loading @@ -147,6 +194,7 @@ void CharacterEncodingDetector::detectAndConvert() { ALOGV("@@@@ recognized charset: %s for %s confidence %d", enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status)); } } if (strcmp(enc,"UTF-8") != 0) { // only convert if the source encoding isn't already UTF-8 Loading Loading @@ -207,10 +255,15 @@ void CharacterEncodingDetector::detectAndConvert() { * algorithm and larger frequent character lists than ICU * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc) * - pick the highest match * - signal to the caller whether this match is considered good: confidence > 15, and confidence * delta with the next runner up > 15 */ const UCharsetMatch *CharacterEncodingDetector::getPreferred( const char *input, size_t len, const UCharsetMatch** ucma, size_t nummatches) { const char *input, size_t len, const UCharsetMatch** ucma, size_t nummatches, bool *goodmatch) { *goodmatch = false; Vector<const UCharsetMatch*> matches; UErrorCode status = U_ZERO_ERROR; Loading @@ -227,6 +280,10 @@ const UCharsetMatch *CharacterEncodingDetector::getPreferred( return NULL; } if (num == 1) { int confidence = ucsdet_getConfidence(matches[0], &status); if (confidence > 15) { *goodmatch = true; } return matches[0]; } Loading Loading @@ -326,15 +383,35 @@ const UCharsetMatch *CharacterEncodingDetector::getPreferred( // find match with highest confidence after adjusting for unlikely characters int highest = newconfidence[0]; size_t highestidx = 0; int runnerup = -10000; int runnerupidx = -10000; num = newconfidence.size(); for (size_t i = 1; i < num; i++) { if (newconfidence[i] > highest) { runnerup = highest; runnerupidx = highestidx; highest = newconfidence[i]; highestidx = i; } else if (newconfidence[i] > runnerup){ runnerup = newconfidence[i]; runnerupidx = i; } } status = U_ZERO_ERROR; ALOGV("selecting '%s' w/ %d confidence", ucsdet_getName(matches[highestidx], &status), highest); ALOGV("selecting: '%s' w/ %d confidence", ucsdet_getName(matches[highestidx], &status), highest); if (runnerupidx < 0) { ALOGV("no runner up"); if (highest > 15) { *goodmatch = true; } } else { ALOGV("runner up: '%s' w/ %d confidence", ucsdet_getName(matches[runnerupidx], &status), runnerup); if ((highest - runnerup) > 15) { *goodmatch = true; } } return matches[highestidx]; } Loading media/libmedia/CharacterEncodingDetector.h +3 −1 Original line number Diff line number Diff line Loading @@ -41,7 +41,9 @@ class CharacterEncodingDetector { private: const UCharsetMatch *getPreferred( const char *input, size_t len, const UCharsetMatch** ucma, size_t matches); const char *input, size_t len, const UCharsetMatch** ucma, size_t matches, bool *goodmatch); bool isFrequent(const uint16_t *values, uint32_t c); Loading Loading
media/libmedia/CharacterEncodingDetector.cpp +96 −19 Original line number Diff line number Diff line Loading @@ -90,6 +90,7 @@ void CharacterEncodingDetector::detectAndConvert() { char buf[1024]; buf[0] = 0; int idx; bool allprintable = true; for (int i = 0; i < size; i++) { const char *name = mNames.getEntry(i); const char *value = mValues.getEntry(i); Loading @@ -103,19 +104,61 @@ void CharacterEncodingDetector::detectAndConvert() { strlcat(buf, value, sizeof(buf)); // separate tags by space so ICU's ngram detector can do its job strlcat(buf, " ", sizeof(buf)); allprintable = false; } } ucsdet_setText(csd, buf, strlen(buf), &status); const char *combinedenc = "UTF-8"; if (allprintable) { // since 'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so // no need to even call it ALOGV("all tags are printable, assuming ascii (%d)", strlen(buf)); } else { ucsdet_setText(csd, buf, strlen(buf), &status); int32_t matches; const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status); const char *combinedenc = "???"; const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), ucma, matches); bool goodmatch = true; const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), ucma, matches, &goodmatch); if (!goodmatch && strlen(buf) < 20) { ALOGV("not a good match, trying with more data"); // This string might be too short for ICU to do anything useful with. // (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because // the ISO detector reports a confidence of 0, while the GB18030 detector reports // a confidence of 10 with no invalid characters) // Append artist, album and title if they were previously omitted because they // were printable ascii. bool added = false; for (int i = 0; i < size; i++) { const char *name = mNames.getEntry(i); const char *value = mValues.getEntry(i); if (isPrintableAscii(value, strlen(value)) && ( !strcmp(name, "artist") || !strcmp(name, "album") || !strcmp(name, "title"))) { strlcat(buf, value, sizeof(buf)); strlcat(buf, " ", sizeof(buf)); added = true; } } if (added) { ucsdet_setText(csd, buf, strlen(buf), &status); ucma = ucsdet_detectAll(csd, &matches, &status); bestCombinedMatch = getPreferred(buf, strlen(buf), ucma, matches, &goodmatch); if (!goodmatch) { ALOGV("still not a good match after adding printable tags"); } } else { ALOGV("no printable tags to add"); } } if (bestCombinedMatch != NULL) { combinedenc = ucsdet_getName(bestCombinedMatch, &status); } } for (int i = 0; i < size; i++) { const char *name = mNames.getEntry(i); Loading @@ -128,7 +171,7 @@ void CharacterEncodingDetector::detectAndConvert() { int32_t inputLength = strlen(s); const char *enc; if (!strcmp(name, "artist") || if (!allprintable && !strcmp(name, "artist") || !strcmp(name, "albumartist") || !strcmp(name, "composer") || !strcmp(name, "genre") || Loading @@ -136,6 +179,10 @@ void CharacterEncodingDetector::detectAndConvert() { !strcmp(name, "title")) { // use encoding determined from the combination of artist/album/title etc. enc = combinedenc; } else { if (isPrintableAscii(s, inputLength)) { enc = "UTF-8"; ALOGV("@@@@ %s is ascii", mNames.getEntry(i)); } else { ucsdet_setText(csd, s, inputLength, &status); ucm = ucsdet_detect(csd, &status); Loading @@ -147,6 +194,7 @@ void CharacterEncodingDetector::detectAndConvert() { ALOGV("@@@@ recognized charset: %s for %s confidence %d", enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status)); } } if (strcmp(enc,"UTF-8") != 0) { // only convert if the source encoding isn't already UTF-8 Loading Loading @@ -207,10 +255,15 @@ void CharacterEncodingDetector::detectAndConvert() { * algorithm and larger frequent character lists than ICU * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc) * - pick the highest match * - signal to the caller whether this match is considered good: confidence > 15, and confidence * delta with the next runner up > 15 */ const UCharsetMatch *CharacterEncodingDetector::getPreferred( const char *input, size_t len, const UCharsetMatch** ucma, size_t nummatches) { const char *input, size_t len, const UCharsetMatch** ucma, size_t nummatches, bool *goodmatch) { *goodmatch = false; Vector<const UCharsetMatch*> matches; UErrorCode status = U_ZERO_ERROR; Loading @@ -227,6 +280,10 @@ const UCharsetMatch *CharacterEncodingDetector::getPreferred( return NULL; } if (num == 1) { int confidence = ucsdet_getConfidence(matches[0], &status); if (confidence > 15) { *goodmatch = true; } return matches[0]; } Loading Loading @@ -326,15 +383,35 @@ const UCharsetMatch *CharacterEncodingDetector::getPreferred( // find match with highest confidence after adjusting for unlikely characters int highest = newconfidence[0]; size_t highestidx = 0; int runnerup = -10000; int runnerupidx = -10000; num = newconfidence.size(); for (size_t i = 1; i < num; i++) { if (newconfidence[i] > highest) { runnerup = highest; runnerupidx = highestidx; highest = newconfidence[i]; highestidx = i; } else if (newconfidence[i] > runnerup){ runnerup = newconfidence[i]; runnerupidx = i; } } status = U_ZERO_ERROR; ALOGV("selecting '%s' w/ %d confidence", ucsdet_getName(matches[highestidx], &status), highest); ALOGV("selecting: '%s' w/ %d confidence", ucsdet_getName(matches[highestidx], &status), highest); if (runnerupidx < 0) { ALOGV("no runner up"); if (highest > 15) { *goodmatch = true; } } else { ALOGV("runner up: '%s' w/ %d confidence", ucsdet_getName(matches[runnerupidx], &status), runnerup); if ((highest - runnerup) > 15) { *goodmatch = true; } } return matches[highestidx]; } Loading
media/libmedia/CharacterEncodingDetector.h +3 −1 Original line number Diff line number Diff line Loading @@ -41,7 +41,9 @@ class CharacterEncodingDetector { private: const UCharsetMatch *getPreferred( const char *input, size_t len, const UCharsetMatch** ucma, size_t matches); const char *input, size_t len, const UCharsetMatch** ucma, size_t matches, bool *goodmatch); bool isFrequent(const uint16_t *values, uint32_t c); Loading