Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit bfd55f24 authored by Marco Nelissen's avatar Marco Nelissen
Browse files

Use more tags to help the ICU detector.

The detector only gave non-ascii data to ICU. In some cases that could
result in very short data, for which ICU would issue a low confidence
level for the actual encoding. By padding the data with additional
(ascii) tags, we improve accuracy for such files. Because this can
reduce accuracy in other cases, only do this when the initial confidence
is low.

b/13473604

Change-Id: I63d932043155c310b0e358cdf2d37787961e94b7
parent e848bd9a
Loading
Loading
Loading
Loading
+96 −19
Original line number Original line Diff line number Diff line
@@ -90,6 +90,7 @@ void CharacterEncodingDetector::detectAndConvert() {
        char buf[1024];
        char buf[1024];
        buf[0] = 0;
        buf[0] = 0;
        int idx;
        int idx;
        bool allprintable = true;
        for (int i = 0; i < size; i++) {
        for (int i = 0; i < size; i++) {
            const char *name = mNames.getEntry(i);
            const char *name = mNames.getEntry(i);
            const char *value = mValues.getEntry(i);
            const char *value = mValues.getEntry(i);
@@ -103,19 +104,61 @@ void CharacterEncodingDetector::detectAndConvert() {
                strlcat(buf, value, sizeof(buf));
                strlcat(buf, value, sizeof(buf));
                // separate tags by space so ICU's ngram detector can do its job
                // separate tags by space so ICU's ngram detector can do its job
                strlcat(buf, " ", sizeof(buf));
                strlcat(buf, " ", sizeof(buf));
                allprintable = false;
            }
            }
        }
        }
        ucsdet_setText(csd, buf, strlen(buf), &status);


        const char *combinedenc = "UTF-8";
        if (allprintable) {
            // since 'buf' is empty, ICU would return a UTF-8 matcher with low confidence, so
            // no need to even call it
            ALOGV("all tags are printable, assuming ascii (%d)", strlen(buf));
        } else {
            ucsdet_setText(csd, buf, strlen(buf), &status);
            int32_t matches;
            int32_t matches;
            const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
            const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
        const char *combinedenc = "???";
            bool goodmatch = true;

            const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf),
        const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), ucma, matches);
                    ucma, matches, &goodmatch);

            if (!goodmatch && strlen(buf) < 20) {
                ALOGV("not a good match, trying with more data");
                // This string might be too short for ICU to do anything useful with.
                // (real world example: "Björk" in ISO-8859-1 might be detected as GB18030, because
                //  the ISO detector reports a confidence of 0, while the GB18030 detector reports
                //  a confidence of 10 with no invalid characters)
                // Append artist, album and title if they were previously omitted because they
                // were printable ascii.
                bool added = false;
                for (int i = 0; i < size; i++) {
                    const char *name = mNames.getEntry(i);
                    const char *value = mValues.getEntry(i);
                    if (isPrintableAscii(value, strlen(value)) && (
                                !strcmp(name, "artist") ||
                                !strcmp(name, "album") ||
                                !strcmp(name, "title"))) {
                        strlcat(buf, value, sizeof(buf));
                        strlcat(buf, " ", sizeof(buf));
                        added = true;
                    }
                }
                if (added) {
                    ucsdet_setText(csd, buf, strlen(buf), &status);
                    ucma = ucsdet_detectAll(csd, &matches, &status);
                    bestCombinedMatch = getPreferred(buf, strlen(buf),
                            ucma, matches, &goodmatch);
                    if (!goodmatch) {
                        ALOGV("still not a good match after adding printable tags");
                    }
                } else {
                    ALOGV("no printable tags to add");
                }
            }


            if (bestCombinedMatch != NULL) {
            if (bestCombinedMatch != NULL) {
                combinedenc = ucsdet_getName(bestCombinedMatch, &status);
                combinedenc = ucsdet_getName(bestCombinedMatch, &status);
            }
            }
        }


        for (int i = 0; i < size; i++) {
        for (int i = 0; i < size; i++) {
            const char *name = mNames.getEntry(i);
            const char *name = mNames.getEntry(i);
@@ -128,7 +171,7 @@ void CharacterEncodingDetector::detectAndConvert() {
            int32_t inputLength = strlen(s);
            int32_t inputLength = strlen(s);
            const char *enc;
            const char *enc;


            if (!strcmp(name, "artist") ||
            if (!allprintable && !strcmp(name, "artist") ||
                    !strcmp(name, "albumartist") ||
                    !strcmp(name, "albumartist") ||
                    !strcmp(name, "composer") ||
                    !strcmp(name, "composer") ||
                    !strcmp(name, "genre") ||
                    !strcmp(name, "genre") ||
@@ -136,6 +179,10 @@ void CharacterEncodingDetector::detectAndConvert() {
                    !strcmp(name, "title")) {
                    !strcmp(name, "title")) {
                // use encoding determined from the combination of artist/album/title etc.
                // use encoding determined from the combination of artist/album/title etc.
                enc = combinedenc;
                enc = combinedenc;
            } else {
                if (isPrintableAscii(s, inputLength)) {
                    enc = "UTF-8";
                    ALOGV("@@@@ %s is ascii", mNames.getEntry(i));
                } else {
                } else {
                    ucsdet_setText(csd, s, inputLength, &status);
                    ucsdet_setText(csd, s, inputLength, &status);
                    ucm = ucsdet_detect(csd, &status);
                    ucm = ucsdet_detect(csd, &status);
@@ -147,6 +194,7 @@ void CharacterEncodingDetector::detectAndConvert() {
                    ALOGV("@@@@ recognized charset: %s for %s confidence %d",
                    ALOGV("@@@@ recognized charset: %s for %s confidence %d",
                            enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
                            enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
                }
                }
            }


            if (strcmp(enc,"UTF-8") != 0) {
            if (strcmp(enc,"UTF-8") != 0) {
                // only convert if the source encoding isn't already UTF-8
                // only convert if the source encoding isn't already UTF-8
@@ -207,10 +255,15 @@ void CharacterEncodingDetector::detectAndConvert() {
 *   algorithm and larger frequent character lists than ICU
 *   algorithm and larger frequent character lists than ICU
 * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc)
 * - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc)
 * - pick the highest match
 * - pick the highest match
 * - signal to the caller whether this match is considered good: confidence > 15, and confidence
 *   delta with the next runner up > 15
 */
 */
const UCharsetMatch *CharacterEncodingDetector::getPreferred(
const UCharsetMatch *CharacterEncodingDetector::getPreferred(
        const char *input, size_t len, const UCharsetMatch** ucma, size_t nummatches) {
        const char *input, size_t len,
        const UCharsetMatch** ucma, size_t nummatches,
        bool *goodmatch) {


    *goodmatch = false;
    Vector<const UCharsetMatch*> matches;
    Vector<const UCharsetMatch*> matches;
    UErrorCode status = U_ZERO_ERROR;
    UErrorCode status = U_ZERO_ERROR;


@@ -227,6 +280,10 @@ const UCharsetMatch *CharacterEncodingDetector::getPreferred(
        return NULL;
        return NULL;
    }
    }
    if (num == 1) {
    if (num == 1) {
        int confidence = ucsdet_getConfidence(matches[0], &status);
        if (confidence > 15) {
            *goodmatch = true;
        }
        return matches[0];
        return matches[0];
    }
    }


@@ -326,15 +383,35 @@ const UCharsetMatch *CharacterEncodingDetector::getPreferred(
    // find match with highest confidence after adjusting for unlikely characters
    // find match with highest confidence after adjusting for unlikely characters
    int highest = newconfidence[0];
    int highest = newconfidence[0];
    size_t highestidx = 0;
    size_t highestidx = 0;
    int runnerup = -10000;
    int runnerupidx = -10000;
    num = newconfidence.size();
    num = newconfidence.size();
    for (size_t i = 1; i < num; i++) {
    for (size_t i = 1; i < num; i++) {
        if (newconfidence[i] > highest) {
        if (newconfidence[i] > highest) {
            runnerup = highest;
            runnerupidx = highestidx;
            highest = newconfidence[i];
            highest = newconfidence[i];
            highestidx = i;
            highestidx = i;
        } else if (newconfidence[i] > runnerup){
            runnerup = newconfidence[i];
            runnerupidx = i;
        }
        }
    }
    }
    status = U_ZERO_ERROR;
    status = U_ZERO_ERROR;
    ALOGV("selecting '%s' w/ %d confidence", ucsdet_getName(matches[highestidx], &status), highest);
    ALOGV("selecting: '%s' w/ %d confidence",
            ucsdet_getName(matches[highestidx], &status), highest);
    if (runnerupidx < 0) {
        ALOGV("no runner up");
        if (highest > 15) {
            *goodmatch = true;
        }
    } else {
        ALOGV("runner up: '%s' w/ %d confidence",
                ucsdet_getName(matches[runnerupidx], &status), runnerup);
        if ((highest - runnerup) > 15) {
            *goodmatch = true;
        }
    }
    return matches[highestidx];
    return matches[highestidx];
}
}


+3 −1
Original line number Original line Diff line number Diff line
@@ -41,7 +41,9 @@ class CharacterEncodingDetector {


    private:
    private:
        const UCharsetMatch *getPreferred(
        const UCharsetMatch *getPreferred(
                const char *input, size_t len, const UCharsetMatch** ucma, size_t matches);
                const char *input, size_t len,
                const UCharsetMatch** ucma, size_t matches,
                bool *goodmatch);


        bool isFrequent(const uint16_t *values, uint32_t c);
        bool isFrequent(const uint16_t *values, uint32_t c);