Merge "Better character set encoding detection" (bd432dec) · Commits · e / os / android_frameworks_av

include/media/mediascanner.h

+5 −9

Original line number	Diff line number	Diff line
		@@ -21,6 +21,7 @@
		#include <utils/threads.h>
		#include <utils/List.h>
		#include <utils/Errors.h>
		#include <utils/String8.h>
		#include <pthread.h>

		struct dirent;
		@@ -29,6 +30,7 @@ namespace android {

		class MediaScannerClient;
		class StringArray;
		class CharacterEncodingDetector;

		enum MediaScanResult {
		// This file or directory was scanned successfully.
		@@ -94,15 +96,9 @@ public:
		virtual status_t setMimeType(const char* mimeType) = 0;

		protected:
		void convertValues(uint32_t encoding);

		protected:
		// cached name and value strings, for native encoding support.
		StringArray* mNames;
		StringArray* mValues;

		// default encoding based on MediaScanner::mLocale string
		uint32_t mLocaleEncoding;
		// default encoding from MediaScanner::mLocale
		String8 mLocale;
		CharacterEncodingDetector *mEncodingDetector;
		};

		}; // namespace android

media/libmedia/Android.mk

+3 −2

Original line number	Diff line number	Diff line
		@@ -44,7 +44,7 @@ LOCAL_SRC_FILES:= \
		IAudioPolicyService.cpp \
		MediaScanner.cpp \
		MediaScannerClient.cpp \
		autodetect.cpp \
		CharacterEncodingDetector.cpp \
		IMediaDeathNotifier.cpp \
		MediaProfiles.cpp \
		IEffect.cpp \
		@@ -65,7 +65,7 @@ LOCAL_CFLAGS += -DSINGLE_STATE_QUEUE_INSTANTIATIONS='"SingleStateQueueInstantiat
		# Consider a separate a library for SingleStateQueueInstantiations.

		LOCAL_SHARED_LIBRARIES := \
		libui liblog libcutils libutils libbinder libsonivox libicuuc libexpat \
		libui liblog libcutils libutils libbinder libsonivox libicuuc libicui18n libexpat \
		libcamera_client libstagefright_foundation \
		libgui libdl libaudioutils

		@@ -77,6 +77,7 @@ LOCAL_C_INCLUDES := \
		$(call include-path-for, graphics corecg) \
		$(TOP)/frameworks/native/include/media/openmax \
		external/icu4c/common \
		external/icu4c/i18n \
		$(call include-path-for, audio-effects) \
		$(call include-path-for, audio-utils)

media/libmedia/CharacterEncodingDetector.cpp

0 → 100644

+364 −0

Original line number	Diff line number	Diff line
		/*
		* Copyright (C) 2013 The Android Open Source Project
		*
		* Licensed under the Apache License, Version 2.0 (the "License");
		* you may not use this file except in compliance with the License.
		* You may obtain a copy of the License at
		*
		* http://www.apache.org/licenses/LICENSE-2.0
		*
		* Unless required by applicable law or agreed to in writing, software
		* distributed under the License is distributed on an "AS IS" BASIS,
		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		* See the License for the specific language governing permissions and
		* limitations under the License.
		*/

		//#define LOG_NDEBUG 0
		#define LOG_TAG "CharacterEncodingDector"
		#include <utils/Log.h>

		#include "CharacterEncodingDetector.h"
		#include "CharacterEncodingDetectorTables.h"

		#include "utils/Vector.h"
		#include "StringArray.h"

		#include "unicode/ucnv.h"
		#include "unicode/ucsdet.h"
		#include "unicode/ustring.h"

		namespace android {

		CharacterEncodingDetector::CharacterEncodingDetector() {

		UErrorCode status = U_ZERO_ERROR;
		mUtf8Conv = ucnv_open("UTF-8", &status);
		if (U_FAILURE(status)) {
		ALOGE("could not create UConverter for UTF-8");
		mUtf8Conv = NULL;
		}
		}

		CharacterEncodingDetector::~CharacterEncodingDetector() {
		ucnv_close(mUtf8Conv);
		}

		void CharacterEncodingDetector::addTag(const char name, const char value) {
		mNames.push_back(name);
		mValues.push_back(value);
		}

		size_t CharacterEncodingDetector::size() {
		return mNames.size();
		}

		status_t CharacterEncodingDetector::getTag(int index, const char name, const charvalue) {
		if (index >= mNames.size()) {
		return BAD_VALUE;
		}

		*name = mNames.getEntry(index);
		*value = mValues.getEntry(index);
		return OK;
		}

		static bool isPrintableAscii(const char *value, size_t len) {
		for (size_t i = 0; i < len; i++) {
		if ((value[i] & 0x80) \|\| value[i] < 0x20 \|\| value[i] == 0x7f) {
		return false;
		}
		}
		return true;
		}

		void CharacterEncodingDetector::detectAndConvert() {

		int size = mNames.size();
		ALOGV("%d tags before conversion", size);
		for (int i = 0; i < size; i++) {
		ALOGV("%s: %s", mNames.getEntry(i), mValues.getEntry(i));
		}

		if (size && mUtf8Conv) {

		UErrorCode status = U_ZERO_ERROR;
		UCharsetDetector *csd = ucsdet_open(&status);
		const UCharsetMatch *ucm;

		// try combined detection of artist/album/title etc.
		char buf[1024];
		buf[0] = 0;
		int idx;
		for (int i = 0; i < size; i++) {
		const char *name = mNames.getEntry(i);
		const char *value = mValues.getEntry(i);
		if (!isPrintableAscii(value, strlen(value)) && (
		!strcmp(name, "artist") \|\|
		!strcmp(name, "albumartist") \|\|
		!strcmp(name, "composer") \|\|
		!strcmp(name, "genre") \|\|
		!strcmp(name, "album") \|\|
		!strcmp(name, "title"))) {
		strlcat(buf, value, sizeof(buf));
		// separate tags by space so ICU's ngram detector can do its job
		strlcat(buf, " ", sizeof(buf));
		}
		}
		ucsdet_setText(csd, buf, strlen(buf), &status);

		int32_t matches;
		const UCharsetMatch** ucma = ucsdet_detectAll(csd, &matches, &status);
		const char *combinedenc = "???";

		const UCharsetMatch* bestCombinedMatch = getPreferred(buf, strlen(buf), ucma, matches);

		if (bestCombinedMatch != NULL) {
		combinedenc = ucsdet_getName(bestCombinedMatch, &status);
		}

		for (int i = 0; i < size; i++) {
		const char *name = mNames.getEntry(i);
		uint8_t* src = (uint8_t *)mValues.getEntry(i);
		int len = strlen((char *)src);
		uint8_t* dest = src;

		ALOGV("@@@ checking %s", name);
		const char *s = mValues.getEntry(i);
		int32_t inputLength = strlen(s);
		const char *enc;

		if (!strcmp(name, "artist") \|\|
		!strcmp(name, "albumartist") \|\|
		!strcmp(name, "composer") \|\|
		!strcmp(name, "genre") \|\|
		!strcmp(name, "album") \|\|
		!strcmp(name, "title")) {
		// use encoding determined from the combination of artist/album/title etc.
		enc = combinedenc;
		} else {
		ucsdet_setText(csd, s, inputLength, &status);
		ucm = ucsdet_detect(csd, &status);
		if (!ucm) {
		mValues.setEntry(i, "???");
		continue;
		}
		enc = ucsdet_getName(ucm, &status);
		ALOGV("@@@@ recognized charset: %s for %s confidence %d",
		enc, mNames.getEntry(i), ucsdet_getConfidence(ucm, &status));
		}

		if (strcmp(enc,"UTF-8") != 0) {
		// only convert if the source encoding isn't already UTF-8
		ALOGV("@@@ using converter %s for %s", enc, mNames.getEntry(i));
		UConverter *conv = ucnv_open(enc, &status);
		if (U_FAILURE(status)) {
		ALOGE("could not create UConverter for %s", enc);
		continue;
		}

		// convert from native encoding to UTF-8
		const char* source = mValues.getEntry(i);
		int targetLength = len * 3 + 1;
		char* buffer = new char[targetLength];
		// don't normally check for NULL, but in this case targetLength may be large
		if (!buffer)
		break;
		char* target = buffer;

		ucnv_convertEx(mUtf8Conv, conv, &target, target + targetLength,
		&source, source + strlen(source),
		NULL, NULL, NULL, NULL, TRUE, TRUE, &status);

		if (U_FAILURE(status)) {
		ALOGE("ucnv_convertEx failed: %d", status);
		mValues.setEntry(i, "???");
		} else {
		// zero terminate
		*target = 0;
		mValues.setEntry(i, buffer);
		}

		delete[] buffer;

		ucnv_close(conv);
		}
		}

		for (int i = size - 1; i >= 0; --i) {
		if (strlen(mValues.getEntry(i)) == 0) {
		ALOGV("erasing %s because entry is empty", mNames.getEntry(i));
		mNames.erase(i);
		mValues.erase(i);
		}
		}

		ucsdet_close(csd);
		}
		}

		/*
		* When ICU detects multiple encoding matches, apply additional heuristics to determine
		* which one is the best match, since ICU can't always be trusted to make the right choice.
		*
		* What this method does is:
		* - decode the input using each of the matches found
		* - recalculate the starting confidence level for multibyte encodings using a different
		* algorithm and larger frequent character lists than ICU
		* - devalue encoding where the conversion contains unlikely characters (symbols, reserved, etc)
		* - pick the highest match
		*/
		const UCharsetMatch *CharacterEncodingDetector::getPreferred(
		const char input, size_t len, const UCharsetMatch* ucma, size_t nummatches) {

		Vector<const UCharsetMatch*> matches;
		UErrorCode status = U_ZERO_ERROR;

		ALOGV("%d matches", nummatches);
		for (size_t i = 0; i < nummatches; i++) {
		const char *encname = ucsdet_getName(ucma[i], &status);
		int confidence = ucsdet_getConfidence(ucma[i], &status);
		ALOGV("%d: %s %d", i, encname, confidence);
		matches.push_back(ucma[i]);
		}

		size_t num = matches.size();
		if (num == 0) {
		return NULL;
		}
		if (num == 1) {
		return matches[0];
		}

		ALOGV("considering %d matches", num);

		// keep track of how many "special" characters result when converting the input using each
		// encoding
		Vector<int> newconfidence;
		for (size_t i = 0; i < num; i++) {
		const uint16_t *freqdata = NULL;
		float freqcoverage = 0;
		status = U_ZERO_ERROR;
		const char *encname = ucsdet_getName(matches[i], &status);
		int confidence = ucsdet_getConfidence(matches[i], &status);
		if (!strcmp("GB18030", encname)) {
		freqdata = frequent_zhCN;
		freqcoverage = frequent_zhCN_coverage;
		} else if (!strcmp("Big5", encname)) {
		freqdata = frequent_zhTW;
		freqcoverage = frequent_zhTW_coverage;
		} else if (!strcmp("EUC-KR", encname)) {
		freqdata = frequent_ko;
		freqcoverage = frequent_ko_coverage;
		} else if (!strcmp("EUC-JP", encname)) {
		freqdata = frequent_ja;
		freqcoverage = frequent_ja_coverage;
		} else if (!strcmp("Shift_JIS", encname)) {
		freqdata = frequent_ja;
		freqcoverage = frequent_ja_coverage;
		}

		ALOGV("%d: %s %d", i, encname, confidence);
		UConverter *conv = ucnv_open(encname, &status);
		const char *source = input;
		const char *sourceLimit = input + len;
		status = U_ZERO_ERROR;
		int demerit = 0;
		int frequentchars = 0;
		int totalchars = 0;
		while (true) {
		// demerit the current encoding for each "special" character found after conversion.
		// The amount of demerit is somewhat arbitrarily chosen.
		int inchar;
		if (source != sourceLimit) {
		inchar = (source[0] << 8) + source[1];
		}
		UChar32 c = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
		if (!U_SUCCESS(status)) {
		break;
		}
		if (c < 0x20 \|\| (c >= 0x7f && c <= 0x009f)) {
		ALOGV("control character %x", c);
		demerit += 100;
		} else if ((c >= 0xa0 && c <= 0xbe) // symbols, superscripts
		\|\| (c == 0xd7) \|\| (c == 0xf7) // multiplication and division signs
		\|\| (c >= 0x2000 && c <= 0x209f)) { // punctuation, superscripts
		ALOGV("unlikely character %x", c);
		demerit += 10;
		} else if (c >= 0xe000 && c <= 0xf8ff) {
		ALOGV("private use character %x", c);
		demerit += 30;
		} else if (c >= 0x2190 && c <= 0x2bff) {
		// this range comprises various symbol ranges that are unlikely to appear in
		// music file metadata.
		ALOGV("symbol %x", c);
		demerit += 10;
		} else if (c == 0xfffd) {
		ALOGV("replacement character");
		demerit += 50;
		} else if (c >= 0xfff0 && c <= 0xfffc) {
		ALOGV("unicode special %x", c);
		demerit += 50;
		} else if (freqdata != NULL) {
		totalchars++;
		if (isFrequent(freqdata, c)) {
		frequentchars++;
		}
		}
		}
		if (freqdata != NULL && totalchars != 0) {
		int myconfidence = 10 + float((100 * frequentchars) / totalchars) / freqcoverage;
		ALOGV("ICU confidence: %d, my confidence: %d (%d %d)", confidence, myconfidence,
		totalchars, frequentchars);
		if (myconfidence > 100) myconfidence = 100;
		if (myconfidence < 0) myconfidence = 0;
		confidence = myconfidence;
		}
		ALOGV("%d-%d=%d", confidence, demerit, confidence - demerit);
		newconfidence.push_back(confidence - demerit);
		ucnv_close(conv);
		if (i == 0 && (confidence - demerit) == 100) {
		// no need to check any further, we'll end up using this match anyway
		break;
		}
		}

		// find match with highest confidence after adjusting for unlikely characters
		int highest = newconfidence[0];
		size_t highestidx = 0;
		num = newconfidence.size();
		for (size_t i = 1; i < num; i++) {
		if (newconfidence[i] > highest) {
		highest = newconfidence[i];
		highestidx = i;
		}
		}
		status = U_ZERO_ERROR;
		ALOGV("selecting '%s' w/ %d confidence", ucsdet_getName(matches[highestidx], &status), highest);
		return matches[highestidx];
		}


		bool CharacterEncodingDetector::isFrequent(const uint16_t *values, uint32_t c) {

		int start = 0;
		int end = 511; // All the tables have 512 entries
		int mid = (start+end)/2;

		while(start <= end) {
		if(c == values[mid]) {
		return true;
		} else if (c > values[mid]) {
		start = mid + 1;
		} else {
		end = mid - 1;
		}

		mid = (start + end) / 2;
		}

		return false;
		}


		} // namespace android

media/libmedia/autodetect.h→media/libmedia/CharacterEncodingDetector.h

+61 −0

Original line number	Diff line number	Diff line
		/*
		* Copyright (C) 2008 The Android Open Source Project
		* Copyright (C) 2013 The Android Open Source Project
		*
		* Licensed under the Apache License, Version 2.0 (the "License");
		* you may not use this file except in compliance with the License.
		@@ -14,24 +14,48 @@
		* limitations under the License.
		*/

		#ifndef AUTODETECT_H
		#define AUTODETECT_H
		#ifndef _CHARACTER_ENCODING_DETECTOR_H
		#define _CHARACTER_ENCODING_DETECTOR_H

		#include <inttypes.h>
		#include <media/mediascanner.h>

		// flags used for native encoding detection
		enum {
		kEncodingNone = 0,
		kEncodingShiftJIS = (1 << 0),
		kEncodingGBK = (1 << 1),
		kEncodingBig5 = (1 << 2),
		kEncodingEUCKR = (1 << 3),
		#include "StringArray.h"

		kEncodingAll = (kEncodingShiftJIS \| kEncodingGBK \| kEncodingBig5 \| kEncodingEUCKR),
		#include "unicode/ucnv.h"
		#include "unicode/ucsdet.h"
		#include "unicode/ustring.h"

		namespace android {

		class CharacterEncodingDetector {

		public:
		CharacterEncodingDetector();
		~CharacterEncodingDetector();

		void addTag(const char name, const char value);
		size_t size();

		void detectAndConvert();
		status_t getTag(int index, const char name, const charvalue);

		private:
		const UCharsetMatch *getPreferred(
		const char input, size_t len, const UCharsetMatch* ucma, size_t matches);

		bool isFrequent(const uint16_t *values, uint32_t c);

		// cached name and value strings, for native encoding support.
		// TODO: replace these with byte blob arrays that don't require the data to be
		// singlenullbyte-terminated
		StringArray mNames;
		StringArray mValues;

		UConverter* mUtf8Conv;
		};


		// returns a bitfield containing the possible native encodings for the given character
		extern uint32_t findPossibleEncodings(int ch);

		#endif // AUTODETECT_H
		}; // namespace android

		#endif

media/libmedia/CharacterEncodingDetectorTables.h

0 → 100644

+2092 −0

File added.

Preview size limit exceeded, changes collapsed.