Skip to content

Commit

Permalink
opt: remove qtext codec from indexzip.cc (#2057)
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaoyifang authored Jan 3, 2025
1 parent 3d809b7 commit 24720a7
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 92 deletions.
11 changes: 11 additions & 0 deletions src/common/iconv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,14 @@ QString Iconv::toQString( char const * fromEncoding, void const * fromData, size
Iconv ic( fromEncoding );
return ic.convert( fromData, dataSize );
}
/// Picks the first usable encoding out of @p encodings.
///
/// Each candidate is probed, in list order, by asking iconv to open a
/// converter from that encoding to UTF-8. The probe descriptor is closed
/// again right away; no data is converted here.
///
/// @param encodings candidate encoding names, tried front to back.
/// @return the first name iconv_open() accepted, or an empty QString when
///         every candidate was rejected (or the list is empty).
QString Iconv::findValidEncoding( const QStringList & encodings )
{
  for ( auto candidate = encodings.cbegin(); candidate != encodings.cend(); ++candidate ) {
    // Keep the UTF-8 bytes alive for the duration of the iconv_open() call.
    const QByteArray sourceName = candidate->toUtf8();

    iconv_t probe = iconv_open( "UTF-8", sourceName.constData() );
    if ( probe == (iconv_t)-1 ) {
      // iconv does not know this encoding; move on to the next candidate.
      continue;
    }

    iconv_close( probe );
    return *candidate;
  }

  return QString();
}
4 changes: 3 additions & 1 deletion src/common/iconv.hh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "ex.hh"
#include "text.hh"
#include <QString>
#include <QStringList>
#include <iconv.h>

/// "Internationalization conversion" for char encoding conversion, currently implemented with iconv()
Expand Down Expand Up @@ -34,7 +35,8 @@ public:
static std::string toUtf8( char const * fromEncoding, std::u32string_view str );

static QString toQString( char const * fromEncoding, void const * fromData, size_t dataSize );

// Returns the first encoding in the given list that iconv can open for conversion to UTF-8, or an empty string if none can be opened.
static QString findValidEncoding( const QStringList & encodings );
// Copying/assigning isn't supported
Q_DISABLE_COPY_MOVE( Iconv );
};
106 changes: 15 additions & 91 deletions src/dict/utils/indexedzip.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
#include <zlib.h>
#include "text.hh"
#include "iconv.hh"
#include <QtCore5Compat/QTextCodec>

#include <QMutexLocker>

using namespace BtreeIndexing;
Expand Down Expand Up @@ -160,10 +158,6 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32
}

// File seems to be a valid zip file


QTextCodec * localeCodec = QTextCodec::codecForLocale();

ZipFile::CentralDirEntry entry;

bool alreadyCounted;
Expand All @@ -177,102 +171,32 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32
continue;
}

// Check if the file name has some non-ascii letters.

unsigned char const * ptr = (unsigned char const *)entry.fileName.constData();

bool hasNonAscii = false;

for ( ;; ) {
if ( *ptr & 0x80 ) {
hasNonAscii = true;
break;
}
else if ( !*ptr++ ) {
break;
}
}

alreadyCounted = false;

if ( !hasNonAscii ) {
// Add entry as is

if ( entry.fileNameInUTF8 ) {
zipFileNames.addSingleWord( Text::toUtf32( entry.fileName.data() ), entry.localHeaderOffset );
if ( filesCount ) {
*filesCount += 1;
}
}
else {
// Try assuming different encodings. Those are UTF8, system locale and two
// Russian ones (Windows and Windows OEM). Unfortunately, zip
// files do not say which encoding they utilize.

// Utf8
try {
std::u32string decoded = Text::toUtf32( entry.fileName.constData() );

zipFileNames.addSingleWord( decoded, entry.localHeaderOffset );
if ( filesCount != 0 && !alreadyCounted ) {
*filesCount += 1;
alreadyCounted = true;
//detect encoding.
auto encoding = Iconv::findValidEncoding( { "LOCAL", "IBM437", "CP866", "CP1251", "UTF-8" } );
if ( encoding.isEmpty() ) {
qWarning() << "Zip warning: failed to detect encoding -- skipping file" << entry.fileName.data();
continue;
}
}
catch ( Text::exCantDecode & ) {
// Failed to decode
}
std::u32string nameInSystemLocale =
Iconv::toWstring( encoding.toUtf8().constData(), entry.fileName.constData(), entry.fileName.size() );
if ( !nameInSystemLocale.empty() ) {
zipFileNames.addSingleWord( nameInSystemLocale, entry.localHeaderOffset );

if ( !entry.fileNameInUTF8 ) {
std::u32string nameInSystemLocale;

// System locale
if ( localeCodec ) {
QString name = localeCodec->toUnicode( entry.fileName.constData(), entry.fileName.size() );
nameInSystemLocale = name.toStdU32String();
if ( !nameInSystemLocale.empty() ) {
zipFileNames.addSingleWord( nameInSystemLocale, entry.localHeaderOffset );

if ( filesCount != 0 && !alreadyCounted ) {
*filesCount += 1;
alreadyCounted = true;
}
if ( filesCount != 0 ) {
*filesCount += 1;
}
}


// CP866
try {
std::u32string decoded = Iconv::toWstring( "CP866", entry.fileName.constData(), entry.fileName.size() );

if ( nameInSystemLocale != decoded ) {
zipFileNames.addSingleWord( decoded, entry.localHeaderOffset );

if ( filesCount != 0 && !alreadyCounted ) {
*filesCount += 1;
alreadyCounted = true;
}
}
}
catch ( Iconv::Ex & ) {
// Failed to decode
}

// CP1251
try {
std::u32string decoded = Iconv::toWstring( "CP1251", entry.fileName.constData(), entry.fileName.size() );

if ( nameInSystemLocale != decoded ) {
zipFileNames.addSingleWord( decoded, entry.localHeaderOffset );

if ( filesCount != 0 && !alreadyCounted ) {
*filesCount += 1;
alreadyCounted = true;
}
}
}
catch ( Iconv::Ex & ) {
// Failed to decode
}
}
catch ( Iconv::Ex & ) {
// Failed to decode
}
}
}
Expand Down

0 comments on commit 24720a7

Please sign in to comment.