Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

opt: remove qtext codec from indexzip.cc #2057

Merged
merged 7 commits into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions src/common/iconv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,14 @@ QString Iconv::toQString( char const * fromEncoding, void const * fromData, size
Iconv ic( fromEncoding );
return ic.convert( fromData, dataSize );
}
/// Probes each name in `encodings`, in order, by asking iconv to open a
/// converter from that encoding to UTF-8. Returns the first name for which
/// iconv_open() succeeds; returns a default-constructed QString when none
/// of the candidates can be opened (or the list is empty).
QString Iconv::findValidEncoding( const QStringList & encodings )
{
  // iconv_open() reports failure with this sentinel value.
  const iconv_t openFailed = (iconv_t)-1;

  for ( auto candidate = encodings.cbegin(); candidate != encodings.cend(); ++candidate ) {
    // Keep the UTF-8 bytes alive in a named local for the duration of the call.
    const auto sourceName = candidate->toUtf8();
    const iconv_t probe   = iconv_open( "UTF-8", sourceName.constData() );

    if ( probe == openFailed ) {
      continue; // iconv rejected this encoding name; try the next candidate
    }

    // The descriptor was only needed to test availability; release it at once.
    iconv_close( probe );
    return *candidate;
  }

  return QString();
}
4 changes: 3 additions & 1 deletion src/common/iconv.hh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "ex.hh"
#include "text.hh"
#include <QString>
#include <QStringList>
#include <iconv.h>

/// "Internationalization conversion" for char encoding conversion, currently implemented with iconv()
Expand Down Expand Up @@ -34,7 +35,8 @@ public:
static std::string toUtf8( char const * fromEncoding, std::u32string_view str );

static QString toQString( char const * fromEncoding, void const * fromData, size_t dataSize );

/// Returns the first entry of the given list that iconv_open() accepts as a
/// source encoding for conversion to UTF-8, or an empty QString if none does.
static QString findValidEncoding( const QStringList & encodings );
// Copying/assigning isn't supported
Q_DISABLE_COPY_MOVE( Iconv );
};
106 changes: 15 additions & 91 deletions src/dict/utils/indexedzip.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
#include <zlib.h>
#include "text.hh"
#include "iconv.hh"
#include <QtCore5Compat/QTextCodec>

#include <QMutexLocker>

using namespace BtreeIndexing;
Expand Down Expand Up @@ -160,10 +158,6 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32
}

// File seems to be a valid zip file


QTextCodec * localeCodec = QTextCodec::codecForLocale();

ZipFile::CentralDirEntry entry;

bool alreadyCounted;
Expand All @@ -177,102 +171,32 @@ bool IndexedZip::indexFile( BtreeIndexing::IndexedWords & zipFileNames, quint32
continue;
}

// Check if the file name has some non-ascii letters.

unsigned char const * ptr = (unsigned char const *)entry.fileName.constData();

bool hasNonAscii = false;

for ( ;; ) {
if ( *ptr & 0x80 ) {
hasNonAscii = true;
break;
}
else if ( !*ptr++ ) {
break;
}
}

alreadyCounted = false;

if ( !hasNonAscii ) {
// Add entry as is

if ( entry.fileNameInUTF8 ) {
zipFileNames.addSingleWord( Text::toUtf32( entry.fileName.data() ), entry.localHeaderOffset );
if ( filesCount ) {
*filesCount += 1;
}
}
else {
// Try assuming different encodings. Those are UTF8, system locale and two
// Russian ones (Windows and Windows OEM). Unfortunately, zip
// files do not say which encoding they utilize.

// Utf8
try {
std::u32string decoded = Text::toUtf32( entry.fileName.constData() );

zipFileNames.addSingleWord( decoded, entry.localHeaderOffset );
if ( filesCount != 0 && !alreadyCounted ) {
*filesCount += 1;
alreadyCounted = true;
//detect encoding.
auto encoding = Iconv::findValidEncoding( { "LOCAL", "IBM437", "CP866", "CP1251", "UTF-8" } );
if ( encoding.isEmpty() ) {
qWarning() << "Zip warning: failed to detect encoding -- skipping file" << entry.fileName.data();
continue;
}
}
catch ( Text::exCantDecode & ) {
// Failed to decode
}
std::u32string nameInSystemLocale =
Iconv::toWstring( encoding.toUtf8().constData(), entry.fileName.constData(), entry.fileName.size() );
if ( !nameInSystemLocale.empty() ) {
zipFileNames.addSingleWord( nameInSystemLocale, entry.localHeaderOffset );

if ( !entry.fileNameInUTF8 ) {
std::u32string nameInSystemLocale;

// System locale
if ( localeCodec ) {
QString name = localeCodec->toUnicode( entry.fileName.constData(), entry.fileName.size() );
nameInSystemLocale = name.toStdU32String();
if ( !nameInSystemLocale.empty() ) {
zipFileNames.addSingleWord( nameInSystemLocale, entry.localHeaderOffset );

if ( filesCount != 0 && !alreadyCounted ) {
*filesCount += 1;
alreadyCounted = true;
}
if ( filesCount != 0 ) {
*filesCount += 1;
}
}


// CP866
try {
std::u32string decoded = Iconv::toWstring( "CP866", entry.fileName.constData(), entry.fileName.size() );

if ( nameInSystemLocale != decoded ) {
zipFileNames.addSingleWord( decoded, entry.localHeaderOffset );

if ( filesCount != 0 && !alreadyCounted ) {
*filesCount += 1;
alreadyCounted = true;
}
}
}
catch ( Iconv::Ex & ) {
// Failed to decode
}

// CP1251
try {
std::u32string decoded = Iconv::toWstring( "CP1251", entry.fileName.constData(), entry.fileName.size() );

if ( nameInSystemLocale != decoded ) {
zipFileNames.addSingleWord( decoded, entry.localHeaderOffset );

if ( filesCount != 0 && !alreadyCounted ) {
*filesCount += 1;
alreadyCounted = true;
}
}
}
catch ( Iconv::Ex & ) {
// Failed to decode
}
}
catch ( Iconv::Ex & ) {
// Failed to decode
}
}
}
Expand Down
Loading