diff --git a/.gitignore b/.gitignore
index 9739bee..fe9b640 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ dist
 *.pyc
 *.egg-info
 *.so
+*.patch~
 *.pyd
 RELEASE-VERSION
 /.pc
diff --git a/README.md b/README.md
index 3e2bf36..3d1a913 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 # Python bindings for LZMA
-* **PyLZMA** Copyright (C) 2004-2015 Joachim Bauch
-* **7-Zip** Copyright (C) 1999-2010 Igor Pavlov
-* **LZMA SDK** Copyright (C) 1999-2010 Igor Pavlov
+* **PyLZMA** Copyright (C) 2004-2025 Joachim Bauch
+* **7-Zip** Copyright (C) 1999-2025 Igor Pavlov
+* **LZMA SDK** Copyright (C) 1999-2025 Igor Pavlov
 [![Linux Build Status](https://github.com/fancycode/pylzma/workflows/test/badge.svg)](https://github.com/fancycode/pylzma/actions)
 [![Windows Build Status](https://ci.appveyor.com/api/projects/status/5a7k7v9k2a0eiuom/branch/master?svg=true
diff --git a/patches/series b/patches/series
index 1502ea8..6bcb25c 100644
--- a/patches/series
+++ b/patches/series
@@ -1,2 +1 @@
 streaming_encoder.patch
-strict_prototype.patch
diff --git a/patches/streaming_encoder.patch b/patches/streaming_encoder.patch
index 5dbeec8..2e1cedf 100644
--- a/patches/streaming_encoder.patch
+++ b/patches/streaming_encoder.patch
@@ -2,10 +2,11 @@ Index: pylzma/src/sdk/C/LzmaEnc.c
 ===================================================================
 --- pylzma.orig/src/sdk/C/LzmaEnc.c
 +++ pylzma/src/sdk/C/LzmaEnc.c
-@@ -2259,8 +2259,9 @@ void LzmaEnc_Destroy(CLzmaEncHandle p, I
+@@ -2385,9 +2385,9 @@ void LzmaEnc_Destroy(CLzmaEncHandle p, I
  }
+-Z7_NO_INLINE
 -static SRes LzmaEnc_CodeOneBlock(CLzmaEnc *p, UInt32 maxPackSize, UInt32 maxUnpackSize)
 +SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpackSize)
  {
@@ -13,36 +14,39 @@ Index: pylzma/src/sdk/C/LzmaEnc.c
   UInt32 nowPos32, startPos32;
   if (p->needInit)
   {
-@@ -2715,7 +2716,7 @@ static SRes LzmaEnc_AllocAndInit(CLzmaEn
+@@ -2872,7 +2872,7 @@ static SRes LzmaEnc_AllocAndInit(CLzmaEn
   return SZ_OK;
  }
--static SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream,
-+SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream,
+-static SRes LzmaEnc_Prepare(CLzmaEncHandle p,
++SRes LzmaEnc_Prepare(CLzmaEncHandle p,
+   ISeqOutStreamPtr outStream,
+   ISeqInStreamPtr inStream,
   ISzAllocPtr alloc, ISzAllocPtr allocBig)
- {
-   CLzmaEnc *p = (CLzmaEnc *)pp;
-@@ -2974,3 +2975,9 @@ SRes LzmaEncode(Byte *dest, SizeT *destL
-   LzmaEnc_Destroy(p, alloc, allocBig);
-   return res;
+@@ -2883,6 +2883,12 @@ static SRes LzmaEnc_Prepare(CLzmaEncHand
+   return LzmaEnc_AllocAndInit(p, 0, alloc, allocBig);
  }
 +BoolInt LzmaEnc_IsFinished(CLzmaEncHandle pp)
 +{
 +  CLzmaEnc *p = (CLzmaEnc *)pp;
 +  return p->finished;
 +}
++
+ SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle p,
+   ISeqInStreamPtr inStream, UInt32 keepWindowSize,
+   ISzAllocPtr alloc, ISzAllocPtr allocBig)
 Index: pylzma/src/sdk/C/LzmaEnc.h
 ===================================================================
 --- pylzma.orig/src/sdk/C/LzmaEnc.h
 +++ pylzma/src/sdk/C/LzmaEnc.h
-@@ -73,4 +73,11 @@ SRes LzmaEncode(Byte *dest, SizeT *destL
+@@ -82,4 +82,11 @@ SRes LzmaEncode(Byte *dest, SizeT *destL
 EXTERN_C_END
 +/* ---------- Streaming Interface ---------- */
 +
-+SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, ISzAllocPtr alloc, ISzAllocPtr allocBig);
++SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, ISzAllocPtr alloc, ISzAllocPtr allocBig);
 +SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpackSize);
 +BoolInt LzmaEnc_IsFinished(CLzmaEncHandle pp);
 +void LzmaEnc_Finish(CLzmaEncHandle pp);
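The streaming hooks this patch keeps exporting from the SDK (`LzmaEnc_Prepare`, `LzmaEnc_CodeOneBlock`, `LzmaEnc_IsFinished`, `LzmaEnc_Finish`) are what pylzma's incremental compression is built on. Below is a minimal sketch of how a caller could drive them against the updated headers; it is not part of this diff. The `compress_stream` helper, the example dictionary size and the caller-supplied `ISeqInStreamPtr`/`ISeqOutStreamPtr` implementations are assumptions for illustration, and error handling is reduced to early exits.

```c
/* Illustrative sketch only (not part of this diff): drives the functions
 * exported by streaming_encoder.patch. The caller is assumed to supply
 * working ISeqInStreamPtr/ISeqOutStreamPtr implementations. */
#include "LzmaEnc.h"

static SRes compress_stream(ISeqInStreamPtr inStream, ISeqOutStreamPtr outStream,
                            ISzAllocPtr alloc, ISzAllocPtr allocBig)
{
    CLzmaEncProps props;
    SRes res;
    CLzmaEncHandle enc = LzmaEnc_Create(alloc);
    if (!enc)
        return SZ_ERROR_MEM;

    LzmaEncProps_Init(&props);
    props.dictSize = 1 << 20;               /* example: 1 MiB dictionary */
    res = LzmaEnc_SetProps(enc, &props);

    if (res == SZ_OK)
        /* bind the encoder to the streams without encoding anything yet */
        res = LzmaEnc_Prepare(enc, outStream, inStream, alloc, allocBig);

    /* 0, 0 = no per-block pack/unpack limits, as in the SDK's own encode loop */
    while (res == SZ_OK && !LzmaEnc_IsFinished(enc))
        res = LzmaEnc_CodeOneBlock(enc, 0, 0);

    LzmaEnc_Destroy(enc, alloc, allocBig);
    return res;
}
```

In real use the 5-byte LZMA properties header would normally also be written out via `LzmaEnc_WriteProperties()` before the first coded block.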
diff --git a/patches/strict_prototype.patch b/patches/strict_prototype.patch
deleted file mode 100644
index b31c5a7..0000000
--- a/patches/strict_prototype.patch
+++ /dev/null
@@ -1,17 +0,0 @@
-Index: pylzma/src/sdk/C/CpuArch.h
-===================================================================
---- pylzma.orig/src/sdk/C/CpuArch.h
-+++ pylzma/src/sdk/C/CpuArch.h
-@@ -325,9 +325,9 @@ int x86cpuid_GetFirm(const Cx86cpuid *p)
- #define x86cpuid_GetModel(ver) (((ver >> 12) & 0xF0) | ((ver >> 4) & 0xF))
- #define x86cpuid_GetStepping(ver) (ver & 0xF)
-
--BoolInt CPU_Is_InOrder();
--BoolInt CPU_Is_Aes_Supported();
--BoolInt CPU_IsSupported_PageGB();
-+BoolInt CPU_Is_InOrder(void);
-+BoolInt CPU_Is_Aes_Supported(void);
-+BoolInt CPU_IsSupported_PageGB(void);
-
- #endif
-
diff --git a/setup.py b/setup.py
index 4cc4d10..549efc2 100644
--- a/setup.py
+++ b/setup.py
@@ -85,11 +85,6 @@ def __new__(cls, s):
 library_dirs = []
-# platforms that multithreaded compression is supported on
-mt_platforms = (
-    'win32',
-)
-
 if IS_WINDOWS:
     # don't try to import MSVC compiler on non-windows platforms
     # as this triggers unnecessary warnings
@@ -107,14 +102,7 @@ class MSVCCompiler(object):
 class build_ext(_build_ext):
     def build_extension(self, ext):
-        self.with_mt = ENABLE_MULTITHREADING
-        if self.with_mt and not sys.platform in mt_platforms:
-            warn("""\
-Multithreading is not supported on the platform "%s",
-please contact mail@joachim-bauch.de for more informations.""" % (sys.platform), UnsupportedPlatformWarning)
-            self.with_mt = False
-
-        if self.with_mt:
+        if ENABLE_MULTITHREADING:
             log.info('adding support for multithreaded compression')
             ext.define_macros.append(('COMPRESS_MF_MT', 1))
             ext.sources += (
@@ -124,7 +112,7 @@ def build_extension(self, ext):
                 'src/sdk/C/Threads.c',
             )
         else:
-            ext.define_macros.append(('_7ZIP_ST', 1))
+            ext.define_macros.append(('Z7_ST', 1))
         if isinstance(self.compiler, MSVCCompiler) or getattr(self.compiler, 'compiler_type', '') == 'msvc':
             # set flags only available when using MSVC
@@ -163,6 +151,7 @@ def build_extension(self, ext):
             ('PY_SSIZE_T_CLEAN', 1),
         ]
         lzma_files = (
+            'src/sdk/C/7zStream.c',
             'src/sdk/C/Aes.c',
             'src/sdk/C/AesOpt.c',
             'src/sdk/C/Bcj2.c',
@@ -172,11 +161,14 @@ def build_extension(self, ext):
             'src/sdk/C/CpuArch.c',
             'src/sdk/C/Delta.c',
             'src/sdk/C/LzFind.c',
+            'src/sdk/C/LzFindOpt.c',
             'src/sdk/C/LzmaDec.c',
             'src/sdk/C/LzmaEnc.c',
             'src/sdk/C/Lzma2Dec.c',
             'src/sdk/C/Lzma2Enc.c',
             'src/sdk/C/Sha256.c',
+            'src/sdk/C/Sha256Opt.c',
+            'src/sdk/C/SwapBytes.c',
             'src/sdk/C/Ppmd7.c',
             'src/sdk/C/Ppmd7Dec.c',
         )
diff --git a/src/pylzma/pylzma.c b/src/pylzma/pylzma.c
index 0a00c55..978044b 100644
--- a/src/pylzma/pylzma.c
+++ b/src/pylzma/pylzma.c
@@ -130,7 +130,7 @@ pylzma_calculate_key(PyObject *self, PyObject *args, PyObject *kwargs)
 const char
 doc_bcj_x86_convert[] = \
-    "bcj_x86_convert(data) -- Perform BCJ x86 conversion.";
+    "bcj_x86_convert(data, [encoding]) -- Perform BCJ x86 conversion.";
 static PyObject *
 pylzma_bcj_x86_convert(PyObject *self, PyObject *args)
@@ -150,10 +150,13 @@ pylzma_bcj_x86_convert(PyObject *self, PyObject *args)
     result = PyBytes_FromStringAndSize(data, length);
     if (result != NULL) {
-        UInt32 state;
+        UInt32 state = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL;
         Py_BEGIN_ALLOW_THREADS
-        x86_Convert_Init(state);
-        x86_Convert((Byte *) PyBytes_AS_STRING(result), length, 0, &state,
encoding); + if (encoding) { + Z7_BRANCH_CONV_ST_ENC(X86)((Byte *) PyBytes_AS_STRING(result), length, 0, &state); + } else { + Z7_BRANCH_CONV_ST_DEC(X86)((Byte *) PyBytes_AS_STRING(result), length, 0, &state); + } Py_END_ALLOW_THREADS } @@ -163,7 +166,7 @@ pylzma_bcj_x86_convert(PyObject *self, PyObject *args) #define DEFINE_BCJ_CONVERTER(id, name) \ const char \ doc_bcj_##id##_convert[] = \ - "bcj_" #id "_convert(data) -- Perform BCJ " #name " conversion."; \ + "bcj_" #id "_convert(data, [encoding]) -- Perform BCJ " #name " conversion."; \ \ static PyObject * \ pylzma_bcj_##id##_convert(PyObject *self, PyObject *args) \ @@ -184,7 +187,11 @@ pylzma_bcj_##id##_convert(PyObject *self, PyObject *args) \ result = PyBytes_FromStringAndSize(data, length); \ if (result != NULL) { \ Py_BEGIN_ALLOW_THREADS \ - name##_Convert((Byte *) PyBytes_AS_STRING(result), length, 0, encoding); \ + if (encoding) { \ + Z7_BRANCH_CONV_ENC(name)((Byte *) PyBytes_AS_STRING(result), length, 0); \ + } else { \ + Z7_BRANCH_CONV_DEC(name)((Byte *) PyBytes_AS_STRING(result), length, 0); \ + } \ Py_END_ALLOW_THREADS \ } \ \ @@ -193,7 +200,9 @@ pylzma_bcj_##id##_convert(PyObject *self, PyObject *args) \ DEFINE_BCJ_CONVERTER(arm, ARM); DEFINE_BCJ_CONVERTER(armt, ARMT); +DEFINE_BCJ_CONVERTER(arm64, ARM64); DEFINE_BCJ_CONVERTER(ppc, PPC); +DEFINE_BCJ_CONVERTER(riscv, RISCV); DEFINE_BCJ_CONVERTER(sparc, SPARC); DEFINE_BCJ_CONVERTER(ia64, IA64); @@ -259,7 +268,7 @@ pylzma_bcj2_decode(PyObject *self, PyObject *args) } // Conversion must be finished and the output buffer filled completely. - if (!Bcj2Dec_IsFinished(&dec)) { + if (!Bcj2Dec_IsMaybeFinished(&dec)) { goto error; } else if (dec.dest != dec.destLim || dec.state != BCJ2_STREAM_MAIN) { goto error; @@ -370,7 +379,7 @@ typedef struct static Byte ReadByte(const IByteIn *pp) { - CByteInToLook *p = CONTAINER_FROM_VTBL(pp, CByteInToLook, vt); + CByteInToLook *p = Z7_CONTAINER_FROM_VTBL(pp, CByteInToLook, vt); if (p->cur != p->end) { return *p->cur++; } @@ -404,7 +413,6 @@ pylzma_ppmd_decompress(PyObject *self, PyObject *args) unsigned order; UInt32 memSize; CPpmd7 ppmd; - CPpmd7z_RangeDec rc; CByteInToLook s; SRes res = SZ_OK; CMemoryLookInStream stream; @@ -446,22 +454,21 @@ pylzma_ppmd_decompress(PyObject *self, PyObject *args) CreateMemoryLookInStream(&stream, (Byte*) data, length); tmp = (Byte *) PyBytes_AS_STRING(result); Py_BEGIN_ALLOW_THREADS - Ppmd7z_RangeDec_CreateVTable(&rc); s.vt.Read = ReadByte; s.inStream = &stream.s; s.begin = s.end = s.cur = NULL; s.extra = False; s.res = SZ_OK; s.processed = 0; - rc.Stream = &s.vt; - if (!Ppmd7z_RangeDec_Init(&rc)) { + ppmd.rc.dec.Stream = &s.vt; + if (!Ppmd7z_RangeDec_Init(&ppmd.rc.dec)) { res = SZ_ERROR_DATA; } else if (s.extra) { res = (s.res != SZ_OK ? s.res : SZ_ERROR_DATA); } else { SizeT i; for (i = 0; i < outsize; i++) { - int sym = Ppmd7_DecodeSymbol(&ppmd, &rc.vt); + int sym = Ppmd7z_DecodeSymbol(&ppmd); if (s.extra || sym < 0) { break; } @@ -469,7 +476,7 @@ pylzma_ppmd_decompress(PyObject *self, PyObject *args) } if (i != outsize) { res = (s.res != SZ_OK ? 
s.res : SZ_ERROR_DATA); - } else if (s.processed + (s.cur - s.begin) != (UInt64)length || !Ppmd7z_RangeDec_IsFinishedOK(&rc)) { + } else if (s.processed + (s.cur - s.begin) != (UInt64)length || !Ppmd7z_RangeDec_IsFinishedOK(&ppmd.rc.dec)) { res = SZ_ERROR_DATA; } } @@ -498,7 +505,9 @@ methods[] = { {"bcj_x86_convert", (PyCFunction)pylzma_bcj_x86_convert, METH_VARARGS, (char *)&doc_bcj_x86_convert}, {"bcj_arm_convert", (PyCFunction)pylzma_bcj_arm_convert, METH_VARARGS, (char *)&doc_bcj_arm_convert}, {"bcj_armt_convert", (PyCFunction)pylzma_bcj_armt_convert, METH_VARARGS, (char *)&doc_bcj_armt_convert}, + {"bcj_arm64_convert", (PyCFunction)pylzma_bcj_arm64_convert, METH_VARARGS, (char *)&doc_bcj_arm64_convert}, {"bcj_ppc_convert", (PyCFunction)pylzma_bcj_ppc_convert, METH_VARARGS, (char *)&doc_bcj_ppc_convert}, + {"bcj_riscv_convert", (PyCFunction)pylzma_bcj_riscv_convert, METH_VARARGS, (char *)&doc_bcj_riscv_convert}, {"bcj_sparc_convert", (PyCFunction)pylzma_bcj_sparc_convert, METH_VARARGS, (char *)&doc_bcj_sparc_convert}, {"bcj_ia64_convert", (PyCFunction)pylzma_bcj_ia64_convert, METH_VARARGS, (char *)&doc_bcj_ia64_convert}, {"bcj2_decode", (PyCFunction)pylzma_bcj2_decode, METH_VARARGS, (char *)&doc_bcj2_decode}, diff --git a/src/sdk/C/7z.h b/src/sdk/C/7z.h index 6c7886e..9e27c01 100644 --- a/src/sdk/C/7z.h +++ b/src/sdk/C/7z.h @@ -1,8 +1,8 @@ /* 7z.h -- 7z interface -2017-04-03 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ -#ifndef __7Z_H -#define __7Z_H +#ifndef ZIP7_INC_7Z_H +#define ZIP7_INC_7Z_H #include "7zTypes.h" @@ -91,12 +91,14 @@ typedef struct UInt64 *CoderUnpackSizes; // for all coders in all folders Byte *CodersData; + + UInt64 RangeLimit; } CSzAr; UInt64 SzAr_GetFolderUnpackSize(const CSzAr *p, UInt32 folderIndex); SRes SzAr_DecodeFolder(const CSzAr *p, UInt32 folderIndex, - ILookInStream *stream, UInt64 startPos, + ILookInStreamPtr stream, UInt64 startPos, Byte *outBuffer, size_t outSize, ISzAllocPtr allocMain); @@ -172,7 +174,7 @@ UInt16 *SzArEx_GetFullNameUtf16_Back(const CSzArEx *p, size_t fileIndex, UInt16 SRes SzArEx_Extract( const CSzArEx *db, - ILookInStream *inStream, + ILookInStreamPtr inStream, UInt32 fileIndex, /* index of file */ UInt32 *blockIndex, /* index of solid block */ Byte **outBuffer, /* pointer to pointer to output buffer (allocated with allocMain) */ @@ -194,7 +196,7 @@ SZ_ERROR_INPUT_EOF SZ_ERROR_FAIL */ -SRes SzArEx_Open(CSzArEx *p, ILookInStream *inStream, +SRes SzArEx_Open(CSzArEx *p, ILookInStreamPtr inStream, ISzAllocPtr allocMain, ISzAllocPtr allocTemp); EXTERN_C_END diff --git a/src/sdk/C/7zAlloc.c b/src/sdk/C/7zAlloc.c index c924a52..2f0659a 100644 --- a/src/sdk/C/7zAlloc.c +++ b/src/sdk/C/7zAlloc.c @@ -1,5 +1,5 @@ -/* 7zAlloc.c -- Allocation functions -2017-04-03 : Igor Pavlov : Public domain */ +/* 7zAlloc.c -- Allocation functions for 7z processing +2023-03-04 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -7,74 +7,83 @@ #include "7zAlloc.h" -/* #define _SZ_ALLOC_DEBUG */ -/* use _SZ_ALLOC_DEBUG to debug alloc/free operations */ +/* #define SZ_ALLOC_DEBUG */ +/* use SZ_ALLOC_DEBUG to debug alloc/free operations */ -#ifdef _SZ_ALLOC_DEBUG +#ifdef SZ_ALLOC_DEBUG +/* #ifdef _WIN32 -#include +#include "7zWindows.h" #endif +*/ #include -int g_allocCount = 0; -int g_allocCountTemp = 0; +static int g_allocCount = 0; +static int g_allocCountTemp = 0; +static void Print_Alloc(const char *s, size_t size, int *counter) +{ + const unsigned size2 = (unsigned)size; + fprintf(stderr, "\n%s count = %10d : %10u 
bytes; ", s, *counter, size2); + (*counter)++; +} +static void Print_Free(const char *s, int *counter) +{ + (*counter)--; + fprintf(stderr, "\n%s count = %10d", s, *counter); +} #endif void *SzAlloc(ISzAllocPtr p, size_t size) { - UNUSED_VAR(p); + UNUSED_VAR(p) if (size == 0) return 0; - #ifdef _SZ_ALLOC_DEBUG - fprintf(stderr, "\nAlloc %10u bytes; count = %10d", (unsigned)size, g_allocCount); - g_allocCount++; + #ifdef SZ_ALLOC_DEBUG + Print_Alloc("Alloc", size, &g_allocCount); #endif return malloc(size); } void SzFree(ISzAllocPtr p, void *address) { - UNUSED_VAR(p); - #ifdef _SZ_ALLOC_DEBUG - if (address != 0) - { - g_allocCount--; - fprintf(stderr, "\nFree; count = %10d", g_allocCount); - } + UNUSED_VAR(p) + #ifdef SZ_ALLOC_DEBUG + if (address) + Print_Free("Free ", &g_allocCount); #endif free(address); } void *SzAllocTemp(ISzAllocPtr p, size_t size) { - UNUSED_VAR(p); + UNUSED_VAR(p) if (size == 0) return 0; - #ifdef _SZ_ALLOC_DEBUG - fprintf(stderr, "\nAlloc_temp %10u bytes; count = %10d", (unsigned)size, g_allocCountTemp); - g_allocCountTemp++; + #ifdef SZ_ALLOC_DEBUG + Print_Alloc("Alloc_temp", size, &g_allocCountTemp); + /* #ifdef _WIN32 return HeapAlloc(GetProcessHeap(), 0, size); #endif + */ #endif return malloc(size); } void SzFreeTemp(ISzAllocPtr p, void *address) { - UNUSED_VAR(p); - #ifdef _SZ_ALLOC_DEBUG - if (address != 0) - { - g_allocCountTemp--; - fprintf(stderr, "\nFree_temp; count = %10d", g_allocCountTemp); - } + UNUSED_VAR(p) + #ifdef SZ_ALLOC_DEBUG + if (address) + Print_Free("Free_temp ", &g_allocCountTemp); + /* #ifdef _WIN32 HeapFree(GetProcessHeap(), 0, address); return; #endif + */ #endif free(address); } diff --git a/src/sdk/C/7zAlloc.h b/src/sdk/C/7zAlloc.h index 44778f9..b2b8b0c 100644 --- a/src/sdk/C/7zAlloc.h +++ b/src/sdk/C/7zAlloc.h @@ -1,8 +1,8 @@ /* 7zAlloc.h -- Allocation functions -2017-04-03 : Igor Pavlov : Public domain */ +2023-03-04 : Igor Pavlov : Public domain */ -#ifndef __7Z_ALLOC_H -#define __7Z_ALLOC_H +#ifndef ZIP7_INC_7Z_ALLOC_H +#define ZIP7_INC_7Z_ALLOC_H #include "7zTypes.h" diff --git a/src/sdk/C/7zArcIn.c b/src/sdk/C/7zArcIn.c index f74d0fa..23f2949 100644 --- a/src/sdk/C/7zArcIn.c +++ b/src/sdk/C/7zArcIn.c @@ -1,5 +1,5 @@ /* 7zArcIn.c -- 7z Input functions -2018-12-31 : Igor Pavlov : Public domain */ +2023-09-07 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -10,10 +10,11 @@ #include "7zCrc.h" #include "CpuArch.h" -#define MY_ALLOC(T, p, size, alloc) { \ - if ((p = (T *)ISzAlloc_Alloc(alloc, (size) * sizeof(T))) == NULL) return SZ_ERROR_MEM; } +#define MY_ALLOC(T, p, size, alloc) \ + { if ((p = (T *)ISzAlloc_Alloc(alloc, (size) * sizeof(T))) == NULL) return SZ_ERROR_MEM; } -#define MY_ALLOC_ZE(T, p, size, alloc) { if ((size) == 0) p = NULL; else MY_ALLOC(T, p, size, alloc) } +#define MY_ALLOC_ZE(T, p, size, alloc) \ + { if ((size) == 0) p = NULL; else MY_ALLOC(T, p, size, alloc) } #define MY_ALLOC_AND_CPY(to, size, from, alloc) \ { MY_ALLOC(Byte, to, size, alloc); memcpy(to, from, size); } @@ -58,7 +59,7 @@ enum EIdEnum const Byte k7zSignature[k7zSignatureSize] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C}; -#define SzBitUi32s_Init(p) { (p)->Defs = NULL; (p)->Vals = NULL; } +#define SzBitUi32s_INIT(p) { (p)->Defs = NULL; (p)->Vals = NULL; } static SRes SzBitUi32s_Alloc(CSzBitUi32s *p, size_t num, ISzAllocPtr alloc) { @@ -69,21 +70,21 @@ static SRes SzBitUi32s_Alloc(CSzBitUi32s *p, size_t num, ISzAllocPtr alloc) } else { - MY_ALLOC(Byte, p->Defs, (num + 7) >> 3, alloc); - MY_ALLOC(UInt32, p->Vals, num, alloc); + MY_ALLOC(Byte, 
p->Defs, (num + 7) >> 3, alloc) + MY_ALLOC(UInt32, p->Vals, num, alloc) } return SZ_OK; } -void SzBitUi32s_Free(CSzBitUi32s *p, ISzAllocPtr alloc) +static void SzBitUi32s_Free(CSzBitUi32s *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->Defs); p->Defs = NULL; ISzAlloc_Free(alloc, p->Vals); p->Vals = NULL; } -#define SzBitUi64s_Init(p) { (p)->Defs = NULL; (p)->Vals = NULL; } +#define SzBitUi64s_INIT(p) { (p)->Defs = NULL; (p)->Vals = NULL; } -void SzBitUi64s_Free(CSzBitUi64s *p, ISzAllocPtr alloc) +static void SzBitUi64s_Free(CSzBitUi64s *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->Defs); p->Defs = NULL; ISzAlloc_Free(alloc, p->Vals); p->Vals = NULL; @@ -96,7 +97,7 @@ static void SzAr_Init(CSzAr *p) p->NumFolders = 0; p->PackPositions = NULL; - SzBitUi32s_Init(&p->FolderCRCs); + SzBitUi32s_INIT(&p->FolderCRCs) p->FoCodersOffsets = NULL; p->FoStartPackStreamIndex = NULL; @@ -105,6 +106,8 @@ static void SzAr_Init(CSzAr *p) p->CoderUnpackSizes = NULL; p->CodersData = NULL; + + p->RangeLimit = 0; } static void SzAr_Free(CSzAr *p, ISzAllocPtr alloc) @@ -140,11 +143,11 @@ void SzArEx_Init(CSzArEx *p) p->FileNameOffsets = NULL; p->FileNames = NULL; - SzBitUi32s_Init(&p->CRCs); - SzBitUi32s_Init(&p->Attribs); - // SzBitUi32s_Init(&p->Parents); - SzBitUi64s_Init(&p->MTime); - SzBitUi64s_Init(&p->CTime); + SzBitUi32s_INIT(&p->CRCs) + SzBitUi32s_INIT(&p->Attribs) + // SzBitUi32s_INIT(&p->Parents) + SzBitUi64s_INIT(&p->MTime) + SzBitUi64s_INIT(&p->CTime) } void SzArEx_Free(CSzArEx *p, ISzAllocPtr alloc) @@ -178,11 +181,20 @@ static int TestSignatureCandidate(const Byte *testBytes) return 1; } -#define SzData_Clear(p) { (p)->Data = NULL; (p)->Size = 0; } +#define SzData_CLEAR(p) { (p)->Data = NULL; (p)->Size = 0; } + +#define SZ_READ_BYTE_SD_NOCHECK(_sd_, dest) \ + (_sd_)->Size--; dest = *(_sd_)->Data++; + +#define SZ_READ_BYTE_SD(_sd_, dest) \ + if ((_sd_)->Size == 0) return SZ_ERROR_ARCHIVE; \ + SZ_READ_BYTE_SD_NOCHECK(_sd_, dest) -#define SZ_READ_BYTE_SD(_sd_, dest) if ((_sd_)->Size == 0) return SZ_ERROR_ARCHIVE; (_sd_)->Size--; dest = *(_sd_)->Data++; #define SZ_READ_BYTE(dest) SZ_READ_BYTE_SD(sd, dest) -#define SZ_READ_BYTE_2(dest) if (sd.Size == 0) return SZ_ERROR_ARCHIVE; sd.Size--; dest = *sd.Data++; + +#define SZ_READ_BYTE_2(dest) \ + if (sd.Size == 0) return SZ_ERROR_ARCHIVE; \ + sd.Size--; dest = *sd.Data++; #define SKIP_DATA(sd, size) { sd->Size -= (size_t)(size); sd->Data += (size_t)(size); } #define SKIP_DATA2(sd, size) { sd.Size -= (size_t)(size); sd.Data += (size_t)(size); } @@ -190,25 +202,25 @@ static int TestSignatureCandidate(const Byte *testBytes) #define SZ_READ_32(dest) if (sd.Size < 4) return SZ_ERROR_ARCHIVE; \ dest = GetUi32(sd.Data); SKIP_DATA2(sd, 4); -static MY_NO_INLINE SRes ReadNumber(CSzData *sd, UInt64 *value) +static Z7_NO_INLINE SRes ReadNumber(CSzData *sd, UInt64 *value) { Byte firstByte, mask; unsigned i; UInt32 v; - SZ_READ_BYTE(firstByte); + SZ_READ_BYTE(firstByte) if ((firstByte & 0x80) == 0) { *value = firstByte; return SZ_OK; } - SZ_READ_BYTE(v); + SZ_READ_BYTE(v) if ((firstByte & 0x40) == 0) { *value = (((UInt32)firstByte & 0x3F) << 8) | v; return SZ_OK; } - SZ_READ_BYTE(mask); + SZ_READ_BYTE(mask) *value = v | ((UInt32)mask << 8); mask = 0x20; for (i = 2; i < 8; i++) @@ -216,11 +228,11 @@ static MY_NO_INLINE SRes ReadNumber(CSzData *sd, UInt64 *value) Byte b; if ((firstByte & mask) == 0) { - UInt64 highPart = (unsigned)firstByte & (unsigned)(mask - 1); + const UInt64 highPart = (unsigned)firstByte & (unsigned)(mask - 1); *value |= (highPart << (8 * i)); 
return SZ_OK; } - SZ_READ_BYTE(b); + SZ_READ_BYTE(b) *value |= ((UInt64)b << (8 * i)); mask >>= 1; } @@ -228,7 +240,7 @@ static MY_NO_INLINE SRes ReadNumber(CSzData *sd, UInt64 *value) } -static MY_NO_INLINE SRes SzReadNumber32(CSzData *sd, UInt32 *value) +static Z7_NO_INLINE SRes SzReadNumber32(CSzData *sd, UInt32 *value) { Byte firstByte; UInt64 value64; @@ -242,7 +254,7 @@ static MY_NO_INLINE SRes SzReadNumber32(CSzData *sd, UInt32 *value) sd->Size--; return SZ_OK; } - RINOK(ReadNumber(sd, &value64)); + RINOK(ReadNumber(sd, &value64)) if (value64 >= (UInt32)0x80000000 - 1) return SZ_ERROR_UNSUPPORTED; if (value64 >= ((UInt64)(1) << ((sizeof(size_t) - 1) * 8 + 4))) @@ -256,10 +268,10 @@ static MY_NO_INLINE SRes SzReadNumber32(CSzData *sd, UInt32 *value) static SRes SkipData(CSzData *sd) { UInt64 size; - RINOK(ReadNumber(sd, &size)); + RINOK(ReadNumber(sd, &size)) if (size > sd->Size) return SZ_ERROR_ARCHIVE; - SKIP_DATA(sd, size); + SKIP_DATA(sd, size) return SZ_OK; } @@ -268,28 +280,28 @@ static SRes WaitId(CSzData *sd, UInt32 id) for (;;) { UInt64 type; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) if (type == id) return SZ_OK; if (type == k7zIdEnd) return SZ_ERROR_ARCHIVE; - RINOK(SkipData(sd)); + RINOK(SkipData(sd)) } } static SRes RememberBitVector(CSzData *sd, UInt32 numItems, const Byte **v) { - UInt32 numBytes = (numItems + 7) >> 3; + const UInt32 numBytes = (numItems + 7) >> 3; if (numBytes > sd->Size) return SZ_ERROR_ARCHIVE; *v = sd->Data; - SKIP_DATA(sd, numBytes); + SKIP_DATA(sd, numBytes) return SZ_OK; } static UInt32 CountDefinedBits(const Byte *bits, UInt32 numItems) { - Byte b = 0; + unsigned b = 0; unsigned m = 0; UInt32 sum = 0; for (; numItems != 0; numItems--) @@ -300,53 +312,53 @@ static UInt32 CountDefinedBits(const Byte *bits, UInt32 numItems) m = 8; } m--; - sum += ((b >> m) & 1); + sum += (UInt32)((b >> m) & 1); } return sum; } -static MY_NO_INLINE SRes ReadBitVector(CSzData *sd, UInt32 numItems, Byte **v, ISzAllocPtr alloc) +static Z7_NO_INLINE SRes ReadBitVector(CSzData *sd, UInt32 numItems, Byte **v, ISzAllocPtr alloc) { Byte allAreDefined; Byte *v2; - UInt32 numBytes = (numItems + 7) >> 3; + const UInt32 numBytes = (numItems + 7) >> 3; *v = NULL; - SZ_READ_BYTE(allAreDefined); + SZ_READ_BYTE(allAreDefined) if (numBytes == 0) return SZ_OK; if (allAreDefined == 0) { if (numBytes > sd->Size) return SZ_ERROR_ARCHIVE; - MY_ALLOC_AND_CPY(*v, numBytes, sd->Data, alloc); - SKIP_DATA(sd, numBytes); + MY_ALLOC_AND_CPY(*v, numBytes, sd->Data, alloc) + SKIP_DATA(sd, numBytes) return SZ_OK; } - MY_ALLOC(Byte, *v, numBytes, alloc); + MY_ALLOC(Byte, *v, numBytes, alloc) v2 = *v; memset(v2, 0xFF, (size_t)numBytes); { - unsigned numBits = (unsigned)numItems & 7; + const unsigned numBits = (unsigned)numItems & 7; if (numBits != 0) v2[(size_t)numBytes - 1] = (Byte)((((UInt32)1 << numBits) - 1) << (8 - numBits)); } return SZ_OK; } -static MY_NO_INLINE SRes ReadUi32s(CSzData *sd2, UInt32 numItems, CSzBitUi32s *crcs, ISzAllocPtr alloc) +static Z7_NO_INLINE SRes ReadUi32s(CSzData *sd2, UInt32 numItems, CSzBitUi32s *crcs, ISzAllocPtr alloc) { UInt32 i; CSzData sd; UInt32 *vals; const Byte *defs; - MY_ALLOC_ZE(UInt32, crcs->Vals, numItems, alloc); + MY_ALLOC_ZE(UInt32, crcs->Vals, numItems, alloc) sd = *sd2; defs = crcs->Defs; vals = crcs->Vals; for (i = 0; i < numItems; i++) if (SzBitArray_Check(defs, i)) { - SZ_READ_32(vals[i]); + SZ_READ_32(vals[i]) } else vals[i] = 0; @@ -357,7 +369,7 @@ static MY_NO_INLINE SRes ReadUi32s(CSzData *sd2, UInt32 numItems, CSzBitUi32s *c 
static SRes ReadBitUi32s(CSzData *sd, UInt32 numItems, CSzBitUi32s *crcs, ISzAllocPtr alloc) { SzBitUi32s_Free(crcs, alloc); - RINOK(ReadBitVector(sd, numItems, &crcs->Defs, alloc)); + RINOK(ReadBitVector(sd, numItems, &crcs->Defs, alloc)) return ReadUi32s(sd, numItems, crcs, alloc); } @@ -365,36 +377,36 @@ static SRes SkipBitUi32s(CSzData *sd, UInt32 numItems) { Byte allAreDefined; UInt32 numDefined = numItems; - SZ_READ_BYTE(allAreDefined); + SZ_READ_BYTE(allAreDefined) if (!allAreDefined) { - size_t numBytes = (numItems + 7) >> 3; + const size_t numBytes = (numItems + 7) >> 3; if (numBytes > sd->Size) return SZ_ERROR_ARCHIVE; numDefined = CountDefinedBits(sd->Data, numItems); - SKIP_DATA(sd, numBytes); + SKIP_DATA(sd, numBytes) } if (numDefined > (sd->Size >> 2)) return SZ_ERROR_ARCHIVE; - SKIP_DATA(sd, (size_t)numDefined * 4); + SKIP_DATA(sd, (size_t)numDefined * 4) return SZ_OK; } static SRes ReadPackInfo(CSzAr *p, CSzData *sd, ISzAllocPtr alloc) { - RINOK(SzReadNumber32(sd, &p->NumPackStreams)); + RINOK(SzReadNumber32(sd, &p->NumPackStreams)) - RINOK(WaitId(sd, k7zIdSize)); - MY_ALLOC(UInt64, p->PackPositions, (size_t)p->NumPackStreams + 1, alloc); + RINOK(WaitId(sd, k7zIdSize)) + MY_ALLOC(UInt64, p->PackPositions, (size_t)p->NumPackStreams + 1, alloc) { UInt64 sum = 0; UInt32 i; - UInt32 numPackStreams = p->NumPackStreams; + const UInt32 numPackStreams = p->NumPackStreams; for (i = 0; i < numPackStreams; i++) { UInt64 packSize; p->PackPositions[i] = sum; - RINOK(ReadNumber(sd, &packSize)); + RINOK(ReadNumber(sd, &packSize)) sum += packSize; if (sum < packSize) return SZ_ERROR_ARCHIVE; @@ -405,16 +417,16 @@ static SRes ReadPackInfo(CSzAr *p, CSzData *sd, ISzAllocPtr alloc) for (;;) { UInt64 type; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) if (type == k7zIdEnd) return SZ_OK; if (type == k7zIdCRC) { /* CRC of packed streams is unused now */ - RINOK(SkipBitUi32s(sd, p->NumPackStreams)); + RINOK(SkipBitUi32s(sd, p->NumPackStreams)) continue; } - RINOK(SkipData(sd)); + RINOK(SkipData(sd)) } } @@ -440,7 +452,7 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) f->NumPackStreams = 0; f->UnpackStream = 0; - RINOK(SzReadNumber32(sd, &numCoders)); + RINOK(SzReadNumber32(sd, &numCoders)) if (numCoders == 0 || numCoders > SZ_NUM_CODERS_IN_FOLDER_MAX) return SZ_ERROR_UNSUPPORTED; @@ -451,7 +463,7 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) unsigned idSize, j; UInt64 id; - SZ_READ_BYTE(mainByte); + SZ_READ_BYTE(mainByte) if ((mainByte & 0xC0) != 0) return SZ_ERROR_UNSUPPORTED; @@ -479,12 +491,12 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) { UInt32 numStreams; - RINOK(SzReadNumber32(sd, &numStreams)); + RINOK(SzReadNumber32(sd, &numStreams)) if (numStreams > k_NumCodersStreams_in_Folder_MAX) return SZ_ERROR_UNSUPPORTED; coder->NumStreams = (Byte)numStreams; - RINOK(SzReadNumber32(sd, &numStreams)); + RINOK(SzReadNumber32(sd, &numStreams)) if (numStreams != 1) return SZ_ERROR_UNSUPPORTED; } @@ -497,12 +509,12 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) if ((mainByte & 0x20) != 0) { UInt32 propsSize = 0; - RINOK(SzReadNumber32(sd, &propsSize)); + RINOK(SzReadNumber32(sd, &propsSize)) if (propsSize > sd->Size) return SZ_ERROR_ARCHIVE; if (propsSize >= 0x80) return SZ_ERROR_UNSUPPORTED; - coder->PropsOffset = sd->Data - dataStart; + coder->PropsOffset = (size_t)(sd->Data - dataStart); coder->PropsSize = (Byte)propsSize; sd->Data += (size_t)propsSize; sd->Size -= (size_t)propsSize; @@ -547,12 +559,12 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) 
{ CSzBond *bp = f->Bonds + i; - RINOK(SzReadNumber32(sd, &bp->InIndex)); + RINOK(SzReadNumber32(sd, &bp->InIndex)) if (bp->InIndex >= numInStreams || streamUsed[bp->InIndex]) return SZ_ERROR_ARCHIVE; streamUsed[bp->InIndex] = True; - RINOK(SzReadNumber32(sd, &bp->OutIndex)); + RINOK(SzReadNumber32(sd, &bp->OutIndex)) if (bp->OutIndex >= numCoders || coderUsed[bp->OutIndex]) return SZ_ERROR_ARCHIVE; coderUsed[bp->OutIndex] = True; @@ -582,7 +594,7 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) for (i = 0; i < numPackStreams; i++) { UInt32 index; - RINOK(SzReadNumber32(sd, &index)); + RINOK(SzReadNumber32(sd, &index)) if (index >= numInStreams || streamUsed[index]) return SZ_ERROR_ARCHIVE; streamUsed[index] = True; @@ -596,7 +608,7 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) } -static MY_NO_INLINE SRes SkipNumbers(CSzData *sd2, UInt32 num) +static Z7_NO_INLINE SRes SkipNumbers(CSzData *sd2, UInt32 num) { CSzData sd; sd = *sd2; @@ -604,7 +616,7 @@ static MY_NO_INLINE SRes SkipNumbers(CSzData *sd2, UInt32 num) { Byte firstByte, mask; unsigned i; - SZ_READ_BYTE_2(firstByte); + SZ_READ_BYTE_2(firstByte) if ((firstByte & 0x80) == 0) continue; if ((firstByte & 0x40) == 0) @@ -620,7 +632,7 @@ static MY_NO_INLINE SRes SkipNumbers(CSzData *sd2, UInt32 num) mask >>= 1; if (i > sd.Size) return SZ_ERROR_ARCHIVE; - SKIP_DATA2(sd, i); + SKIP_DATA2(sd, i) } *sd2 = sd; return SZ_OK; @@ -643,30 +655,30 @@ static SRes ReadUnpackInfo(CSzAr *p, const Byte *startBufPtr; Byte external; - RINOK(WaitId(sd2, k7zIdFolder)); + RINOK(WaitId(sd2, k7zIdFolder)) - RINOK(SzReadNumber32(sd2, &numFolders)); + RINOK(SzReadNumber32(sd2, &numFolders)) if (numFolders > numFoldersMax) return SZ_ERROR_UNSUPPORTED; p->NumFolders = numFolders; - SZ_READ_BYTE_SD(sd2, external); + SZ_READ_BYTE_SD(sd2, external) if (external == 0) sd = *sd2; else { UInt32 index; - RINOK(SzReadNumber32(sd2, &index)); + RINOK(SzReadNumber32(sd2, &index)) if (index >= numTempBufs) return SZ_ERROR_ARCHIVE; sd.Data = tempBufs[index].data; sd.Size = tempBufs[index].size; } - MY_ALLOC(size_t, p->FoCodersOffsets, (size_t)numFolders + 1, alloc); - MY_ALLOC(UInt32, p->FoStartPackStreamIndex, (size_t)numFolders + 1, alloc); - MY_ALLOC(UInt32, p->FoToCoderUnpackSizes, (size_t)numFolders + 1, alloc); - MY_ALLOC_ZE(Byte, p->FoToMainUnpackSizeIndex, (size_t)numFolders, alloc); + MY_ALLOC(size_t, p->FoCodersOffsets, (size_t)numFolders + 1, alloc) + MY_ALLOC(UInt32, p->FoStartPackStreamIndex, (size_t)numFolders + 1, alloc) + MY_ALLOC(UInt32, p->FoToCoderUnpackSizes, (size_t)numFolders + 1, alloc) + MY_ALLOC_ZE(Byte, p->FoToMainUnpackSizeIndex, (size_t)numFolders, alloc) startBufPtr = sd.Data; @@ -677,9 +689,9 @@ static SRes ReadUnpackInfo(CSzAr *p, { UInt32 numCoders, ci, numInStreams = 0; - p->FoCodersOffsets[fo] = sd.Data - startBufPtr; + p->FoCodersOffsets[fo] = (size_t)(sd.Data - startBufPtr); - RINOK(SzReadNumber32(&sd, &numCoders)); + RINOK(SzReadNumber32(&sd, &numCoders)) if (numCoders == 0 || numCoders > k_Scan_NumCoders_MAX) return SZ_ERROR_UNSUPPORTED; @@ -689,7 +701,7 @@ static SRes ReadUnpackInfo(CSzAr *p, unsigned idSize; UInt32 coderInStreams; - SZ_READ_BYTE_2(mainByte); + SZ_READ_BYTE_2(mainByte) if ((mainByte & 0xC0) != 0) return SZ_ERROR_UNSUPPORTED; idSize = (mainByte & 0xF); @@ -697,15 +709,15 @@ static SRes ReadUnpackInfo(CSzAr *p, return SZ_ERROR_UNSUPPORTED; if (idSize > sd.Size) return SZ_ERROR_ARCHIVE; - SKIP_DATA2(sd, idSize); + SKIP_DATA2(sd, idSize) coderInStreams = 1; if ((mainByte & 0x10) != 0) { UInt32 coderOutStreams; - 
RINOK(SzReadNumber32(&sd, &coderInStreams)); - RINOK(SzReadNumber32(&sd, &coderOutStreams)); + RINOK(SzReadNumber32(&sd, &coderInStreams)) + RINOK(SzReadNumber32(&sd, &coderOutStreams)) if (coderInStreams > k_Scan_NumCodersStreams_in_Folder_MAX || coderOutStreams != 1) return SZ_ERROR_UNSUPPORTED; } @@ -715,10 +727,10 @@ static SRes ReadUnpackInfo(CSzAr *p, if ((mainByte & 0x20) != 0) { UInt32 propsSize; - RINOK(SzReadNumber32(&sd, &propsSize)); + RINOK(SzReadNumber32(&sd, &propsSize)) if (propsSize > sd.Size) return SZ_ERROR_ARCHIVE; - SKIP_DATA2(sd, propsSize); + SKIP_DATA2(sd, propsSize) } } @@ -732,7 +744,7 @@ static SRes ReadUnpackInfo(CSzAr *p, Byte coderUsed[k_Scan_NumCoders_MAX]; UInt32 i; - UInt32 numBonds = numCoders - 1; + const UInt32 numBonds = numCoders - 1; if (numInStreams < numBonds) return SZ_ERROR_ARCHIVE; @@ -748,12 +760,12 @@ static SRes ReadUnpackInfo(CSzAr *p, { UInt32 index; - RINOK(SzReadNumber32(&sd, &index)); + RINOK(SzReadNumber32(&sd, &index)) if (index >= numInStreams || streamUsed[index]) return SZ_ERROR_ARCHIVE; streamUsed[index] = True; - RINOK(SzReadNumber32(&sd, &index)); + RINOK(SzReadNumber32(&sd, &index)) if (index >= numCoders || coderUsed[index]) return SZ_ERROR_ARCHIVE; coderUsed[index] = True; @@ -765,7 +777,7 @@ static SRes ReadUnpackInfo(CSzAr *p, for (i = 0; i < numPackStreams; i++) { UInt32 index; - RINOK(SzReadNumber32(&sd, &index)); + RINOK(SzReadNumber32(&sd, &index)) if (index >= numInStreams || streamUsed[index]) return SZ_ERROR_ARCHIVE; streamUsed[index] = True; @@ -797,10 +809,10 @@ static SRes ReadUnpackInfo(CSzAr *p, p->FoToCoderUnpackSizes[fo] = numCodersOutStreams; { - size_t dataSize = sd.Data - startBufPtr; + const size_t dataSize = (size_t)(sd.Data - startBufPtr); p->FoStartPackStreamIndex[fo] = packStreamIndex; p->FoCodersOffsets[fo] = dataSize; - MY_ALLOC_ZE_AND_CPY(p->CodersData, dataSize, startBufPtr, alloc); + MY_ALLOC_ZE_AND_CPY(p->CodersData, dataSize, startBufPtr, alloc) } if (external != 0) @@ -810,21 +822,21 @@ static SRes ReadUnpackInfo(CSzAr *p, sd = *sd2; } - RINOK(WaitId(&sd, k7zIdCodersUnpackSize)); + RINOK(WaitId(&sd, k7zIdCodersUnpackSize)) - MY_ALLOC_ZE(UInt64, p->CoderUnpackSizes, (size_t)numCodersOutStreams, alloc); + MY_ALLOC_ZE(UInt64, p->CoderUnpackSizes, (size_t)numCodersOutStreams, alloc) { UInt32 i; for (i = 0; i < numCodersOutStreams; i++) { - RINOK(ReadNumber(&sd, p->CoderUnpackSizes + i)); + RINOK(ReadNumber(&sd, p->CoderUnpackSizes + i)) } } for (;;) { UInt64 type; - RINOK(ReadID(&sd, &type)); + RINOK(ReadID(&sd, &type)) if (type == k7zIdEnd) { *sd2 = sd; @@ -832,10 +844,10 @@ static SRes ReadUnpackInfo(CSzAr *p, } if (type == k7zIdCRC) { - RINOK(ReadBitUi32s(&sd, numFolders, &p->FolderCRCs, alloc)); + RINOK(ReadBitUi32s(&sd, numFolders, &p->FolderCRCs, alloc)) continue; } - RINOK(SkipData(&sd)); + RINOK(SkipData(&sd)) } } @@ -860,13 +872,13 @@ static SRes ReadSubStreamsInfo(CSzAr *p, CSzData *sd, CSubStreamInfo *ssi) { UInt64 type = 0; UInt32 numSubDigests = 0; - UInt32 numFolders = p->NumFolders; + const UInt32 numFolders = p->NumFolders; UInt32 numUnpackStreams = numFolders; UInt32 numUnpackSizesInData = 0; for (;;) { - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) if (type == k7zIdNumUnpackStream) { UInt32 i; @@ -876,7 +888,7 @@ static SRes ReadSubStreamsInfo(CSzAr *p, CSzData *sd, CSubStreamInfo *ssi) for (i = 0; i < numFolders; i++) { UInt32 numStreams; - RINOK(SzReadNumber32(sd, &numStreams)); + RINOK(SzReadNumber32(sd, &numStreams)) if (numUnpackStreams > numUnpackStreams + numStreams) 
return SZ_ERROR_UNSUPPORTED; numUnpackStreams += numStreams; @@ -885,12 +897,12 @@ static SRes ReadSubStreamsInfo(CSzAr *p, CSzData *sd, CSubStreamInfo *ssi) if (numStreams != 1 || !SzBitWithVals_Check(&p->FolderCRCs, i)) numSubDigests += numStreams; } - ssi->sdNumSubStreams.Size = sd->Data - ssi->sdNumSubStreams.Data; + ssi->sdNumSubStreams.Size = (size_t)(sd->Data - ssi->sdNumSubStreams.Data); continue; } if (type == k7zIdCRC || type == k7zIdSize || type == k7zIdEnd) break; - RINOK(SkipData(sd)); + RINOK(SkipData(sd)) } if (!ssi->sdNumSubStreams.Data) @@ -906,9 +918,9 @@ static SRes ReadSubStreamsInfo(CSzAr *p, CSzData *sd, CSubStreamInfo *ssi) if (type == k7zIdSize) { ssi->sdSizes.Data = sd->Data; - RINOK(SkipNumbers(sd, numUnpackSizesInData)); - ssi->sdSizes.Size = sd->Data - ssi->sdSizes.Data; - RINOK(ReadID(sd, &type)); + RINOK(SkipNumbers(sd, numUnpackSizesInData)) + ssi->sdSizes.Size = (size_t)(sd->Data - ssi->sdSizes.Data); + RINOK(ReadID(sd, &type)) } for (;;) @@ -918,14 +930,14 @@ static SRes ReadSubStreamsInfo(CSzAr *p, CSzData *sd, CSubStreamInfo *ssi) if (type == k7zIdCRC) { ssi->sdCRCs.Data = sd->Data; - RINOK(SkipBitUi32s(sd, numSubDigests)); - ssi->sdCRCs.Size = sd->Data - ssi->sdCRCs.Data; + RINOK(SkipBitUi32s(sd, numSubDigests)) + ssi->sdCRCs.Size = (size_t)(sd->Data - ssi->sdCRCs.Data); } else { - RINOK(SkipData(sd)); + RINOK(SkipData(sd)) } - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) } } @@ -938,27 +950,31 @@ static SRes SzReadStreamsInfo(CSzAr *p, { UInt64 type; - SzData_Clear(&ssi->sdSizes); - SzData_Clear(&ssi->sdCRCs); - SzData_Clear(&ssi->sdNumSubStreams); + SzData_CLEAR(&ssi->sdSizes) + SzData_CLEAR(&ssi->sdCRCs) + SzData_CLEAR(&ssi->sdNumSubStreams) *dataOffset = 0; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) if (type == k7zIdPackInfo) { - RINOK(ReadNumber(sd, dataOffset)); - RINOK(ReadPackInfo(p, sd, alloc)); - RINOK(ReadID(sd, &type)); + RINOK(ReadNumber(sd, dataOffset)) + if (*dataOffset > p->RangeLimit) + return SZ_ERROR_ARCHIVE; + RINOK(ReadPackInfo(p, sd, alloc)) + if (p->PackPositions[p->NumPackStreams] > p->RangeLimit - *dataOffset) + return SZ_ERROR_ARCHIVE; + RINOK(ReadID(sd, &type)) } if (type == k7zIdUnpackInfo) { - RINOK(ReadUnpackInfo(p, sd, numFoldersMax, tempBufs, numTempBufs, alloc)); - RINOK(ReadID(sd, &type)); + RINOK(ReadUnpackInfo(p, sd, numFoldersMax, tempBufs, numTempBufs, alloc)) + RINOK(ReadID(sd, &type)) } if (type == k7zIdSubStreamsInfo) { - RINOK(ReadSubStreamsInfo(p, sd, ssi)); - RINOK(ReadID(sd, &type)); + RINOK(ReadSubStreamsInfo(p, sd, ssi)) + RINOK(ReadID(sd, &type)) } else { @@ -970,7 +986,7 @@ static SRes SzReadStreamsInfo(CSzAr *p, } static SRes SzReadAndDecodePackedStreams( - ILookInStream *inStream, + ILookInStreamPtr inStream, CSzData *sd, CBuf *tempBufs, UInt32 numFoldersMax, @@ -982,7 +998,7 @@ static SRes SzReadAndDecodePackedStreams( UInt32 fo; CSubStreamInfo ssi; - RINOK(SzReadStreamsInfo(p, sd, numFoldersMax, NULL, 0, &dataStartPos, &ssi, allocTemp)); + RINOK(SzReadStreamsInfo(p, sd, numFoldersMax, NULL, 0, &dataStartPos, &ssi, allocTemp)) dataStartPos += baseOffset; if (p->NumFolders == 0) @@ -994,7 +1010,7 @@ static SRes SzReadAndDecodePackedStreams( for (fo = 0; fo < p->NumFolders; fo++) { CBuf *tempBuf = tempBufs + fo; - UInt64 unpackSize = SzAr_GetFolderUnpackSize(p, fo); + const UInt64 unpackSize = SzAr_GetFolderUnpackSize(p, fo); if ((size_t)unpackSize != unpackSize) return SZ_ERROR_MEM; if (!Buf_Create(tempBuf, (size_t)unpackSize, allocTemp)) @@ -1004,8 +1020,8 @@ static SRes 
SzReadAndDecodePackedStreams( for (fo = 0; fo < p->NumFolders; fo++) { const CBuf *tempBuf = tempBufs + fo; - RINOK(LookInStream_SeekTo(inStream, dataStartPos)); - RINOK(SzAr_DecodeFolder(p, fo, inStream, dataStartPos, tempBuf->data, tempBuf->size, allocTemp)); + RINOK(LookInStream_SeekTo(inStream, dataStartPos)) + RINOK(SzAr_DecodeFolder(p, fo, inStream, dataStartPos, tempBuf->data, tempBuf->size, allocTemp)) } return SZ_OK; @@ -1028,19 +1044,19 @@ static SRes SzReadFileNames(const Byte *data, size_t size, UInt32 numFiles, size return SZ_ERROR_ARCHIVE; for (p = data + pos; #ifdef _WIN32 - *(const UInt16 *)p != 0 + *(const UInt16 *)(const void *)p != 0 #else p[0] != 0 || p[1] != 0 #endif ; p += 2); - pos = p - data + 2; + pos = (size_t)(p - data) + 2; *offsets++ = (pos >> 1); } while (--numFiles); return (pos == size) ? SZ_OK : SZ_ERROR_ARCHIVE; } -static MY_NO_INLINE SRes ReadTime(CSzBitUi64s *p, UInt32 num, +static Z7_NO_INLINE SRes ReadTime(CSzBitUi64s *p, UInt32 num, CSzData *sd2, const CBuf *tempBufs, UInt32 numTempBufs, ISzAllocPtr alloc) @@ -1051,22 +1067,22 @@ static MY_NO_INLINE SRes ReadTime(CSzBitUi64s *p, UInt32 num, Byte *defs; Byte external; - RINOK(ReadBitVector(sd2, num, &p->Defs, alloc)); + RINOK(ReadBitVector(sd2, num, &p->Defs, alloc)) - SZ_READ_BYTE_SD(sd2, external); + SZ_READ_BYTE_SD(sd2, external) if (external == 0) sd = *sd2; else { UInt32 index; - RINOK(SzReadNumber32(sd2, &index)); + RINOK(SzReadNumber32(sd2, &index)) if (index >= numTempBufs) return SZ_ERROR_ARCHIVE; sd.Data = tempBufs[index].data; sd.Size = tempBufs[index].size; } - MY_ALLOC_ZE(CNtfsFileTime, p->Vals, num, alloc); + MY_ALLOC_ZE(CNtfsFileTime, p->Vals, num, alloc) vals = p->Vals; defs = p->Defs; for (i = 0; i < num; i++) @@ -1076,7 +1092,7 @@ static MY_NO_INLINE SRes ReadTime(CSzBitUi64s *p, UInt32 num, return SZ_ERROR_ARCHIVE; vals[i].Low = GetUi32(sd.Data); vals[i].High = GetUi32(sd.Data + 4); - SKIP_DATA2(sd, 8); + SKIP_DATA2(sd, 8) } else vals[i].High = vals[i].Low = 0; @@ -1094,7 +1110,7 @@ static MY_NO_INLINE SRes ReadTime(CSzBitUi64s *p, UInt32 num, static SRes SzReadHeader2( CSzArEx *p, /* allocMain */ CSzData *sd, - ILookInStream *inStream, + ILookInStreamPtr inStream, CBuf *tempBufs, UInt32 *numTempBufs, ISzAllocPtr allocMain, ISzAllocPtr allocTemp @@ -1105,26 +1121,26 @@ static SRes SzReadHeader2( { UInt64 type; - SzData_Clear(&ssi.sdSizes); - SzData_Clear(&ssi.sdCRCs); - SzData_Clear(&ssi.sdNumSubStreams); + SzData_CLEAR(&ssi.sdSizes) + SzData_CLEAR(&ssi.sdCRCs) + SzData_CLEAR(&ssi.sdNumSubStreams) ssi.NumSubDigests = 0; ssi.NumTotalSubStreams = 0; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) if (type == k7zIdArchiveProperties) { for (;;) { UInt64 type2; - RINOK(ReadID(sd, &type2)); + RINOK(ReadID(sd, &type2)) if (type2 == k7zIdEnd) break; - RINOK(SkipData(sd)); + RINOK(SkipData(sd)) } - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) } if (type == k7zIdAdditionalStreamsInfo) @@ -1133,6 +1149,8 @@ static SRes SzReadHeader2( SRes res; SzAr_Init(&tempAr); + tempAr.RangeLimit = p->db.RangeLimit; + res = SzReadAndDecodePackedStreams(inStream, sd, tempBufs, NUM_ADDITIONAL_STREAMS_MAX, p->startPosAfterHeader, &tempAr, allocTemp); *numTempBufs = tempAr.NumFolders; @@ -1140,15 +1158,15 @@ static SRes SzReadHeader2( if (res != SZ_OK) return res; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) } if (type == k7zIdMainStreamsInfo) { RINOK(SzReadStreamsInfo(&p->db, sd, (UInt32)1 << 30, tempBufs, *numTempBufs, - &p->dataPos, &ssi, allocMain)); + &p->dataPos, &ssi, allocMain)) 
p->dataPos += p->startPosAfterHeader; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) } if (type == k7zIdEnd) @@ -1166,23 +1184,23 @@ static SRes SzReadHeader2( const Byte *emptyStreams = NULL; const Byte *emptyFiles = NULL; - RINOK(SzReadNumber32(sd, &numFiles)); + RINOK(SzReadNumber32(sd, &numFiles)) p->NumFiles = numFiles; for (;;) { UInt64 type; UInt64 size; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) if (type == k7zIdEnd) break; - RINOK(ReadNumber(sd, &size)); + RINOK(ReadNumber(sd, &size)) if (size > sd->Size) return SZ_ERROR_ARCHIVE; if (type >= ((UInt32)1 << 8)) { - SKIP_DATA(sd, size); + SKIP_DATA(sd, size) } else switch ((unsigned)type) { @@ -1192,7 +1210,7 @@ static SRes SzReadHeader2( const Byte *namesData; Byte external; - SZ_READ_BYTE(external); + SZ_READ_BYTE(external) if (external == 0) { namesSize = (size_t)size - 1; @@ -1201,7 +1219,7 @@ static SRes SzReadHeader2( else { UInt32 index; - RINOK(SzReadNumber32(sd, &index)); + RINOK(SzReadNumber32(sd, &index)) if (index >= *numTempBufs) return SZ_ERROR_ARCHIVE; namesData = (tempBufs)[index].data; @@ -1210,25 +1228,25 @@ static SRes SzReadHeader2( if ((namesSize & 1) != 0) return SZ_ERROR_ARCHIVE; - MY_ALLOC(size_t, p->FileNameOffsets, numFiles + 1, allocMain); - MY_ALLOC_ZE_AND_CPY(p->FileNames, namesSize, namesData, allocMain); + MY_ALLOC(size_t, p->FileNameOffsets, numFiles + 1, allocMain) + MY_ALLOC_ZE_AND_CPY(p->FileNames, namesSize, namesData, allocMain) RINOK(SzReadFileNames(p->FileNames, namesSize, numFiles, p->FileNameOffsets)) if (external == 0) { - SKIP_DATA(sd, namesSize); + SKIP_DATA(sd, namesSize) } break; } case k7zIdEmptyStream: { - RINOK(RememberBitVector(sd, numFiles, &emptyStreams)); + RINOK(RememberBitVector(sd, numFiles, &emptyStreams)) numEmptyStreams = CountDefinedBits(emptyStreams, numFiles); emptyFiles = NULL; break; } case k7zIdEmptyFile: { - RINOK(RememberBitVector(sd, numEmptyStreams, &emptyFiles)); + RINOK(RememberBitVector(sd, numEmptyStreams, &emptyFiles)) break; } case k7zIdWinAttrib: @@ -1237,22 +1255,22 @@ static SRes SzReadHeader2( CSzData sdSwitch; CSzData *sdPtr; SzBitUi32s_Free(&p->Attribs, allocMain); - RINOK(ReadBitVector(sd, numFiles, &p->Attribs.Defs, allocMain)); + RINOK(ReadBitVector(sd, numFiles, &p->Attribs.Defs, allocMain)) - SZ_READ_BYTE(external); + SZ_READ_BYTE(external) if (external == 0) sdPtr = sd; else { UInt32 index; - RINOK(SzReadNumber32(sd, &index)); + RINOK(SzReadNumber32(sd, &index)) if (index >= *numTempBufs) return SZ_ERROR_ARCHIVE; sdSwitch.Data = (tempBufs)[index].data; sdSwitch.Size = (tempBufs)[index].size; sdPtr = &sdSwitch; } - RINOK(ReadUi32s(sdPtr, numFiles, &p->Attribs, allocMain)); + RINOK(ReadUi32s(sdPtr, numFiles, &p->Attribs, allocMain)) break; } /* @@ -1265,11 +1283,11 @@ static SRes SzReadHeader2( break; } */ - case k7zIdMTime: RINOK(ReadTime(&p->MTime, numFiles, sd, tempBufs, *numTempBufs, allocMain)); break; - case k7zIdCTime: RINOK(ReadTime(&p->CTime, numFiles, sd, tempBufs, *numTempBufs, allocMain)); break; + case k7zIdMTime: RINOK(ReadTime(&p->MTime, numFiles, sd, tempBufs, *numTempBufs, allocMain)) break; + case k7zIdCTime: RINOK(ReadTime(&p->CTime, numFiles, sd, tempBufs, *numTempBufs, allocMain)) break; default: { - SKIP_DATA(sd, size); + SKIP_DATA(sd, size) } } } @@ -1280,10 +1298,10 @@ static SRes SzReadHeader2( for (;;) { UInt64 type; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) if (type == k7zIdEnd) break; - RINOK(SkipData(sd)); + RINOK(SkipData(sd)) } { @@ -1295,40 +1313,37 @@ static SRes SzReadHeader2( UInt64 
unpackPos = 0; const Byte *digestsDefs = NULL; const Byte *digestsVals = NULL; - UInt32 digestsValsIndex = 0; - UInt32 digestIndex; - Byte allDigestsDefined = 0; + UInt32 digestIndex = 0; Byte isDirMask = 0; Byte crcMask = 0; Byte mask = 0x80; - MY_ALLOC(UInt32, p->FolderToFile, p->db.NumFolders + 1, allocMain); - MY_ALLOC_ZE(UInt32, p->FileToFolder, p->NumFiles, allocMain); - MY_ALLOC(UInt64, p->UnpackPositions, p->NumFiles + 1, allocMain); - MY_ALLOC_ZE(Byte, p->IsDirs, (p->NumFiles + 7) >> 3, allocMain); + MY_ALLOC(UInt32, p->FolderToFile, p->db.NumFolders + 1, allocMain) + MY_ALLOC_ZE(UInt32, p->FileToFolder, p->NumFiles, allocMain) + MY_ALLOC(UInt64, p->UnpackPositions, p->NumFiles + 1, allocMain) + MY_ALLOC_ZE(Byte, p->IsDirs, (p->NumFiles + 7) >> 3, allocMain) - RINOK(SzBitUi32s_Alloc(&p->CRCs, p->NumFiles, allocMain)); + RINOK(SzBitUi32s_Alloc(&p->CRCs, p->NumFiles, allocMain)) if (ssi.sdCRCs.Size != 0) { - SZ_READ_BYTE_SD(&ssi.sdCRCs, allDigestsDefined); + Byte allDigestsDefined = 0; + SZ_READ_BYTE_SD_NOCHECK(&ssi.sdCRCs, allDigestsDefined) if (allDigestsDefined) digestsVals = ssi.sdCRCs.Data; else { - size_t numBytes = (ssi.NumSubDigests + 7) >> 3; + const size_t numBytes = (ssi.NumSubDigests + 7) >> 3; digestsDefs = ssi.sdCRCs.Data; digestsVals = digestsDefs + numBytes; } } - digestIndex = 0; - for (i = 0; i < numFiles; i++, mask >>= 1) { if (mask == 0) { - UInt32 byteIndex = (i - 1) >> 3; + const UInt32 byteIndex = (i - 1) >> 3; p->IsDirs[byteIndex] = isDirMask; p->CRCs.Defs[byteIndex] = crcMask; isDirMask = 0; @@ -1366,18 +1381,17 @@ static SRes SzReadHeader2( numSubStreams = 1; if (ssi.sdNumSubStreams.Data) { - RINOK(SzReadNumber32(&ssi.sdNumSubStreams, &numSubStreams)); + RINOK(SzReadNumber32(&ssi.sdNumSubStreams, &numSubStreams)) } remSubStreams = numSubStreams; if (numSubStreams != 0) break; { - UInt64 folderUnpackSize = SzAr_GetFolderUnpackSize(&p->db, folderIndex); + const UInt64 folderUnpackSize = SzAr_GetFolderUnpackSize(&p->db, folderIndex); unpackPos += folderUnpackSize; if (unpackPos < folderUnpackSize) return SZ_ERROR_ARCHIVE; } - folderIndex++; } } @@ -1389,47 +1403,44 @@ static SRes SzReadHeader2( if (--remSubStreams == 0) { - UInt64 folderUnpackSize = SzAr_GetFolderUnpackSize(&p->db, folderIndex); - UInt64 startFolderUnpackPos = p->UnpackPositions[p->FolderToFile[folderIndex]]; + const UInt64 folderUnpackSize = SzAr_GetFolderUnpackSize(&p->db, folderIndex); + const UInt64 startFolderUnpackPos = p->UnpackPositions[p->FolderToFile[folderIndex]]; if (folderUnpackSize < unpackPos - startFolderUnpackPos) return SZ_ERROR_ARCHIVE; unpackPos = startFolderUnpackPos + folderUnpackSize; if (unpackPos < folderUnpackSize) return SZ_ERROR_ARCHIVE; - if (numSubStreams == 1 && SzBitWithVals_Check(&p->db.FolderCRCs, i)) + if (numSubStreams == 1 && SzBitWithVals_Check(&p->db.FolderCRCs, folderIndex)) { p->CRCs.Vals[i] = p->db.FolderCRCs.Vals[folderIndex]; crcMask |= mask; } - else if (allDigestsDefined || (digestsDefs && SzBitArray_Check(digestsDefs, digestIndex))) - { - p->CRCs.Vals[i] = GetUi32(digestsVals + (size_t)digestsValsIndex * 4); - digestsValsIndex++; - crcMask |= mask; - } - folderIndex++; } else { UInt64 v; - RINOK(ReadNumber(&ssi.sdSizes, &v)); + RINOK(ReadNumber(&ssi.sdSizes, &v)) unpackPos += v; if (unpackPos < v) return SZ_ERROR_ARCHIVE; - if (allDigestsDefined || (digestsDefs && SzBitArray_Check(digestsDefs, digestIndex))) + } + if ((crcMask & mask) == 0 && digestsVals) + { + if (!digestsDefs || SzBitArray_Check(digestsDefs, digestIndex)) { - p->CRCs.Vals[i] = 
GetUi32(digestsVals + (size_t)digestsValsIndex * 4); - digestsValsIndex++; + p->CRCs.Vals[i] = GetUi32(digestsVals); + digestsVals += 4; crcMask |= mask; } + digestIndex++; } } if (mask != 0x80) { - UInt32 byteIndex = (i - 1) >> 3; + const UInt32 byteIndex = (i - 1) >> 3; p->IsDirs[byteIndex] = isDirMask; p->CRCs.Defs[byteIndex] = crcMask; } @@ -1446,7 +1457,7 @@ static SRes SzReadHeader2( break; if (!ssi.sdNumSubStreams.Data) return SZ_ERROR_ARCHIVE; - RINOK(SzReadNumber32(&ssi.sdNumSubStreams, &numSubStreams)); + RINOK(SzReadNumber32(&ssi.sdNumSubStreams, &numSubStreams)) if (numSubStreams != 0) return SZ_ERROR_ARCHIVE; /* @@ -1471,7 +1482,7 @@ static SRes SzReadHeader2( static SRes SzReadHeader( CSzArEx *p, CSzData *sd, - ILookInStream *inStream, + ILookInStreamPtr inStream, ISzAllocPtr allocMain, ISzAllocPtr allocTemp) { @@ -1490,7 +1501,7 @@ static SRes SzReadHeader( for (i = 0; i < NUM_ADDITIONAL_STREAMS_MAX; i++) Buf_Free(tempBufs + i, allocTemp); - RINOK(res); + RINOK(res) if (sd->Size != 0) return SZ_ERROR_FAIL; @@ -1500,7 +1511,7 @@ static SRes SzReadHeader( static SRes SzArEx_Open2( CSzArEx *p, - ILookInStream *inStream, + ILookInStreamPtr inStream, ISzAllocPtr allocMain, ISzAllocPtr allocTemp) { @@ -1513,9 +1524,9 @@ static SRes SzArEx_Open2( SRes res; startArcPos = 0; - RINOK(ILookInStream_Seek(inStream, &startArcPos, SZ_SEEK_CUR)); + RINOK(ILookInStream_Seek(inStream, &startArcPos, SZ_SEEK_CUR)) - RINOK(LookInStream_Read2(inStream, header, k7zStartHeaderSize, SZ_ERROR_NO_ARCHIVE)); + RINOK(LookInStream_Read2(inStream, header, k7zStartHeaderSize, SZ_ERROR_NO_ARCHIVE)) if (!TestSignatureCandidate(header)) return SZ_ERROR_NO_ARCHIVE; @@ -1526,11 +1537,13 @@ static SRes SzArEx_Open2( nextHeaderSize = GetUi64(header + 20); nextHeaderCRC = GetUi32(header + 28); - p->startPosAfterHeader = startArcPos + k7zStartHeaderSize; + p->startPosAfterHeader = (UInt64)startArcPos + k7zStartHeaderSize; if (CrcCalc(header + 12, 20) != GetUi32(header + 8)) return SZ_ERROR_CRC; + p->db.RangeLimit = nextHeaderOffset; + nextHeaderSizeT = (size_t)nextHeaderSize; if (nextHeaderSizeT != nextHeaderSize) return SZ_ERROR_MEM; @@ -1542,14 +1555,14 @@ static SRes SzArEx_Open2( { Int64 pos = 0; - RINOK(ILookInStream_Seek(inStream, &pos, SZ_SEEK_END)); - if ((UInt64)pos < startArcPos + nextHeaderOffset || - (UInt64)pos < startArcPos + k7zStartHeaderSize + nextHeaderOffset || - (UInt64)pos < startArcPos + k7zStartHeaderSize + nextHeaderOffset + nextHeaderSize) + RINOK(ILookInStream_Seek(inStream, &pos, SZ_SEEK_END)) + if ((UInt64)pos < (UInt64)startArcPos + nextHeaderOffset || + (UInt64)pos < (UInt64)startArcPos + k7zStartHeaderSize + nextHeaderOffset || + (UInt64)pos < (UInt64)startArcPos + k7zStartHeaderSize + nextHeaderOffset + nextHeaderSize) return SZ_ERROR_INPUT_EOF; } - RINOK(LookInStream_SeekTo(inStream, startArcPos + k7zStartHeaderSize + nextHeaderOffset)); + RINOK(LookInStream_SeekTo(inStream, (UInt64)startArcPos + k7zStartHeaderSize + nextHeaderOffset)) if (!Buf_Create(&buf, nextHeaderSizeT, allocTemp)) return SZ_ERROR_MEM; @@ -1575,6 +1588,8 @@ static SRes SzArEx_Open2( Buf_Init(&tempBuf); SzAr_Init(&tempAr); + tempAr.RangeLimit = p->db.RangeLimit; + res = SzReadAndDecodePackedStreams(inStream, &sd, &tempBuf, 1, p->startPosAfterHeader, &tempAr, allocTemp); SzAr_Free(&tempAr, allocTemp); @@ -1622,10 +1637,10 @@ static SRes SzArEx_Open2( } -SRes SzArEx_Open(CSzArEx *p, ILookInStream *inStream, +SRes SzArEx_Open(CSzArEx *p, ILookInStreamPtr inStream, ISzAllocPtr allocMain, ISzAllocPtr allocTemp) { - 
SRes res = SzArEx_Open2(p, inStream, allocMain, allocTemp); + const SRes res = SzArEx_Open2(p, inStream, allocMain, allocTemp); if (res != SZ_OK) SzArEx_Free(p, allocMain); return res; @@ -1634,7 +1649,7 @@ SRes SzArEx_Open(CSzArEx *p, ILookInStream *inStream, SRes SzArEx_Extract( const CSzArEx *p, - ILookInStream *inStream, + ILookInStreamPtr inStream, UInt32 fileIndex, UInt32 *blockIndex, Byte **tempBuf, @@ -1644,7 +1659,7 @@ SRes SzArEx_Extract( ISzAllocPtr allocMain, ISzAllocPtr allocTemp) { - UInt32 folderIndex = p->FileToFolder[fileIndex]; + const UInt32 folderIndex = p->FileToFolder[fileIndex]; SRes res = SZ_OK; *offset = 0; @@ -1661,13 +1676,13 @@ SRes SzArEx_Extract( if (*tempBuf == NULL || *blockIndex != folderIndex) { - UInt64 unpackSizeSpec = SzAr_GetFolderUnpackSize(&p->db, folderIndex); + const UInt64 unpackSizeSpec = SzAr_GetFolderUnpackSize(&p->db, folderIndex); /* UInt64 unpackSizeSpec = p->UnpackPositions[p->FolderToFile[(size_t)folderIndex + 1]] - p->UnpackPositions[p->FolderToFile[folderIndex]]; */ - size_t unpackSize = (size_t)unpackSizeSpec; + const size_t unpackSize = (size_t)unpackSizeSpec; if (unpackSize != unpackSizeSpec) return SZ_ERROR_MEM; @@ -1695,7 +1710,7 @@ SRes SzArEx_Extract( if (res == SZ_OK) { - UInt64 unpackPos = p->UnpackPositions[fileIndex]; + const UInt64 unpackPos = p->UnpackPositions[fileIndex]; *offset = (size_t)(unpackPos - p->UnpackPositions[p->FolderToFile[folderIndex]]); *outSizeProcessed = (size_t)(p->UnpackPositions[(size_t)fileIndex + 1] - unpackPos); if (*offset + *outSizeProcessed > *outBufferSize) @@ -1711,8 +1726,8 @@ SRes SzArEx_Extract( size_t SzArEx_GetFileNameUtf16(const CSzArEx *p, size_t fileIndex, UInt16 *dest) { - size_t offs = p->FileNameOffsets[fileIndex]; - size_t len = p->FileNameOffsets[fileIndex + 1] - offs; + const size_t offs = p->FileNameOffsets[fileIndex]; + const size_t len = p->FileNameOffsets[fileIndex + 1] - offs; if (dest != 0) { size_t i; diff --git a/src/sdk/C/7zBuf.h b/src/sdk/C/7zBuf.h index 81d1b5b..c0ba8a7 100644 --- a/src/sdk/C/7zBuf.h +++ b/src/sdk/C/7zBuf.h @@ -1,8 +1,8 @@ /* 7zBuf.h -- Byte Buffer -2017-04-03 : Igor Pavlov : Public domain */ +2023-03-04 : Igor Pavlov : Public domain */ -#ifndef __7Z_BUF_H -#define __7Z_BUF_H +#ifndef ZIP7_INC_7Z_BUF_H +#define ZIP7_INC_7Z_BUF_H #include "7zTypes.h" diff --git a/src/sdk/C/7zCrc.c b/src/sdk/C/7zCrc.c index b4d84f0..6e2db9e 100644 --- a/src/sdk/C/7zCrc.c +++ b/src/sdk/C/7zCrc.c @@ -1,128 +1,420 @@ -/* 7zCrc.c -- CRC32 init -2017-06-06 : Igor Pavlov : Public domain */ +/* 7zCrc.c -- CRC32 calculation and init +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" #include "7zCrc.h" #include "CpuArch.h" -#define kCrcPoly 0xEDB88320 +// for debug: +// #define __ARM_FEATURE_CRC32 1 -#ifdef MY_CPU_LE - #define CRC_NUM_TABLES 8 +#ifdef __ARM_FEATURE_CRC32 +// #pragma message("__ARM_FEATURE_CRC32") +#define Z7_CRC_HW_FORCE +#endif + +// #define Z7_CRC_DEBUG_BE +#ifdef Z7_CRC_DEBUG_BE +#undef MY_CPU_LE +#define MY_CPU_BE +#endif + +#ifdef Z7_CRC_HW_FORCE + #define Z7_CRC_NUM_TABLES_USE 1 #else - #define CRC_NUM_TABLES 9 +#ifdef Z7_CRC_NUM_TABLES + #define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES +#else + #define Z7_CRC_NUM_TABLES_USE 12 +#endif +#endif - #define CRC_UINT32_SWAP(v) ((v >> 24) | ((v >> 8) & 0xFF00) | ((v << 8) & 0xFF0000) | (v << 24)) +#if Z7_CRC_NUM_TABLES_USE < 1 + #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES +#endif - UInt32 MY_FAST_CALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table); - UInt32 MY_FAST_CALL 
CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table); +#if defined(MY_CPU_LE) || (Z7_CRC_NUM_TABLES_USE == 1) + #define Z7_CRC_NUM_TABLES_TOTAL Z7_CRC_NUM_TABLES_USE +#else + #define Z7_CRC_NUM_TABLES_TOTAL (Z7_CRC_NUM_TABLES_USE + 1) #endif +#ifndef Z7_CRC_HW_FORCE + +#if Z7_CRC_NUM_TABLES_USE == 1 \ + || (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) +#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +#define Z7_CRC_UPDATE_T1_FUNC_NAME CrcUpdateGT1 +static UInt32 Z7_FASTCALL Z7_CRC_UPDATE_T1_FUNC_NAME(UInt32 v, const void *data, size_t size) +{ + const UInt32 *table = g_CrcTable; + const Byte *p = (const Byte *)data; + const Byte *lim = p + size; + for (; p != lim; p++) + v = CRC_UPDATE_BYTE_2(v, *p); + return v; +} +#endif + + +#if Z7_CRC_NUM_TABLES_USE != 1 #ifndef MY_CPU_BE - UInt32 MY_FAST_CALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table); - UInt32 MY_FAST_CALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table); + #define FUNC_NAME_LE_2(s) CrcUpdateT ## s + #define FUNC_NAME_LE_1(s) FUNC_NAME_LE_2(s) + #define FUNC_NAME_LE FUNC_NAME_LE_1(Z7_CRC_NUM_TABLES_USE) + UInt32 Z7_FASTCALL FUNC_NAME_LE (UInt32 v, const void *data, size_t size, const UInt32 *table); +#endif +#ifndef MY_CPU_LE + #define FUNC_NAME_BE_2(s) CrcUpdateT1_BeT ## s + #define FUNC_NAME_BE_1(s) FUNC_NAME_BE_2(s) + #define FUNC_NAME_BE FUNC_NAME_BE_1(Z7_CRC_NUM_TABLES_USE) + UInt32 Z7_FASTCALL FUNC_NAME_BE (UInt32 v, const void *data, size_t size, const UInt32 *table); +#endif #endif -typedef UInt32 (MY_FAST_CALL *CRC_FUNC)(UInt32 v, const void *data, size_t size, const UInt32 *table); +#endif // Z7_CRC_HW_FORCE + +/* ---------- hardware CRC ---------- */ + +#ifdef MY_CPU_LE + +#if defined(MY_CPU_ARM_OR_ARM64) +// #pragma message("ARM*") -CRC_FUNC g_CrcUpdateT4; -CRC_FUNC g_CrcUpdateT8; -CRC_FUNC g_CrcUpdate; + #if (defined(__clang__) && (__clang_major__ >= 3)) \ + || defined(__GNUC__) && (__GNUC__ >= 6) && defined(MY_CPU_ARM64) \ + || defined(__GNUC__) && (__GNUC__ >= 8) + #if !defined(__ARM_FEATURE_CRC32) +// #pragma message("!defined(__ARM_FEATURE_CRC32)") +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #define __ARM_FEATURE_CRC32 1 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER + #define Z7_ARM_FEATURE_CRC32_WAS_SET + #if defined(__clang__) + #if defined(MY_CPU_ARM64) + #define ATTRIB_CRC __attribute__((__target__("crc"))) + #else + #define ATTRIB_CRC __attribute__((__target__("armv8-a,crc"))) + #endif + #else + #if defined(MY_CPU_ARM64) +#if !defined(Z7_GCC_VERSION) || (Z7_GCC_VERSION >= 60000) + #define ATTRIB_CRC __attribute__((__target__("+crc"))) +#endif + #else +#if !defined(Z7_GCC_VERSION) || (__GNUC__ >= 8) +#if defined(__ARM_FP) && __GNUC__ >= 8 +// for -mfloat-abi=hard: similar to + #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc+simd"))) +#else + #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc"))) +#endif +#endif + #endif + #endif + #endif + #if defined(__ARM_FEATURE_CRC32) + // #pragma message("") +/* +arm_acle.h (GGC): + before Nov 17, 2017: +#ifdef __ARM_FEATURE_CRC32 -UInt32 g_CrcTable[256 * CRC_NUM_TABLES]; + Nov 17, 2017: gcc10.0 (gcc 9.2.0) checked" +#if __ARM_ARCH >= 8 +#pragma GCC target ("arch=armv8-a+crc") -UInt32 MY_FAST_CALL CrcUpdate(UInt32 v, const void *data, size_t size) + Aug 22, 2019: GCC 8.4?, 9.2.1, 10.1: +#ifdef __ARM_FEATURE_CRC32 +#ifdef __ARM_FP +#pragma GCC target ("arch=armv8-a+crc+simd") +#else +#pragma GCC target 
("arch=armv8-a+crc") +#endif +*/ +#if defined(__ARM_ARCH) && __ARM_ARCH < 8 +#if defined(Z7_GCC_VERSION) && (__GNUC__ == 8) && (Z7_GCC_VERSION < 80400) \ + || defined(Z7_GCC_VERSION) && (__GNUC__ == 9) && (Z7_GCC_VERSION < 90201) \ + || defined(Z7_GCC_VERSION) && (__GNUC__ == 10) && (Z7_GCC_VERSION < 100100) +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +// #pragma message("#define __ARM_ARCH 8") +#undef __ARM_ARCH +#define __ARM_ARCH 8 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif +#endif + #define Z7_CRC_HW_USE + #include + #endif + #elif defined(_MSC_VER) + #if defined(MY_CPU_ARM64) + #if (_MSC_VER >= 1910) + #ifdef __clang__ + // #define Z7_CRC_HW_USE + // #include + #else + #define Z7_CRC_HW_USE + #include + #endif + #endif + #endif + #endif + +#else // non-ARM* + +// #define Z7_CRC_HW_USE // for debug : we can test HW-branch of code +#ifdef Z7_CRC_HW_USE +#include "7zCrcEmu.h" +#endif + +#endif // non-ARM* + + + +#if defined(Z7_CRC_HW_USE) + +// #pragma message("USE ARM HW CRC") + +#ifdef MY_CPU_64BIT + #define CRC_HW_WORD_TYPE UInt64 + #define CRC_HW_WORD_FUNC __crc32d +#else + #define CRC_HW_WORD_TYPE UInt32 + #define CRC_HW_WORD_FUNC __crc32w +#endif + +#define CRC_HW_UNROLL_BYTES (sizeof(CRC_HW_WORD_TYPE) * 4) + +#ifdef ATTRIB_CRC + ATTRIB_CRC +#endif +Z7_NO_INLINE +#ifdef Z7_CRC_HW_FORCE + UInt32 Z7_FASTCALL CrcUpdate +#else + static UInt32 Z7_FASTCALL CrcUpdate_HW +#endif + (UInt32 v, const void *data, size_t size) { - return g_CrcUpdate(v, data, size, g_CrcTable); + const Byte *p = (const Byte *)data; + for (; size != 0 && ((unsigned)(ptrdiff_t)p & (CRC_HW_UNROLL_BYTES - 1)) != 0; size--) + v = __crc32b(v, *p++); + if (size >= CRC_HW_UNROLL_BYTES) + { + const Byte *lim = p + size; + size &= CRC_HW_UNROLL_BYTES - 1; + lim -= size; + do + { + v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p)); + v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE))); + p += 2 * sizeof(CRC_HW_WORD_TYPE); + v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p)); + v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE))); + p += 2 * sizeof(CRC_HW_WORD_TYPE); + } + while (p != lim); + } + + for (; size != 0; size--) + v = __crc32b(v, *p++); + + return v; } -UInt32 MY_FAST_CALL CrcCalc(const void *data, size_t size) +#ifdef Z7_ARM_FEATURE_CRC32_WAS_SET +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +#undef __ARM_FEATURE_CRC32 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#undef Z7_ARM_FEATURE_CRC32_WAS_SET +#endif + +#endif // defined(Z7_CRC_HW_USE) +#endif // MY_CPU_LE + + + +#ifndef Z7_CRC_HW_FORCE + +#if defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) +/* +typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_WITH_TABLE_FUNC) + (UInt32 v, const void *data, size_t size, const UInt32 *table); +Z7_CRC_UPDATE_WITH_TABLE_FUNC g_CrcUpdate; +*/ +static unsigned g_Crc_Algo; +#if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) +static unsigned g_Crc_Be; +#endif +#endif // defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) + + + +Z7_NO_INLINE +#ifdef Z7_CRC_HW_USE + static UInt32 Z7_FASTCALL CrcUpdate_Base +#else + UInt32 Z7_FASTCALL CrcUpdate +#endif + (UInt32 crc, const void *data, size_t size) { - return g_CrcUpdate(CRC_INIT_VAL, data, size, g_CrcTable) ^ CRC_INIT_VAL; +#if Z7_CRC_NUM_TABLES_USE == 1 + return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size); +#else // Z7_CRC_NUM_TABLES_USE != 1 +#ifdef Z7_CRC_UPDATE_T1_FUNC_NAME + if (g_Crc_Algo == 
1) + return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size); +#endif + +#ifdef MY_CPU_LE + return FUNC_NAME_LE(crc, data, size, g_CrcTable); +#elif defined(MY_CPU_BE) + return FUNC_NAME_BE(crc, data, size, g_CrcTable); +#else + if (g_Crc_Be) + return FUNC_NAME_BE(crc, data, size, g_CrcTable); + else + return FUNC_NAME_LE(crc, data, size, g_CrcTable); +#endif +#endif // Z7_CRC_NUM_TABLES_USE != 1 } -#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) -UInt32 MY_FAST_CALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table) +#ifdef Z7_CRC_HW_USE +Z7_NO_INLINE +UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size) { - const Byte *p = (const Byte *)data; - const Byte *pEnd = p + size; - for (; p != pEnd; p++) - v = CRC_UPDATE_BYTE_2(v, *p); - return v; + if (g_Crc_Algo == 0) + return CrcUpdate_HW(crc, data, size); + return CrcUpdate_Base(crc, data, size); +} +#endif + +#endif // !defined(Z7_CRC_HW_FORCE) + + + +UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size) +{ + return CrcUpdate(CRC_INIT_VAL, data, size) ^ CRC_INIT_VAL; } -void MY_FAST_CALL CrcGenerateTable() + +MY_ALIGN(64) +UInt32 g_CrcTable[256 * Z7_CRC_NUM_TABLES_TOTAL]; + + +void Z7_FASTCALL CrcGenerateTable(void) { UInt32 i; for (i = 0; i < 256; i++) { +#if defined(Z7_CRC_HW_FORCE) + g_CrcTable[i] = __crc32b(i, 0); +#else + #define kCrcPoly 0xEDB88320 UInt32 r = i; unsigned j; for (j = 0; j < 8; j++) r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1))); g_CrcTable[i] = r; +#endif } - for (i = 256; i < 256 * CRC_NUM_TABLES; i++) + for (i = 256; i < 256 * Z7_CRC_NUM_TABLES_USE; i++) { - UInt32 r = g_CrcTable[(size_t)i - 256]; + const UInt32 r = g_CrcTable[(size_t)i - 256]; g_CrcTable[i] = g_CrcTable[r & 0xFF] ^ (r >> 8); } - #if CRC_NUM_TABLES < 4 - - g_CrcUpdate = CrcUpdateT1; - - #else - - #ifdef MY_CPU_LE - - g_CrcUpdateT4 = CrcUpdateT4; - g_CrcUpdate = CrcUpdateT4; +#if !defined(Z7_CRC_HW_FORCE) && \ + (defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) || defined(MY_CPU_BE)) - #if CRC_NUM_TABLES >= 8 - g_CrcUpdateT8 = CrcUpdateT8; - - #ifdef MY_CPU_X86_OR_AMD64 - if (!CPU_Is_InOrder()) - #endif - g_CrcUpdate = CrcUpdateT8; - #endif +#if Z7_CRC_NUM_TABLES_USE <= 1 + g_Crc_Algo = 1; +#else // Z7_CRC_NUM_TABLES_USE <= 1 - #else +#if defined(MY_CPU_LE) + g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; +#else // !defined(MY_CPU_LE) { - #ifndef MY_CPU_BE +#ifndef MY_CPU_BE UInt32 k = 0x01020304; const Byte *p = (const Byte *)&k; if (p[0] == 4 && p[1] == 3) - { - g_CrcUpdateT4 = CrcUpdateT4; - g_CrcUpdate = CrcUpdateT4; - #if CRC_NUM_TABLES >= 8 - g_CrcUpdateT8 = CrcUpdateT8; - g_CrcUpdate = CrcUpdateT8; - #endif - } + g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; else if (p[0] != 1 || p[1] != 2) - g_CrcUpdate = CrcUpdateT1; + g_Crc_Algo = 1; else - #endif +#endif // MY_CPU_BE { - for (i = 256 * CRC_NUM_TABLES - 1; i >= 256; i--) + for (i = 256 * Z7_CRC_NUM_TABLES_TOTAL - 1; i >= 256; i--) { - UInt32 x = g_CrcTable[(size_t)i - 256]; - g_CrcTable[i] = CRC_UINT32_SWAP(x); + const UInt32 x = g_CrcTable[(size_t)i - 256]; + g_CrcTable[i] = Z7_BSWAP32(x); } - g_CrcUpdateT4 = CrcUpdateT1_BeT4; - g_CrcUpdate = CrcUpdateT1_BeT4; - #if CRC_NUM_TABLES >= 8 - g_CrcUpdateT8 = CrcUpdateT1_BeT8; - g_CrcUpdate = CrcUpdateT1_BeT8; - #endif +#if defined(Z7_CRC_UPDATE_T1_FUNC_NAME) + g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; +#endif +#if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) + g_Crc_Be = 1; +#endif } } - #endif +#endif // !defined(MY_CPU_LE) + +#ifdef MY_CPU_LE +#ifdef Z7_CRC_HW_USE + if 
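Illustrative sketch (not part of the upstream diff): the table initialisation in CrcGenerateTable() above expands the reflected CRC-32 polynomial kCrcPoly = 0xEDB88320 into g_CrcTable. The self-contained, table-free routine below computes the same checksum one bit at a time; CrcCalc() in 7zCrc.c is expected to return the same value for the same input, only faster. The function name is ours, chosen for illustration.

#include <stddef.h>
#include <stdint.h>

/* bit-at-a-time CRC-32 over the reflected polynomial 0xEDB88320 */
static uint32_t crc32_bytewise(const void *data, size_t size)
{
  const unsigned char *p = (const unsigned char *)data;
  uint32_t crc = 0xFFFFFFFFu;                /* CRC_INIT_VAL */
  for (; size != 0; size--, p++)
  {
    unsigned j;
    crc ^= *p;
    for (j = 0; j < 8; j++)
      crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));
  }
  return crc ^ 0xFFFFFFFFu;                  /* CRC_GET_DIGEST */
}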
(CPU_IsSupported_CRC32()) + g_Crc_Algo = 0; +#endif // Z7_CRC_HW_USE +#endif // MY_CPU_LE + +#endif // Z7_CRC_NUM_TABLES_USE <= 1 +#endif // g_Crc_Algo was declared +} + +Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo) +{ + if (algo == 0) + return &CrcUpdate; + +#if defined(Z7_CRC_HW_USE) + if (algo == sizeof(CRC_HW_WORD_TYPE) * 8) + { +#ifdef Z7_CRC_HW_FORCE + return &CrcUpdate; +#else + if (g_Crc_Algo == 0) + return &CrcUpdate_HW; +#endif + } +#endif +#ifndef Z7_CRC_HW_FORCE + if (algo == Z7_CRC_NUM_TABLES_USE) + return + #ifdef Z7_CRC_HW_USE + &CrcUpdate_Base; + #else + &CrcUpdate; #endif +#endif + + return NULL; } + +#undef kCrcPoly +#undef Z7_CRC_NUM_TABLES_USE +#undef Z7_CRC_NUM_TABLES_TOTAL +#undef CRC_UPDATE_BYTE_2 +#undef FUNC_NAME_LE_2 +#undef FUNC_NAME_LE_1 +#undef FUNC_NAME_LE +#undef FUNC_NAME_BE_2 +#undef FUNC_NAME_BE_1 +#undef FUNC_NAME_BE + +#undef CRC_HW_UNROLL_BYTES +#undef CRC_HW_WORD_FUNC +#undef CRC_HW_WORD_TYPE diff --git a/src/sdk/C/7zCrc.h b/src/sdk/C/7zCrc.h index 8fd5795..3e6d408 100644 --- a/src/sdk/C/7zCrc.h +++ b/src/sdk/C/7zCrc.h @@ -1,8 +1,8 @@ /* 7zCrc.h -- CRC32 calculation -2013-01-18 : Igor Pavlov : Public domain */ +2024-01-22 : Igor Pavlov : Public domain */ -#ifndef __7Z_CRC_H -#define __7Z_CRC_H +#ifndef ZIP7_INC_7Z_CRC_H +#define ZIP7_INC_7Z_CRC_H #include "7zTypes.h" @@ -11,14 +11,17 @@ EXTERN_C_BEGIN extern UInt32 g_CrcTable[]; /* Call CrcGenerateTable one time before other CRC functions */ -void MY_FAST_CALL CrcGenerateTable(void); +void Z7_FASTCALL CrcGenerateTable(void); #define CRC_INIT_VAL 0xFFFFFFFF #define CRC_GET_DIGEST(crc) ((crc) ^ CRC_INIT_VAL) #define CRC_UPDATE_BYTE(crc, b) (g_CrcTable[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) -UInt32 MY_FAST_CALL CrcUpdate(UInt32 crc, const void *data, size_t size); -UInt32 MY_FAST_CALL CrcCalc(const void *data, size_t size); +UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size); +UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size); + +typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_FUNC)(UInt32 v, const void *data, size_t size); +Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo); EXTERN_C_END diff --git a/src/sdk/C/7zCrcOpt.c b/src/sdk/C/7zCrcOpt.c index 73beba2..9408017 100644 --- a/src/sdk/C/7zCrcOpt.c +++ b/src/sdk/C/7zCrcOpt.c @@ -1,115 +1,199 @@ -/* 7zCrcOpt.c -- CRC32 calculation -2017-04-03 : Igor Pavlov : Public domain */ +/* 7zCrcOpt.c -- CRC32 calculation (optimized functions) +2023-12-07 : Igor Pavlov : Public domain */ #include "Precomp.h" #include "CpuArch.h" +#if !defined(Z7_CRC_NUM_TABLES) || Z7_CRC_NUM_TABLES > 1 + +// for debug only : define Z7_CRC_DEBUG_BE to test big-endian code in little-endian cpu +// #define Z7_CRC_DEBUG_BE +#ifdef Z7_CRC_DEBUG_BE +#undef MY_CPU_LE +#define MY_CPU_BE +#endif + +// the value Z7_CRC_NUM_TABLES_USE must be defined to same value as in 7zCrc.c +#ifdef Z7_CRC_NUM_TABLES +#define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES +#else +#define Z7_CRC_NUM_TABLES_USE 12 +#endif + +#if Z7_CRC_NUM_TABLES_USE % 4 || \ + Z7_CRC_NUM_TABLES_USE < 4 * 1 || \ + Z7_CRC_NUM_TABLES_USE > 4 * 6 + #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES +#endif + + #ifndef MY_CPU_BE -#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) -UInt32 MY_FAST_CALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table) -{ - const Byte *p = (const Byte *)data; - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) - v = 
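Illustrative usage sketch (not part of the upstream diff): the renamed 7zCrc.h interface above keeps the old call pattern -- generate the tables once, then use CrcCalc() for one-shot hashing or CrcUpdate() for incremental hashing -- and adds z7_GetFunc_CrcUpdate(algo) for callers that want to select a specific implementation. A minimal caller, assuming the SDK's C directory is on the include path, might look like this:

#include <stdio.h>
#include <string.h>
#include "7zCrc.h"

int main(void)
{
  const char *msg = "hello pylzma";
  UInt32 crc1, crc2;

  CrcGenerateTable();                  /* must run once before any CRC call */

  crc1 = CrcCalc(msg, strlen(msg));    /* one-shot */

  /* incremental: start from CRC_INIT_VAL, finish with CRC_GET_DIGEST */
  crc2 = CRC_INIT_VAL;
  crc2 = CrcUpdate(crc2, msg, 5);
  crc2 = CrcUpdate(crc2, msg + 5, strlen(msg) - 5);
  crc2 = CRC_GET_DIGEST(crc2);

  printf("%08X %08X\n", (unsigned)crc1, (unsigned)crc2);
  return crc1 == crc2 ? 0 : 1;
}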
CRC_UPDATE_BYTE_2(v, *p); - for (; size >= 4; size -= 4, p += 4) - { - v ^= *(const UInt32 *)p; - v = - (table + 0x300)[((v ) & 0xFF)] - ^ (table + 0x200)[((v >> 8) & 0xFF)] - ^ (table + 0x100)[((v >> 16) & 0xFF)] - ^ (table + 0x000)[((v >> 24))]; - } - for (; size > 0; size--, p++) - v = CRC_UPDATE_BYTE_2(v, *p); - return v; -} +#define Q(n, d) \ + ( (table + ((n) * 4 + 3) * 0x100)[(Byte)(d)] \ + ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 0) * 0x100)[((d) >> 3 * 8)] ) + +#define R(a) *((const UInt32 *)(const void *)p + (a)) + +#define CRC_FUNC_PRE_LE2(step) \ +UInt32 Z7_FASTCALL CrcUpdateT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table) -UInt32 MY_FAST_CALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table) +#define CRC_FUNC_PRE_LE(step) \ + CRC_FUNC_PRE_LE2(step); \ + CRC_FUNC_PRE_LE2(step) + +CRC_FUNC_PRE_LE(Z7_CRC_NUM_TABLES_USE) { const Byte *p = (const Byte *)data; - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++) + const Byte *lim; + for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++) v = CRC_UPDATE_BYTE_2(v, *p); - for (; size >= 8; size -= 8, p += 8) + lim = p + size; + if (size >= Z7_CRC_NUM_TABLES_USE) { - UInt32 d; - v ^= *(const UInt32 *)p; - v = - (table + 0x700)[((v ) & 0xFF)] - ^ (table + 0x600)[((v >> 8) & 0xFF)] - ^ (table + 0x500)[((v >> 16) & 0xFF)] - ^ (table + 0x400)[((v >> 24))]; - d = *((const UInt32 *)p + 1); - v ^= - (table + 0x300)[((d ) & 0xFF)] - ^ (table + 0x200)[((d >> 8) & 0xFF)] - ^ (table + 0x100)[((d >> 16) & 0xFF)] - ^ (table + 0x000)[((d >> 24))]; + lim -= Z7_CRC_NUM_TABLES_USE; + do + { + v ^= R(0); + { +#if Z7_CRC_NUM_TABLES_USE == 1 * 4 + v = Q(0, v); +#else +#define U2(r, op) \ + { d = R(r); x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); } + UInt32 d, x; + U2(1, =) +#if Z7_CRC_NUM_TABLES_USE >= 3 * 4 +#define U(r) U2(r, ^=) + U(2) +#if Z7_CRC_NUM_TABLES_USE >= 4 * 4 + U(3) +#if Z7_CRC_NUM_TABLES_USE >= 5 * 4 + U(4) +#if Z7_CRC_NUM_TABLES_USE >= 6 * 4 + U(5) +#if Z7_CRC_NUM_TABLES_USE >= 7 * 4 +#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES +#endif +#endif +#endif +#endif +#endif +#undef U +#undef U2 + v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v); +#endif + } + p += Z7_CRC_NUM_TABLES_USE; + } + while (p <= lim); + lim += Z7_CRC_NUM_TABLES_USE; } - for (; size > 0; size--, p++) + for (; p < lim; p++) v = CRC_UPDATE_BYTE_2(v, *p); return v; } +#undef CRC_UPDATE_BYTE_2 +#undef R +#undef Q +#undef CRC_FUNC_PRE_LE +#undef CRC_FUNC_PRE_LE2 + #endif + + #ifndef MY_CPU_LE -#define CRC_UINT32_SWAP(v) ((v >> 24) | ((v >> 8) & 0xFF00) | ((v << 8) & 0xFF0000) | (v << 24)) +#define CRC_UPDATE_BYTE_2_BE(crc, b) (table[((crc) >> 24) ^ (b)] ^ ((crc) << 8)) -#define CRC_UPDATE_BYTE_2_BE(crc, b) (table[(((crc) >> 24) ^ (b))] ^ ((crc) << 8)) +#define Q(n, d) \ + ( (table + ((n) * 4 + 0) * 0x100)[((d)) & 0xFF] \ + ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 3) * 0x100)[((d) >> 3 * 8)] ) -UInt32 MY_FAST_CALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table) -{ - const Byte *p = (const Byte *)data; - table += 0x100; - v = CRC_UINT32_SWAP(v); - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) - v = CRC_UPDATE_BYTE_2_BE(v, *p); - for (; size >= 4; size -= 4, p += 4) - { - v ^= *(const UInt32 *)p; - v = - (table + 0x000)[((v ) & 
0xFF)] - ^ (table + 0x100)[((v >> 8) & 0xFF)] - ^ (table + 0x200)[((v >> 16) & 0xFF)] - ^ (table + 0x300)[((v >> 24))]; - } - for (; size > 0; size--, p++) - v = CRC_UPDATE_BYTE_2_BE(v, *p); - return CRC_UINT32_SWAP(v); -} +#ifdef Z7_CRC_DEBUG_BE + #define R(a) GetBe32a((const UInt32 *)(const void *)p + (a)) +#else + #define R(a) *((const UInt32 *)(const void *)p + (a)) +#endif + + +#define CRC_FUNC_PRE_BE2(step) \ +UInt32 Z7_FASTCALL CrcUpdateT1_BeT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table) -UInt32 MY_FAST_CALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table) +#define CRC_FUNC_PRE_BE(step) \ + CRC_FUNC_PRE_BE2(step); \ + CRC_FUNC_PRE_BE2(step) + +CRC_FUNC_PRE_BE(Z7_CRC_NUM_TABLES_USE) { const Byte *p = (const Byte *)data; + const Byte *lim; table += 0x100; - v = CRC_UINT32_SWAP(v); - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++) + v = Z7_BSWAP32(v); + for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++) v = CRC_UPDATE_BYTE_2_BE(v, *p); - for (; size >= 8; size -= 8, p += 8) + lim = p + size; + if (size >= Z7_CRC_NUM_TABLES_USE) { - UInt32 d; - v ^= *(const UInt32 *)p; - v = - (table + 0x400)[((v ) & 0xFF)] - ^ (table + 0x500)[((v >> 8) & 0xFF)] - ^ (table + 0x600)[((v >> 16) & 0xFF)] - ^ (table + 0x700)[((v >> 24))]; - d = *((const UInt32 *)p + 1); - v ^= - (table + 0x000)[((d ) & 0xFF)] - ^ (table + 0x100)[((d >> 8) & 0xFF)] - ^ (table + 0x200)[((d >> 16) & 0xFF)] - ^ (table + 0x300)[((d >> 24))]; + lim -= Z7_CRC_NUM_TABLES_USE; + do + { + v ^= R(0); + { +#if Z7_CRC_NUM_TABLES_USE == 1 * 4 + v = Q(0, v); +#else +#define U2(r, op) \ + { d = R(r); x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); } + UInt32 d, x; + U2(1, =) +#if Z7_CRC_NUM_TABLES_USE >= 3 * 4 +#define U(r) U2(r, ^=) + U(2) +#if Z7_CRC_NUM_TABLES_USE >= 4 * 4 + U(3) +#if Z7_CRC_NUM_TABLES_USE >= 5 * 4 + U(4) +#if Z7_CRC_NUM_TABLES_USE >= 6 * 4 + U(5) +#if Z7_CRC_NUM_TABLES_USE >= 7 * 4 +#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES +#endif +#endif +#endif +#endif +#endif +#undef U +#undef U2 + v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v); +#endif + } + p += Z7_CRC_NUM_TABLES_USE; + } + while (p <= lim); + lim += Z7_CRC_NUM_TABLES_USE; } - for (; size > 0; size--, p++) + for (; p < lim; p++) v = CRC_UPDATE_BYTE_2_BE(v, *p); - return CRC_UINT32_SWAP(v); + return Z7_BSWAP32(v); } +#undef CRC_UPDATE_BYTE_2_BE +#undef R +#undef Q +#undef CRC_FUNC_PRE_BE +#undef CRC_FUNC_PRE_BE2 + +#endif +#undef Z7_CRC_NUM_TABLES_USE #endif diff --git a/src/sdk/C/7zDec.c b/src/sdk/C/7zDec.c index 7c46352..520cbfd 100644 --- a/src/sdk/C/7zDec.c +++ b/src/sdk/C/7zDec.c @@ -1,11 +1,11 @@ /* 7zDec.c -- Decoding from 7z folder -2019-02-02 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include -/* #define _7ZIP_PPMD_SUPPPORT */ +/* #define Z7_PPMD_SUPPORT */ #include "7z.h" #include "7zCrc.h" @@ -16,24 +16,50 @@ #include "Delta.h" #include "LzmaDec.h" #include "Lzma2Dec.h" -#ifdef _7ZIP_PPMD_SUPPPORT +#ifdef Z7_PPMD_SUPPORT #include "Ppmd7.h" #endif #define k_Copy 0 -#define k_Delta 3 +#ifndef Z7_NO_METHOD_LZMA2 #define k_LZMA2 0x21 +#endif #define k_LZMA 0x30101 -#define k_BCJ 0x3030103 #define k_BCJ2 0x303011B + +#if !defined(Z7_NO_METHODS_FILTERS) +#define Z7_USE_BRANCH_FILTER +#endif + +#if !defined(Z7_NO_METHODS_FILTERS) || \ + defined(Z7_USE_NATIVE_BRANCH_FILTER) && defined(MY_CPU_ARM64) +#define Z7_USE_FILTER_ARM64 +#ifndef Z7_USE_BRANCH_FILTER +#define Z7_USE_BRANCH_FILTER +#endif 
+#define k_ARM64 0xa +#endif + +#if !defined(Z7_NO_METHODS_FILTERS) || \ + defined(Z7_USE_NATIVE_BRANCH_FILTER) && defined(MY_CPU_ARMT) +#define Z7_USE_FILTER_ARMT +#ifndef Z7_USE_BRANCH_FILTER +#define Z7_USE_BRANCH_FILTER +#endif +#define k_ARMT 0x3030701 +#endif + +#ifndef Z7_NO_METHODS_FILTERS +#define k_Delta 3 +#define k_RISCV 0xb +#define k_BCJ 0x3030103 #define k_PPC 0x3030205 #define k_IA64 0x3030401 #define k_ARM 0x3030501 -#define k_ARMT 0x3030701 #define k_SPARC 0x3030805 +#endif - -#ifdef _7ZIP_PPMD_SUPPPORT +#ifdef Z7_PPMD_SUPPORT #define k_PPMD 0x30401 @@ -46,17 +72,17 @@ typedef struct UInt64 processed; BoolInt extra; SRes res; - const ILookInStream *inStream; + ILookInStreamPtr inStream; } CByteInToLook; -static Byte ReadByte(const IByteIn *pp) +static Byte ReadByte(IByteInPtr pp) { - CByteInToLook *p = CONTAINER_FROM_VTBL(pp, CByteInToLook, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CByteInToLook) if (p->cur != p->end) return *p->cur++; if (p->res == SZ_OK) { - size_t size = p->cur - p->begin; + size_t size = (size_t)(p->cur - p->begin); p->processed += size; p->res = ILookInStream_Skip(p->inStream, size); size = (1 << 25); @@ -64,13 +90,13 @@ static Byte ReadByte(const IByteIn *pp) p->cur = p->begin; p->end = p->begin + size; if (size != 0) - return *p->cur++;; + return *p->cur++; } p->extra = True; return 0; } -static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, const ILookInStream *inStream, +static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStreamPtr inStream, Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain) { CPpmd7 ppmd; @@ -101,28 +127,32 @@ static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, c Ppmd7_Init(&ppmd, order); } { - CPpmd7z_RangeDec rc; - Ppmd7z_RangeDec_CreateVTable(&rc); - rc.Stream = &s.vt; - if (!Ppmd7z_RangeDec_Init(&rc)) + ppmd.rc.dec.Stream = &s.vt; + if (!Ppmd7z_RangeDec_Init(&ppmd.rc.dec)) res = SZ_ERROR_DATA; - else if (s.extra) - res = (s.res != SZ_OK ? s.res : SZ_ERROR_DATA); - else + else if (!s.extra) { - SizeT i; - for (i = 0; i < outSize; i++) + Byte *buf = outBuffer; + const Byte *lim = buf + outSize; + for (; buf != lim; buf++) { - int sym = Ppmd7_DecodeSymbol(&ppmd, &rc.vt); + int sym = Ppmd7z_DecodeSymbol(&ppmd); if (s.extra || sym < 0) break; - outBuffer[i] = (Byte)sym; + *buf = (Byte)sym; } - if (i != outSize) - res = (s.res != SZ_OK ? s.res : SZ_ERROR_DATA); - else if (s.processed + (s.cur - s.begin) != inSize || !Ppmd7z_RangeDec_IsFinishedOK(&rc)) + if (buf != lim) res = SZ_ERROR_DATA; + else if (!Ppmd7z_RangeDec_IsFinishedOK(&ppmd.rc.dec)) + { + /* if (Ppmd7z_DecodeSymbol(&ppmd) != PPMD7_SYM_END || !Ppmd7z_RangeDec_IsFinishedOK(&ppmd.rc.dec)) */ + res = SZ_ERROR_DATA; + } } + if (s.extra) + res = (s.res != SZ_OK ? 
s.res : SZ_ERROR_DATA); + else if (s.processed + (size_t)(s.cur - s.begin) != inSize) + res = SZ_ERROR_DATA; } Ppmd7_Free(&ppmd, allocMain); return res; @@ -131,14 +161,14 @@ static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, c #endif -static SRes SzDecodeLzma(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStream *inStream, +static SRes SzDecodeLzma(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStreamPtr inStream, Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain) { CLzmaDec state; SRes res = SZ_OK; - LzmaDec_Construct(&state); - RINOK(LzmaDec_AllocateProbs(&state, props, propsSize, allocMain)); + LzmaDec_CONSTRUCT(&state) + RINOK(LzmaDec_AllocateProbs(&state, props, propsSize, allocMain)) state.dic = outBuffer; state.dicBufSize = outSize; LzmaDec_Init(&state); @@ -189,18 +219,18 @@ static SRes SzDecodeLzma(const Byte *props, unsigned propsSize, UInt64 inSize, I } -#ifndef _7Z_NO_METHOD_LZMA2 +#ifndef Z7_NO_METHOD_LZMA2 -static SRes SzDecodeLzma2(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStream *inStream, +static SRes SzDecodeLzma2(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStreamPtr inStream, Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain) { CLzma2Dec state; SRes res = SZ_OK; - Lzma2Dec_Construct(&state); + Lzma2Dec_CONSTRUCT(&state) if (propsSize != 1) return SZ_ERROR_DATA; - RINOK(Lzma2Dec_AllocateProbs(&state, props[0], allocMain)); + RINOK(Lzma2Dec_AllocateProbs(&state, props[0], allocMain)) state.decoder.dic = outBuffer; state.decoder.dicBufSize = outSize; Lzma2Dec_Init(&state); @@ -250,7 +280,7 @@ static SRes SzDecodeLzma2(const Byte *props, unsigned propsSize, UInt64 inSize, #endif -static SRes SzDecodeCopy(UInt64 inSize, ILookInStream *inStream, Byte *outBuffer) +static SRes SzDecodeCopy(UInt64 inSize, ILookInStreamPtr inStream, Byte *outBuffer) { while (inSize > 0) { @@ -258,13 +288,13 @@ static SRes SzDecodeCopy(UInt64 inSize, ILookInStream *inStream, Byte *outBuffer size_t curSize = (1 << 18); if (curSize > inSize) curSize = (size_t)inSize; - RINOK(ILookInStream_Look(inStream, &inBuf, &curSize)); + RINOK(ILookInStream_Look(inStream, &inBuf, &curSize)) if (curSize == 0) return SZ_ERROR_INPUT_EOF; memcpy(outBuffer, inBuf, curSize); outBuffer += curSize; inSize -= curSize; - RINOK(ILookInStream_Skip(inStream, curSize)); + RINOK(ILookInStream_Skip(inStream, curSize)) } return SZ_OK; } @@ -275,15 +305,16 @@ static BoolInt IS_MAIN_METHOD(UInt32 m) { case k_Copy: case k_LZMA: - #ifndef _7Z_NO_METHOD_LZMA2 + #ifndef Z7_NO_METHOD_LZMA2 case k_LZMA2: - #endif - #ifdef _7ZIP_PPMD_SUPPPORT + #endif + #ifdef Z7_PPMD_SUPPORT case k_PPMD: - #endif + #endif return True; + default: + return False; } - return False; } static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c) @@ -310,7 +341,7 @@ static SRes CheckSupportedFolder(const CSzFolder *f) } - #ifndef _7Z_NO_METHODS_FILTERS + #if defined(Z7_USE_BRANCH_FILTER) if (f->NumCoders == 2) { @@ -326,13 +357,21 @@ static SRes CheckSupportedFolder(const CSzFolder *f) return SZ_ERROR_UNSUPPORTED; switch ((UInt32)c->MethodID) { + #if !defined(Z7_NO_METHODS_FILTERS) case k_Delta: case k_BCJ: case k_PPC: case k_IA64: case k_SPARC: case k_ARM: + case k_RISCV: + #endif + #ifdef Z7_USE_FILTER_ARM64 + case k_ARM64: + #endif + #ifdef Z7_USE_FILTER_ARMT case k_ARMT: + #endif break; default: return SZ_ERROR_UNSUPPORTED; @@ -365,13 +404,16 @@ static SRes CheckSupportedFolder(const CSzFolder *f) return SZ_ERROR_UNSUPPORTED; } -#define CASE_BRA_CONV(isa) case 
k_ ## isa: isa ## _Convert(outBuffer, outSize, 0, 0); break; + + + + static SRes SzFolder_Decode2(const CSzFolder *folder, const Byte *propsData, const UInt64 *unpackSizes, const UInt64 *packPositions, - ILookInStream *inStream, UInt64 startPos, + ILookInStreamPtr inStream, UInt64 startPos, Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain, Byte *tempBuf[]) { @@ -380,7 +422,7 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, SizeT tempSize3 = 0; Byte *tempBuf3 = 0; - RINOK(CheckSupportedFolder(folder)); + RINOK(CheckSupportedFolder(folder)) for (ci = 0; ci < folder->NumCoders; ci++) { @@ -395,8 +437,8 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, SizeT outSizeCur = outSize; if (folder->NumCoders == 4) { - UInt32 indices[] = { 3, 2, 0 }; - UInt64 unpackSize = unpackSizes[ci]; + const UInt32 indices[] = { 3, 2, 0 }; + const UInt64 unpackSize = unpackSizes[ci]; si = indices[ci]; if (ci < 2) { @@ -422,37 +464,37 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, } offset = packPositions[si]; inSize = packPositions[(size_t)si + 1] - offset; - RINOK(LookInStream_SeekTo(inStream, startPos + offset)); + RINOK(LookInStream_SeekTo(inStream, startPos + offset)) if (coder->MethodID == k_Copy) { if (inSize != outSizeCur) /* check it */ return SZ_ERROR_DATA; - RINOK(SzDecodeCopy(inSize, inStream, outBufCur)); + RINOK(SzDecodeCopy(inSize, inStream, outBufCur)) } else if (coder->MethodID == k_LZMA) { - RINOK(SzDecodeLzma(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)); + RINOK(SzDecodeLzma(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)) } - #ifndef _7Z_NO_METHOD_LZMA2 + #ifndef Z7_NO_METHOD_LZMA2 else if (coder->MethodID == k_LZMA2) { - RINOK(SzDecodeLzma2(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)); + RINOK(SzDecodeLzma2(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)) } - #endif - #ifdef _7ZIP_PPMD_SUPPPORT + #endif + #ifdef Z7_PPMD_SUPPORT else if (coder->MethodID == k_PPMD) { - RINOK(SzDecodePpmd(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)); + RINOK(SzDecodePpmd(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)) } - #endif + #endif else return SZ_ERROR_UNSUPPORTED; } else if (coder->MethodID == k_BCJ2) { - UInt64 offset = packPositions[1]; - UInt64 s3Size = packPositions[2] - offset; + const UInt64 offset = packPositions[1]; + const UInt64 s3Size = packPositions[2] - offset; if (ci != 3) return SZ_ERROR_UNSUPPORTED; @@ -464,8 +506,8 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, if (!tempBuf[2] && tempSizes[2] != 0) return SZ_ERROR_MEM; - RINOK(LookInStream_SeekTo(inStream, startPos + offset)); - RINOK(SzDecodeCopy(s3Size, inStream, tempBuf[2])); + RINOK(LookInStream_SeekTo(inStream, startPos + offset)) + RINOK(SzDecodeCopy(s3Size, inStream, tempBuf[2])) if ((tempSizes[0] & 3) != 0 || (tempSizes[1] & 3) != 0 || @@ -484,26 +526,22 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, p.destLim = outBuffer + outSize; Bcj2Dec_Init(&p); - RINOK(Bcj2Dec_Decode(&p)); + RINOK(Bcj2Dec_Decode(&p)) { unsigned i; for (i = 0; i < 4; i++) if (p.bufs[i] != p.lims[i]) return SZ_ERROR_DATA; - - if (!Bcj2Dec_IsFinished(&p)) - return SZ_ERROR_DATA; - - if (p.dest != p.destLim - || p.state != BCJ2_STREAM_MAIN) + if (p.dest != p.destLim || 
!Bcj2Dec_IsMaybeFinished(&p)) return SZ_ERROR_DATA; } } } - #ifndef _7Z_NO_METHODS_FILTERS +#if defined(Z7_USE_BRANCH_FILTER) else if (ci == 1) { +#if !defined(Z7_NO_METHODS_FILTERS) if (coder->MethodID == k_Delta) { if (coder->PropsSize != 1) @@ -513,31 +551,75 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, Delta_Init(state); Delta_Decode(state, (unsigned)(propsData[coder->PropsOffset]) + 1, outBuffer, outSize); } + continue; } - else +#endif + +#ifdef Z7_USE_FILTER_ARM64 + if (coder->MethodID == k_ARM64) + { + UInt32 pc = 0; + if (coder->PropsSize == 4) + { + pc = GetUi32(propsData + coder->PropsOffset); + if (pc & 3) + return SZ_ERROR_UNSUPPORTED; + } + else if (coder->PropsSize != 0) + return SZ_ERROR_UNSUPPORTED; + z7_BranchConv_ARM64_Dec(outBuffer, outSize, pc); + continue; + } +#endif + +#if !defined(Z7_NO_METHODS_FILTERS) + if (coder->MethodID == k_RISCV) + { + UInt32 pc = 0; + if (coder->PropsSize == 4) + { + pc = GetUi32(propsData + coder->PropsOffset); + if (pc & 1) + return SZ_ERROR_UNSUPPORTED; + } + else if (coder->PropsSize != 0) + return SZ_ERROR_UNSUPPORTED; + z7_BranchConv_RISCV_Dec(outBuffer, outSize, pc); + continue; + } +#endif + +#if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT) { if (coder->PropsSize != 0) return SZ_ERROR_UNSUPPORTED; + #define CASE_BRA_CONV(isa) case k_ ## isa: Z7_BRANCH_CONV_DEC(isa)(outBuffer, outSize, 0); break; // pc = 0; switch (coder->MethodID) { + #if !defined(Z7_NO_METHODS_FILTERS) case k_BCJ: { - UInt32 state; - x86_Convert_Init(state); - x86_Convert(outBuffer, outSize, 0, &state, 0); + UInt32 state = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL; + z7_BranchConvSt_X86_Dec(outBuffer, outSize, 0, &state); // pc = 0 break; } - CASE_BRA_CONV(PPC) + case k_PPC: Z7_BRANCH_CONV_DEC_2(BranchConv_PPC)(outBuffer, outSize, 0); break; // pc = 0; + // CASE_BRA_CONV(PPC) CASE_BRA_CONV(IA64) CASE_BRA_CONV(SPARC) CASE_BRA_CONV(ARM) + #endif + #if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT) CASE_BRA_CONV(ARMT) + #endif default: return SZ_ERROR_UNSUPPORTED; } + continue; } - } - #endif +#endif + } // (c == 1) +#endif // Z7_USE_BRANCH_FILTER else return SZ_ERROR_UNSUPPORTED; } @@ -547,7 +629,7 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, SRes SzAr_DecodeFolder(const CSzAr *p, UInt32 folderIndex, - ILookInStream *inStream, UInt64 startPos, + ILookInStreamPtr inStream, UInt64 startPos, Byte *outBuffer, size_t outSize, ISzAllocPtr allocMain) { diff --git a/src/sdk/C/7zFile.c b/src/sdk/C/7zFile.c index 8992fb1..ba5daa1 100644 --- a/src/sdk/C/7zFile.c +++ b/src/sdk/C/7zFile.c @@ -1,5 +1,5 @@ /* 7zFile.c -- File IO -2017-04-03 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -7,9 +7,19 @@ #ifndef USE_WINDOWS_FILE -#ifndef UNDER_CE -#include -#endif + #include + + #ifndef USE_FOPEN + #include + #include + #ifdef _WIN32 + #include + typedef int ssize_t; + typedef int off_t; + #else + #include + #endif + #endif #else @@ -23,30 +33,36 @@ And message can be "Network connection was lost" */ -#define kChunkSizeMax (1 << 22) - #endif +#define kChunkSizeMax (1 << 22) + void File_Construct(CSzFile *p) { #ifdef USE_WINDOWS_FILE p->handle = INVALID_HANDLE_VALUE; - #else + #elif defined(USE_FOPEN) p->file = NULL; + #else + p->fd = -1; #endif } #if !defined(UNDER_CE) || !defined(USE_WINDOWS_FILE) + static WRes File_Open(CSzFile *p, const char *name, int writeMode) { #ifdef USE_WINDOWS_FILE + p->handle = CreateFileA(name, writeMode ? 
GENERIC_WRITE : GENERIC_READ, FILE_SHARE_READ, NULL, writeMode ? CREATE_ALWAYS : OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); return (p->handle != INVALID_HANDLE_VALUE) ? 0 : GetLastError(); - #else + + #elif defined(USE_FOPEN) + p->file = fopen(name, writeMode ? "wb+" : "rb"); return (p->file != 0) ? 0 : #ifdef UNDER_CE @@ -54,13 +70,34 @@ static WRes File_Open(CSzFile *p, const char *name, int writeMode) #else errno; #endif + + #else + + int flags = (writeMode ? (O_CREAT | O_EXCL | O_WRONLY) : O_RDONLY); + #ifdef O_BINARY + flags |= O_BINARY; + #endif + p->fd = open(name, flags, 0666); + return (p->fd != -1) ? 0 : errno; + #endif } WRes InFile_Open(CSzFile *p, const char *name) { return File_Open(p, name, 0); } -WRes OutFile_Open(CSzFile *p, const char *name) { return File_Open(p, name, 1); } + +WRes OutFile_Open(CSzFile *p, const char *name) +{ + #if defined(USE_WINDOWS_FILE) || defined(USE_FOPEN) + return File_Open(p, name, 1); + #else + p->fd = creat(name, 0666); + return (p->fd != -1) ? 0 : errno; + #endif +} + #endif + #ifdef USE_WINDOWS_FILE static WRes File_OpenW(CSzFile *p, const WCHAR *name, int writeMode) { @@ -78,74 +115,124 @@ WRes OutFile_OpenW(CSzFile *p, const WCHAR *name) { return File_OpenW(p, name, 1 WRes File_Close(CSzFile *p) { #ifdef USE_WINDOWS_FILE + if (p->handle != INVALID_HANDLE_VALUE) { if (!CloseHandle(p->handle)) return GetLastError(); p->handle = INVALID_HANDLE_VALUE; } - #else + + #elif defined(USE_FOPEN) + if (p->file != NULL) { int res = fclose(p->file); if (res != 0) + { + if (res == EOF) + return errno; return res; + } p->file = NULL; } + + #else + + if (p->fd != -1) + { + if (close(p->fd) != 0) + return errno; + p->fd = -1; + } + #endif + return 0; } + WRes File_Read(CSzFile *p, void *data, size_t *size) { size_t originalSize = *size; + *size = 0; if (originalSize == 0) return 0; #ifdef USE_WINDOWS_FILE - *size = 0; do { - DWORD curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : (DWORD)originalSize; + const DWORD curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : (DWORD)originalSize; DWORD processed = 0; - BOOL res = ReadFile(p->handle, data, curSize, &processed, NULL); + const BOOL res = ReadFile(p->handle, data, curSize, &processed, NULL); data = (void *)((Byte *)data + processed); originalSize -= processed; *size += processed; if (!res) return GetLastError(); + // debug : we can break here for partial reading mode + if (processed == 0) + break; + } + while (originalSize > 0); + + #elif defined(USE_FOPEN) + + do + { + const size_t curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : originalSize; + const size_t processed = fread(data, 1, curSize, p->file); + data = (void *)((Byte *)data + (size_t)processed); + originalSize -= processed; + *size += processed; + if (processed != curSize) + return ferror(p->file); + // debug : we can break here for partial reading mode if (processed == 0) break; } while (originalSize > 0); - return 0; #else - - *size = fread(data, 1, originalSize, p->file); - if (*size == originalSize) - return 0; - return ferror(p->file); - + + do + { + const size_t curSize = (originalSize > kChunkSizeMax) ? 
kChunkSizeMax : originalSize; + const ssize_t processed = read(p->fd, data, curSize); + if (processed == -1) + return errno; + if (processed == 0) + break; + data = (void *)((Byte *)data + (size_t)processed); + originalSize -= (size_t)processed; + *size += (size_t)processed; + // debug : we can break here for partial reading mode + // break; + } + while (originalSize > 0); + #endif + + return 0; } + WRes File_Write(CSzFile *p, const void *data, size_t *size) { size_t originalSize = *size; + *size = 0; if (originalSize == 0) return 0; #ifdef USE_WINDOWS_FILE - *size = 0; do { - DWORD curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : (DWORD)originalSize; + const DWORD curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : (DWORD)originalSize; DWORD processed = 0; - BOOL res = WriteFile(p->handle, data, curSize, &processed, NULL); - data = (void *)((Byte *)data + processed); + const BOOL res = WriteFile(p->handle, data, curSize, &processed, NULL); + data = (const void *)((const Byte *)data + processed); originalSize -= processed; *size += processed; if (!res) @@ -154,61 +241,106 @@ WRes File_Write(CSzFile *p, const void *data, size_t *size) break; } while (originalSize > 0); - return 0; + + #elif defined(USE_FOPEN) + + do + { + const size_t curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : originalSize; + const size_t processed = fwrite(data, 1, curSize, p->file); + data = (void *)((Byte *)data + (size_t)processed); + originalSize -= processed; + *size += processed; + if (processed != curSize) + return ferror(p->file); + if (processed == 0) + break; + } + while (originalSize > 0); #else - *size = fwrite(data, 1, originalSize, p->file); - if (*size == originalSize) - return 0; - return ferror(p->file); - + do + { + const size_t curSize = (originalSize > kChunkSizeMax) ? 
kChunkSizeMax : originalSize; + const ssize_t processed = write(p->fd, data, curSize); + if (processed == -1) + return errno; + if (processed == 0) + break; + data = (const void *)((const Byte *)data + (size_t)processed); + originalSize -= (size_t)processed; + *size += (size_t)processed; + } + while (originalSize > 0); + #endif + + return 0; } + WRes File_Seek(CSzFile *p, Int64 *pos, ESzSeek origin) { #ifdef USE_WINDOWS_FILE - LARGE_INTEGER value; DWORD moveMethod; - value.LowPart = (DWORD)*pos; - value.HighPart = (LONG)((UInt64)*pos >> 16 >> 16); /* for case when UInt64 is 32-bit only */ - switch (origin) + UInt32 low = (UInt32)*pos; + LONG high = (LONG)((UInt64)*pos >> 16 >> 16); /* for case when UInt64 is 32-bit only */ + // (int) to eliminate clang warning + switch ((int)origin) { case SZ_SEEK_SET: moveMethod = FILE_BEGIN; break; case SZ_SEEK_CUR: moveMethod = FILE_CURRENT; break; case SZ_SEEK_END: moveMethod = FILE_END; break; default: return ERROR_INVALID_PARAMETER; } - value.LowPart = SetFilePointer(p->handle, value.LowPart, &value.HighPart, moveMethod); - if (value.LowPart == 0xFFFFFFFF) + low = SetFilePointer(p->handle, (LONG)low, &high, moveMethod); + if (low == (UInt32)0xFFFFFFFF) { WRes res = GetLastError(); if (res != NO_ERROR) return res; } - *pos = ((Int64)value.HighPart << 32) | value.LowPart; + *pos = ((Int64)high << 32) | low; return 0; #else - int moveMethod; - int res; - switch (origin) + int moveMethod; // = origin; + + switch ((int)origin) { case SZ_SEEK_SET: moveMethod = SEEK_SET; break; case SZ_SEEK_CUR: moveMethod = SEEK_CUR; break; case SZ_SEEK_END: moveMethod = SEEK_END; break; - default: return 1; + default: return EINVAL; } - res = fseek(p->file, (long)*pos, moveMethod); - *pos = ftell(p->file); - return res; - #endif + #if defined(USE_FOPEN) + { + int res = fseek(p->file, (long)*pos, moveMethod); + if (res == -1) + return errno; + *pos = ftell(p->file); + if (*pos == -1) + return errno; + return 0; + } + #else + { + off_t res = lseek(p->fd, (off_t)*pos, moveMethod); + if (res == -1) + return errno; + *pos = res; + return 0; + } + + #endif // USE_FOPEN + #endif // USE_WINDOWS_FILE } + WRes File_GetLength(CSzFile *p, UInt64 *length) { #ifdef USE_WINDOWS_FILE @@ -224,13 +356,31 @@ WRes File_GetLength(CSzFile *p, UInt64 *length) *length = (((UInt64)sizeHigh) << 32) + sizeLow; return 0; - #else + #elif defined(USE_FOPEN) long pos = ftell(p->file); int res = fseek(p->file, 0, SEEK_END); *length = ftell(p->file); fseek(p->file, pos, SEEK_SET); return res; + + #else + + off_t pos; + *length = 0; + pos = lseek(p->fd, 0, SEEK_CUR); + if (pos != -1) + { + const off_t len2 = lseek(p->fd, 0, SEEK_END); + const off_t res2 = lseek(p->fd, pos, SEEK_SET); + if (len2 != -1) + { + *length = (UInt64)len2; + if (res2 != -1) + return 0; + } + } + return errno; #endif } @@ -238,10 +388,12 @@ WRes File_GetLength(CSzFile *p, UInt64 *length) /* ---------- FileSeqInStream ---------- */ -static SRes FileSeqInStream_Read(const ISeqInStream *pp, void *buf, size_t *size) +static SRes FileSeqInStream_Read(ISeqInStreamPtr pp, void *buf, size_t *size) { - CFileSeqInStream *p = CONTAINER_FROM_VTBL(pp, CFileSeqInStream, vt); - return File_Read(&p->file, buf, size) == 0 ? SZ_OK : SZ_ERROR_READ; + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileSeqInStream) + const WRes wres = File_Read(&p->file, buf, size); + p->wres = wres; + return (wres == 0) ? 
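Illustrative usage sketch (not part of the upstream diff): the reworked 7zFile.c above keeps the same CSzFile call sequence whether the build uses Win32 handles, fopen(), or the new raw-file-descriptor path; only the WRes values returned on failure differ (GetLastError() codes versus errno). A minimal reader, with a hypothetical helper name and assuming the SDK headers are on the include path:

#include <stdio.h>
#include "7zFile.h"

static int dump_length(const char *path)
{
  CSzFile f;
  UInt64 length;
  Byte buf[256];
  size_t processed = sizeof(buf);
  WRes wres;

  File_Construct(&f);
  wres = InFile_Open(&f, path);
  if (wres != 0)
    return (int)wres;                  /* errno / GetLastError() value */

  if (File_GetLength(&f, &length) == 0)
    printf("%s: %llu bytes\n", path, (unsigned long long)length);

  /* File_Read() sets *size to the number of bytes actually read */
  if (File_Read(&f, buf, &processed) == 0)
    printf("first %u bytes read OK\n", (unsigned)processed);

  return (int)File_Close(&f);
}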
SZ_OK : SZ_ERROR_READ; } void FileSeqInStream_CreateVTable(CFileSeqInStream *p) @@ -252,16 +404,20 @@ void FileSeqInStream_CreateVTable(CFileSeqInStream *p) /* ---------- FileInStream ---------- */ -static SRes FileInStream_Read(const ISeekInStream *pp, void *buf, size_t *size) +static SRes FileInStream_Read(ISeekInStreamPtr pp, void *buf, size_t *size) { - CFileInStream *p = CONTAINER_FROM_VTBL(pp, CFileInStream, vt); - return (File_Read(&p->file, buf, size) == 0) ? SZ_OK : SZ_ERROR_READ; + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileInStream) + const WRes wres = File_Read(&p->file, buf, size); + p->wres = wres; + return (wres == 0) ? SZ_OK : SZ_ERROR_READ; } -static SRes FileInStream_Seek(const ISeekInStream *pp, Int64 *pos, ESzSeek origin) +static SRes FileInStream_Seek(ISeekInStreamPtr pp, Int64 *pos, ESzSeek origin) { - CFileInStream *p = CONTAINER_FROM_VTBL(pp, CFileInStream, vt); - return File_Seek(&p->file, pos, origin); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileInStream) + const WRes wres = File_Seek(&p->file, pos, origin); + p->wres = wres; + return (wres == 0) ? SZ_OK : SZ_ERROR_READ; } void FileInStream_CreateVTable(CFileInStream *p) @@ -273,10 +429,11 @@ void FileInStream_CreateVTable(CFileInStream *p) /* ---------- FileOutStream ---------- */ -static size_t FileOutStream_Write(const ISeqOutStream *pp, const void *data, size_t size) +static size_t FileOutStream_Write(ISeqOutStreamPtr pp, const void *data, size_t size) { - CFileOutStream *p = CONTAINER_FROM_VTBL(pp, CFileOutStream, vt); - File_Write(&p->file, data, &size); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileOutStream) + const WRes wres = File_Write(&p->file, data, &size); + p->wres = wres; return size; } diff --git a/src/sdk/C/7zFile.h b/src/sdk/C/7zFile.h index 0e79253..f5069cd 100644 --- a/src/sdk/C/7zFile.h +++ b/src/sdk/C/7zFile.h @@ -1,17 +1,21 @@ /* 7zFile.h -- File IO -2017-04-03 : Igor Pavlov : Public domain */ +2023-03-05 : Igor Pavlov : Public domain */ -#ifndef __7Z_FILE_H -#define __7Z_FILE_H +#ifndef ZIP7_INC_FILE_H +#define ZIP7_INC_FILE_H #ifdef _WIN32 #define USE_WINDOWS_FILE +// #include #endif #ifdef USE_WINDOWS_FILE -#include +#include "7zWindows.h" + #else -#include +// note: USE_FOPEN mode is limited to 32-bit file size +// #define USE_FOPEN +// #include #endif #include "7zTypes.h" @@ -24,8 +28,10 @@ typedef struct { #ifdef USE_WINDOWS_FILE HANDLE handle; - #else + #elif defined(USE_FOPEN) FILE *file; + #else + int fd; #endif } CSzFile; @@ -56,6 +62,7 @@ typedef struct { ISeqInStream vt; CSzFile file; + WRes wres; } CFileSeqInStream; void FileSeqInStream_CreateVTable(CFileSeqInStream *p); @@ -65,6 +72,7 @@ typedef struct { ISeekInStream vt; CSzFile file; + WRes wres; } CFileInStream; void FileInStream_CreateVTable(CFileInStream *p); @@ -74,6 +82,7 @@ typedef struct { ISeqOutStream vt; CSzFile file; + WRes wres; } CFileOutStream; void FileOutStream_CreateVTable(CFileOutStream *p); diff --git a/src/sdk/C/7zStream.c b/src/sdk/C/7zStream.c index 6b5aa16..74e75b6 100644 --- a/src/sdk/C/7zStream.c +++ b/src/sdk/C/7zStream.c @@ -1,5 +1,5 @@ /* 7zStream.c -- 7z Stream functions -2017-04-03 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -7,12 +7,33 @@ #include "7zTypes.h" -SRes SeqInStream_Read2(const ISeqInStream *stream, void *buf, size_t size, SRes errorType) + +SRes SeqInStream_ReadMax(ISeqInStreamPtr stream, void *buf, size_t *processedSize) +{ + size_t size = *processedSize; + *processedSize = 0; + while (size != 0) + { + size_t 
cur = size; + const SRes res = ISeqInStream_Read(stream, buf, &cur); + *processedSize += cur; + buf = (void *)((Byte *)buf + cur); + size -= cur; + if (res != SZ_OK) + return res; + if (cur == 0) + return SZ_OK; + } + return SZ_OK; +} + +/* +SRes SeqInStream_Read2(ISeqInStreamPtr stream, void *buf, size_t size, SRes errorType) { while (size != 0) { size_t processed = size; - RINOK(ISeqInStream_Read(stream, buf, &processed)); + RINOK(ISeqInStream_Read(stream, buf, &processed)) if (processed == 0) return errorType; buf = (void *)((Byte *)buf + processed); @@ -21,42 +42,44 @@ SRes SeqInStream_Read2(const ISeqInStream *stream, void *buf, size_t size, SRes return SZ_OK; } -SRes SeqInStream_Read(const ISeqInStream *stream, void *buf, size_t size) +SRes SeqInStream_Read(ISeqInStreamPtr stream, void *buf, size_t size) { return SeqInStream_Read2(stream, buf, size, SZ_ERROR_INPUT_EOF); } +*/ + -SRes SeqInStream_ReadByte(const ISeqInStream *stream, Byte *buf) +SRes SeqInStream_ReadByte(ISeqInStreamPtr stream, Byte *buf) { size_t processed = 1; - RINOK(ISeqInStream_Read(stream, buf, &processed)); + RINOK(ISeqInStream_Read(stream, buf, &processed)) return (processed == 1) ? SZ_OK : SZ_ERROR_INPUT_EOF; } -SRes LookInStream_SeekTo(const ILookInStream *stream, UInt64 offset) +SRes LookInStream_SeekTo(ILookInStreamPtr stream, UInt64 offset) { - Int64 t = offset; + Int64 t = (Int64)offset; return ILookInStream_Seek(stream, &t, SZ_SEEK_SET); } -SRes LookInStream_LookRead(const ILookInStream *stream, void *buf, size_t *size) +SRes LookInStream_LookRead(ILookInStreamPtr stream, void *buf, size_t *size) { const void *lookBuf; if (*size == 0) return SZ_OK; - RINOK(ILookInStream_Look(stream, &lookBuf, size)); + RINOK(ILookInStream_Look(stream, &lookBuf, size)) memcpy(buf, lookBuf, *size); return ILookInStream_Skip(stream, *size); } -SRes LookInStream_Read2(const ILookInStream *stream, void *buf, size_t size, SRes errorType) +SRes LookInStream_Read2(ILookInStreamPtr stream, void *buf, size_t size, SRes errorType) { while (size != 0) { size_t processed = size; - RINOK(ILookInStream_Read(stream, buf, &processed)); + RINOK(ILookInStream_Read(stream, buf, &processed)) if (processed == 0) return errorType; buf = (void *)((Byte *)buf + processed); @@ -65,16 +88,16 @@ SRes LookInStream_Read2(const ILookInStream *stream, void *buf, size_t size, SRe return SZ_OK; } -SRes LookInStream_Read(const ILookInStream *stream, void *buf, size_t size) +SRes LookInStream_Read(ILookInStreamPtr stream, void *buf, size_t size) { return LookInStream_Read2(stream, buf, size, SZ_ERROR_INPUT_EOF); } -#define GET_LookToRead2 CLookToRead2 *p = CONTAINER_FROM_VTBL(pp, CLookToRead2, vt); +#define GET_LookToRead2 Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CLookToRead2) -static SRes LookToRead2_Look_Lookahead(const ILookInStream *pp, const void **buf, size_t *size) +static SRes LookToRead2_Look_Lookahead(ILookInStreamPtr pp, const void **buf, size_t *size) { SRes res = SZ_OK; GET_LookToRead2 @@ -93,7 +116,7 @@ static SRes LookToRead2_Look_Lookahead(const ILookInStream *pp, const void **buf return res; } -static SRes LookToRead2_Look_Exact(const ILookInStream *pp, const void **buf, size_t *size) +static SRes LookToRead2_Look_Exact(ILookInStreamPtr pp, const void **buf, size_t *size) { SRes res = SZ_OK; GET_LookToRead2 @@ -113,14 +136,14 @@ static SRes LookToRead2_Look_Exact(const ILookInStream *pp, const void **buf, si return res; } -static SRes LookToRead2_Skip(const ILookInStream *pp, size_t offset) +static SRes LookToRead2_Skip(ILookInStreamPtr 
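Illustrative sketch (not part of the upstream diff): SeqInStream_ReadMax() above replaces the removed SeqInStream_Read()/SeqInStream_Read2() helpers. On input *processedSize is the buffer capacity; on return it holds however many bytes the stream produced, and a short count with SZ_OK simply means end of stream rather than an error. A hypothetical caller built on the CFileSeqInStream wrapper from 7zFile.h:

#include <stdio.h>
#include "7zFile.h"
#include "7zTypes.h"

static SRes read_up_to_4k(const char *path)
{
  CFileSeqInStream inStream;
  Byte buf[4096];
  size_t processed = sizeof(buf);   /* in: capacity, out: bytes actually read */
  SRes res;

  FileSeqInStream_CreateVTable(&inStream);
  File_Construct(&inStream.file);
  if (InFile_Open(&inStream.file, path) != 0)
    return SZ_ERROR_READ;

  res = SeqInStream_ReadMax(&inStream.vt, buf, &processed);
  printf("res=%d, got %u bytes\n", res, (unsigned)processed);

  File_Close(&inStream.file);
  return res;
}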
pp, size_t offset) { GET_LookToRead2 p->pos += offset; return SZ_OK; } -static SRes LookToRead2_Read(const ILookInStream *pp, void *buf, size_t *size) +static SRes LookToRead2_Read(ILookInStreamPtr pp, void *buf, size_t *size) { GET_LookToRead2 size_t rem = p->size - p->pos; @@ -134,7 +157,7 @@ static SRes LookToRead2_Read(const ILookInStream *pp, void *buf, size_t *size) return SZ_OK; } -static SRes LookToRead2_Seek(const ILookInStream *pp, Int64 *pos, ESzSeek origin) +static SRes LookToRead2_Seek(ILookInStreamPtr pp, Int64 *pos, ESzSeek origin) { GET_LookToRead2 p->pos = p->size = 0; @@ -153,9 +176,9 @@ void LookToRead2_CreateVTable(CLookToRead2 *p, int lookahead) -static SRes SecToLook_Read(const ISeqInStream *pp, void *buf, size_t *size) +static SRes SecToLook_Read(ISeqInStreamPtr pp, void *buf, size_t *size) { - CSecToLook *p = CONTAINER_FROM_VTBL(pp, CSecToLook, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CSecToLook) return LookInStream_LookRead(p->realStream, buf, size); } @@ -164,9 +187,9 @@ void SecToLook_CreateVTable(CSecToLook *p) p->vt.Read = SecToLook_Read; } -static SRes SecToRead_Read(const ISeqInStream *pp, void *buf, size_t *size) +static SRes SecToRead_Read(ISeqInStreamPtr pp, void *buf, size_t *size) { - CSecToRead *p = CONTAINER_FROM_VTBL(pp, CSecToRead, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CSecToRead) return ILookInStream_Read(p->realStream, buf, size); } diff --git a/src/sdk/C/7zTypes.h b/src/sdk/C/7zTypes.h index 65b3af6..5b77420 100644 --- a/src/sdk/C/7zTypes.h +++ b/src/sdk/C/7zTypes.h @@ -1,11 +1,13 @@ /* 7zTypes.h -- Basic types -2018-08-04 : Igor Pavlov : Public domain */ +2024-01-24 : Igor Pavlov : Public domain */ -#ifndef __7Z_TYPES_H -#define __7Z_TYPES_H +#ifndef ZIP7_7Z_TYPES_H +#define ZIP7_7Z_TYPES_H #ifdef _WIN32 /* #include */ +#else +#include #endif #include @@ -43,31 +45,134 @@ EXTERN_C_BEGIN typedef int SRes; +#ifdef _MSC_VER + #if _MSC_VER > 1200 + #define MY_ALIGN(n) __declspec(align(n)) + #else + #define MY_ALIGN(n) + #endif +#else + /* + // C11/C++11: + #include + #define MY_ALIGN(n) alignas(n) + */ + #define MY_ALIGN(n) __attribute__ ((aligned(n))) +#endif + + #ifdef _WIN32 /* typedef DWORD WRes; */ typedef unsigned WRes; #define MY_SRes_HRESULT_FROM_WRes(x) HRESULT_FROM_WIN32(x) -#else +// #define MY_HRES_ERROR_INTERNAL_ERROR MY_SRes_HRESULT_FROM_WRes(ERROR_INTERNAL_ERROR) +#else // _WIN32 + +// #define ENV_HAVE_LSTAT typedef int WRes; -#define MY__FACILITY_WIN32 7 -#define MY__FACILITY__WRes MY__FACILITY_WIN32 -#define MY_SRes_HRESULT_FROM_WRes(x) ((HRESULT)(x) <= 0 ? ((HRESULT)(x)) : ((HRESULT) (((x) & 0x0000FFFF) | (MY__FACILITY__WRes << 16) | 0x80000000))) + +// (FACILITY_ERRNO = 0x800) is 7zip's FACILITY constant to represent (errno) errors in HRESULT +#define MY_FACILITY_ERRNO 0x800 +#define MY_FACILITY_WIN32 7 +#define MY_FACILITY_WRes MY_FACILITY_ERRNO + +#define MY_HRESULT_FROM_errno_CONST_ERROR(x) ((HRESULT)( \ + ( (HRESULT)(x) & 0x0000FFFF) \ + | (MY_FACILITY_WRes << 16) \ + | (HRESULT)0x80000000 )) + +#define MY_SRes_HRESULT_FROM_WRes(x) \ + ((HRESULT)(x) <= 0 ? 
((HRESULT)(x)) : MY_HRESULT_FROM_errno_CONST_ERROR(x)) + +// we call macro HRESULT_FROM_WIN32 for system errors (WRes) that are (errno) +#define HRESULT_FROM_WIN32(x) MY_SRes_HRESULT_FROM_WRes(x) + +/* +#define ERROR_FILE_NOT_FOUND 2L +#define ERROR_ACCESS_DENIED 5L +#define ERROR_NO_MORE_FILES 18L +#define ERROR_LOCK_VIOLATION 33L +#define ERROR_FILE_EXISTS 80L +#define ERROR_DISK_FULL 112L +#define ERROR_NEGATIVE_SEEK 131L +#define ERROR_ALREADY_EXISTS 183L +#define ERROR_DIRECTORY 267L +#define ERROR_TOO_MANY_POSTS 298L + +#define ERROR_INTERNAL_ERROR 1359L +#define ERROR_INVALID_REPARSE_DATA 4392L +#define ERROR_REPARSE_TAG_INVALID 4393L +#define ERROR_REPARSE_TAG_MISMATCH 4394L +*/ + +// we use errno equivalents for some WIN32 errors: + +#define ERROR_INVALID_PARAMETER EINVAL +#define ERROR_INVALID_FUNCTION EINVAL +#define ERROR_ALREADY_EXISTS EEXIST +#define ERROR_FILE_EXISTS EEXIST +#define ERROR_PATH_NOT_FOUND ENOENT +#define ERROR_FILE_NOT_FOUND ENOENT +#define ERROR_DISK_FULL ENOSPC +// #define ERROR_INVALID_HANDLE EBADF + +// we use FACILITY_WIN32 for errors that has no errno equivalent +// Too many posts were made to a semaphore. +#define ERROR_TOO_MANY_POSTS ((HRESULT)0x8007012AL) +#define ERROR_INVALID_REPARSE_DATA ((HRESULT)0x80071128L) +#define ERROR_REPARSE_TAG_INVALID ((HRESULT)0x80071129L) + +// if (MY_FACILITY_WRes != FACILITY_WIN32), +// we use FACILITY_WIN32 for COM errors: +#define E_OUTOFMEMORY ((HRESULT)0x8007000EL) +#define E_INVALIDARG ((HRESULT)0x80070057L) +#define MY_E_ERROR_NEGATIVE_SEEK ((HRESULT)0x80070083L) + +/* +// we can use FACILITY_ERRNO for some COM errors, that have errno equivalents: +#define E_OUTOFMEMORY MY_HRESULT_FROM_errno_CONST_ERROR(ENOMEM) +#define E_INVALIDARG MY_HRESULT_FROM_errno_CONST_ERROR(EINVAL) +#define MY_E_ERROR_NEGATIVE_SEEK MY_HRESULT_FROM_errno_CONST_ERROR(EINVAL) +*/ + +#define TEXT(quote) quote + +#define FILE_ATTRIBUTE_READONLY 0x0001 +#define FILE_ATTRIBUTE_HIDDEN 0x0002 +#define FILE_ATTRIBUTE_SYSTEM 0x0004 +#define FILE_ATTRIBUTE_DIRECTORY 0x0010 +#define FILE_ATTRIBUTE_ARCHIVE 0x0020 +#define FILE_ATTRIBUTE_DEVICE 0x0040 +#define FILE_ATTRIBUTE_NORMAL 0x0080 +#define FILE_ATTRIBUTE_TEMPORARY 0x0100 +#define FILE_ATTRIBUTE_SPARSE_FILE 0x0200 +#define FILE_ATTRIBUTE_REPARSE_POINT 0x0400 +#define FILE_ATTRIBUTE_COMPRESSED 0x0800 +#define FILE_ATTRIBUTE_OFFLINE 0x1000 +#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x2000 +#define FILE_ATTRIBUTE_ENCRYPTED 0x4000 + +#define FILE_ATTRIBUTE_UNIX_EXTENSION 0x8000 /* trick for Unix */ #endif #ifndef RINOK -#define RINOK(x) { int __result__ = (x); if (__result__ != 0) return __result__; } +#define RINOK(x) { const int _result_ = (x); if (_result_ != 0) return _result_; } +#endif + +#ifndef RINOK_WRes +#define RINOK_WRes(x) { const WRes _result_ = (x); if (_result_ != 0) return _result_; } #endif typedef unsigned char Byte; typedef short Int16; typedef unsigned short UInt16; -#ifdef _LZMA_UINT32_IS_ULONG +#ifdef Z7_DECL_Int32_AS_long typedef long Int32; typedef unsigned long UInt32; #else @@ -75,34 +180,82 @@ typedef int Int32; typedef unsigned int UInt32; #endif -#ifdef _SZ_NO_INT_64 -/* define _SZ_NO_INT_64, if your compiler doesn't support 64-bit integers. - NOTES: Some code will work incorrectly in that case! 
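Worked example (editorial, not part of the upstream diff): with MY_FACILITY_WRes now equal to MY_FACILITY_ERRNO (0x800), a positive errno-style WRes such as ENOENT (value 2 on Linux) maps to (2 & 0xFFFF) | (0x800 << 16) | 0x80000000 = 0x88000002, while zero and negative values pass through unchanged. A small self-check for a non-Windows build:

#include <assert.h>
#include <errno.h>
#include "7zTypes.h"

int main(void)
{
  /* non-Windows branch: MY_SRes_HRESULT_FROM_WRes() packs errno into facility 0x800 */
  assert(MY_SRes_HRESULT_FROM_WRes(0) == 0);
  assert((UInt32)MY_SRes_HRESULT_FROM_WRes(ENOENT) == (0x88000000u | (UInt32)ENOENT));
  return 0;
}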
*/ +#ifndef _WIN32 + +typedef int INT; +typedef Int32 INT32; +typedef unsigned int UINT; +typedef UInt32 UINT32; +typedef INT32 LONG; // LONG, ULONG and DWORD must be 32-bit for _WIN32 compatibility +typedef UINT32 ULONG; + +#undef DWORD +typedef UINT32 DWORD; + +#define VOID void + +#define HRESULT LONG + +typedef void *LPVOID; +// typedef void VOID; +// typedef ULONG_PTR DWORD_PTR, *PDWORD_PTR; +// gcc / clang on Unix : sizeof(long==sizeof(void*) in 32 or 64 bits) +typedef long INT_PTR; +typedef unsigned long UINT_PTR; +typedef long LONG_PTR; +typedef unsigned long DWORD_PTR; + +typedef size_t SIZE_T; + +#endif // _WIN32 + + +#define MY_HRES_ERROR_INTERNAL_ERROR ((HRESULT)0x8007054FL) + + +#ifdef Z7_DECL_Int64_AS_long typedef long Int64; typedef unsigned long UInt64; #else -#if defined(_MSC_VER) || defined(__BORLANDC__) +#if (defined(_MSC_VER) || defined(__BORLANDC__)) && !defined(__clang__) typedef __int64 Int64; typedef unsigned __int64 UInt64; -#define UINT64_CONST(n) n +#else +#if defined(__clang__) || defined(__GNUC__) +#include +typedef int64_t Int64; +typedef uint64_t UInt64; #else typedef long long int Int64; typedef unsigned long long int UInt64; -#define UINT64_CONST(n) n ## ULL +// #define UINT64_CONST(n) n ## ULL +#endif #endif #endif -#ifdef _LZMA_NO_SYSTEM_SIZE_T -typedef UInt32 SizeT; +#define UINT64_CONST(n) n + + +#ifdef Z7_DECL_SizeT_AS_unsigned_int +typedef unsigned int SizeT; #else typedef size_t SizeT; #endif +/* +#if (defined(_MSC_VER) && _MSC_VER <= 1200) +typedef size_t MY_uintptr_t; +#else +#include +typedef uintptr_t MY_uintptr_t; +#endif +*/ + typedef int BoolInt; /* typedef BoolInt Bool; */ #define True 1 @@ -110,81 +263,99 @@ typedef int BoolInt; #ifdef _WIN32 -#define MY_STD_CALL __stdcall +#define Z7_STDCALL __stdcall #else -#define MY_STD_CALL +#define Z7_STDCALL #endif #ifdef _MSC_VER #if _MSC_VER >= 1300 -#define MY_NO_INLINE __declspec(noinline) +#define Z7_NO_INLINE __declspec(noinline) #else -#define MY_NO_INLINE +#define Z7_NO_INLINE #endif -#define MY_FORCE_INLINE __forceinline +#define Z7_FORCE_INLINE __forceinline -#define MY_CDECL __cdecl -#define MY_FAST_CALL __fastcall +#define Z7_CDECL __cdecl +#define Z7_FASTCALL __fastcall -#else +#else // _MSC_VER -#define MY_NO_INLINE -#define MY_FORCE_INLINE -#define MY_CDECL -#define MY_FAST_CALL +#if (defined(__GNUC__) && (__GNUC__ >= 4)) \ + || (defined(__clang__) && (__clang_major__ >= 4)) \ + || defined(__INTEL_COMPILER) \ + || defined(__xlC__) +#define Z7_NO_INLINE __attribute__((noinline)) +#define Z7_FORCE_INLINE __attribute__((always_inline)) inline +#else +#define Z7_NO_INLINE +#define Z7_FORCE_INLINE +#endif -/* inline keyword : for C++ / C99 */ +#define Z7_CDECL -/* GCC, clang: */ -/* -#if defined (__GNUC__) && (__GNUC__ >= 4) -#define MY_FORCE_INLINE __attribute__((always_inline)) -#define MY_NO_INLINE __attribute__((noinline)) +#if defined(_M_IX86) \ + || defined(__i386__) +// #define Z7_FASTCALL __attribute__((fastcall)) +// #define Z7_FASTCALL __attribute__((cdecl)) +#define Z7_FASTCALL +#elif defined(MY_CPU_AMD64) +// #define Z7_FASTCALL __attribute__((ms_abi)) +#define Z7_FASTCALL +#else +#define Z7_FASTCALL #endif -*/ -#endif +#endif // _MSC_VER /* The following interfaces use first parameter as pointer to structure */ -typedef struct IByteIn IByteIn; -struct IByteIn +// #define Z7_C_IFACE_CONST_QUAL +#define Z7_C_IFACE_CONST_QUAL const + +#define Z7_C_IFACE_DECL(a) \ + struct a ## _; \ + typedef Z7_C_IFACE_CONST_QUAL struct a ## _ * a ## Ptr; \ + typedef struct a ## _ a; \ + struct a 
## _ + + +Z7_C_IFACE_DECL (IByteIn) { - Byte (*Read)(const IByteIn *p); /* reads one byte, returns 0 in case of EOF or error */ + Byte (*Read)(IByteInPtr p); /* reads one byte, returns 0 in case of EOF or error */ }; #define IByteIn_Read(p) (p)->Read(p) -typedef struct IByteOut IByteOut; -struct IByteOut +Z7_C_IFACE_DECL (IByteOut) { - void (*Write)(const IByteOut *p, Byte b); + void (*Write)(IByteOutPtr p, Byte b); }; #define IByteOut_Write(p, b) (p)->Write(p, b) -typedef struct ISeqInStream ISeqInStream; -struct ISeqInStream +Z7_C_IFACE_DECL (ISeqInStream) { - SRes (*Read)(const ISeqInStream *p, void *buf, size_t *size); + SRes (*Read)(ISeqInStreamPtr p, void *buf, size_t *size); /* if (input(*size) != 0 && output(*size) == 0) means end_of_stream. (output(*size) < input(*size)) is allowed */ }; #define ISeqInStream_Read(p, buf, size) (p)->Read(p, buf, size) +/* try to read as much as avail in stream and limited by (*processedSize) */ +SRes SeqInStream_ReadMax(ISeqInStreamPtr stream, void *buf, size_t *processedSize); /* it can return SZ_ERROR_INPUT_EOF */ -SRes SeqInStream_Read(const ISeqInStream *stream, void *buf, size_t size); -SRes SeqInStream_Read2(const ISeqInStream *stream, void *buf, size_t size, SRes errorType); -SRes SeqInStream_ReadByte(const ISeqInStream *stream, Byte *buf); +// SRes SeqInStream_Read(ISeqInStreamPtr stream, void *buf, size_t size); +// SRes SeqInStream_Read2(ISeqInStreamPtr stream, void *buf, size_t size, SRes errorType); +SRes SeqInStream_ReadByte(ISeqInStreamPtr stream, Byte *buf); -typedef struct ISeqOutStream ISeqOutStream; -struct ISeqOutStream +Z7_C_IFACE_DECL (ISeqOutStream) { - size_t (*Write)(const ISeqOutStream *p, const void *buf, size_t size); + size_t (*Write)(ISeqOutStreamPtr p, const void *buf, size_t size); /* Returns: result - the number of actually written bytes. (result < size) means error */ }; @@ -198,29 +369,26 @@ typedef enum } ESzSeek; -typedef struct ISeekInStream ISeekInStream; -struct ISeekInStream +Z7_C_IFACE_DECL (ISeekInStream) { - SRes (*Read)(const ISeekInStream *p, void *buf, size_t *size); /* same as ISeqInStream::Read */ - SRes (*Seek)(const ISeekInStream *p, Int64 *pos, ESzSeek origin); + SRes (*Read)(ISeekInStreamPtr p, void *buf, size_t *size); /* same as ISeqInStream::Read */ + SRes (*Seek)(ISeekInStreamPtr p, Int64 *pos, ESzSeek origin); }; #define ISeekInStream_Read(p, buf, size) (p)->Read(p, buf, size) #define ISeekInStream_Seek(p, pos, origin) (p)->Seek(p, pos, origin) -typedef struct ILookInStream ILookInStream; -struct ILookInStream +Z7_C_IFACE_DECL (ILookInStream) { - SRes (*Look)(const ILookInStream *p, const void **buf, size_t *size); + SRes (*Look)(ILookInStreamPtr p, const void **buf, size_t *size); /* if (input(*size) != 0 && output(*size) == 0) means end_of_stream. (output(*size) > input(*size)) is not allowed (output(*size) < input(*size)) is allowed */ - SRes (*Skip)(const ILookInStream *p, size_t offset); + SRes (*Skip)(ILookInStreamPtr p, size_t offset); /* offset must be <= output(*size) of Look */ - - SRes (*Read)(const ILookInStream *p, void *buf, size_t *size); + SRes (*Read)(ILookInStreamPtr p, void *buf, size_t *size); /* reads directly (without buffer). 
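Illustrative sketch (not part of the upstream diff): with the Z7_C_IFACE_DECL macro above, every callback interface now comes with a const-qualified pointer typedef (ISeqInStreamPtr and friends), and a user-defined stream still only fills in the vtable. Below is a hypothetical in-memory ISeqInStream; the SDK's own code recovers the containing object with the Z7_CONTAINER_FROM_VTBL helpers, while this sketch keeps the vtable as the first member so a cast through uintptr_t (which also drops the const qualifier) is enough.

#include <stdint.h>
#include <string.h>
#include "7zTypes.h"

typedef struct
{
  ISeqInStream vt;       /* must stay the first member for the cast below */
  const Byte *data;
  size_t rem;
} CBufInStream;

static SRes BufInStream_Read(ISeqInStreamPtr pp, void *buf, size_t *size)
{
  CBufInStream *p = (CBufInStream *)(uintptr_t)pp;
  size_t cur = *size;
  if (cur > p->rem)
    cur = p->rem;
  memcpy(buf, p->data, cur);
  p->data += cur;
  p->rem -= cur;
  *size = cur;           /* 0 here (with nonzero input) signals end of stream */
  return SZ_OK;
}

static void BufInStream_Init(CBufInStream *p, const void *data, size_t size)
{
  p->vt.Read = BufInStream_Read;
  p->data = (const Byte *)data;
  p->rem = size;
}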
It's same as ISeqInStream::Read */ - SRes (*Seek)(const ILookInStream *p, Int64 *pos, ESzSeek origin); + SRes (*Seek)(ILookInStreamPtr p, Int64 *pos, ESzSeek origin); }; #define ILookInStream_Look(p, buf, size) (p)->Look(p, buf, size) @@ -229,19 +397,18 @@ struct ILookInStream #define ILookInStream_Seek(p, pos, origin) (p)->Seek(p, pos, origin) -SRes LookInStream_LookRead(const ILookInStream *stream, void *buf, size_t *size); -SRes LookInStream_SeekTo(const ILookInStream *stream, UInt64 offset); +SRes LookInStream_LookRead(ILookInStreamPtr stream, void *buf, size_t *size); +SRes LookInStream_SeekTo(ILookInStreamPtr stream, UInt64 offset); /* reads via ILookInStream::Read */ -SRes LookInStream_Read2(const ILookInStream *stream, void *buf, size_t size, SRes errorType); -SRes LookInStream_Read(const ILookInStream *stream, void *buf, size_t size); - +SRes LookInStream_Read2(ILookInStreamPtr stream, void *buf, size_t size, SRes errorType); +SRes LookInStream_Read(ILookInStreamPtr stream, void *buf, size_t size); typedef struct { ILookInStream vt; - const ISeekInStream *realStream; + ISeekInStreamPtr realStream; size_t pos; size_t size; /* it's data size */ @@ -253,13 +420,13 @@ typedef struct void LookToRead2_CreateVTable(CLookToRead2 *p, int lookahead); -#define LookToRead2_Init(p) { (p)->pos = (p)->size = 0; } +#define LookToRead2_INIT(p) { (p)->pos = (p)->size = 0; } typedef struct { ISeqInStream vt; - const ILookInStream *realStream; + ILookInStreamPtr realStream; } CSecToLook; void SecToLook_CreateVTable(CSecToLook *p); @@ -269,20 +436,19 @@ void SecToLook_CreateVTable(CSecToLook *p); typedef struct { ISeqInStream vt; - const ILookInStream *realStream; + ILookInStreamPtr realStream; } CSecToRead; void SecToRead_CreateVTable(CSecToRead *p); -typedef struct ICompressProgress ICompressProgress; - -struct ICompressProgress +Z7_C_IFACE_DECL (ICompressProgress) { - SRes (*Progress)(const ICompressProgress *p, UInt64 inSize, UInt64 outSize); + SRes (*Progress)(ICompressProgressPtr p, UInt64 inSize, UInt64 outSize); /* Returns: result. (result != SZ_OK) means break. Value (UInt64)(Int64)-1 for size means unknown value. */ }; + #define ICompressProgress_Progress(p, inSize, outSize) (p)->Progress(p, inSize, outSize) @@ -320,13 +486,13 @@ struct ISzAlloc -#ifndef MY_container_of +#ifndef Z7_container_of /* -#define MY_container_of(ptr, type, m) container_of(ptr, type, m) -#define MY_container_of(ptr, type, m) CONTAINING_RECORD(ptr, type, m) -#define MY_container_of(ptr, type, m) ((type *)((char *)(ptr) - offsetof(type, m))) -#define MY_container_of(ptr, type, m) (&((type *)0)->m == (ptr), ((type *)(((char *)(ptr)) - MY_offsetof(type, m)))) +#define Z7_container_of(ptr, type, m) container_of(ptr, type, m) +#define Z7_container_of(ptr, type, m) CONTAINING_RECORD(ptr, type, m) +#define Z7_container_of(ptr, type, m) ((type *)((char *)(ptr) - offsetof(type, m))) +#define Z7_container_of(ptr, type, m) (&((type *)0)->m == (ptr), ((type *)(((char *)(ptr)) - MY_offsetof(type, m)))) */ /* @@ -335,23 +501,63 @@ struct ISzAlloc GCC 4.8.1 : classes with non-public variable members" */ -#define MY_container_of(ptr, type, m) ((type *)((char *)(1 ? (ptr) : &((type *)0)->m) - MY_offsetof(type, m))) +#define Z7_container_of(ptr, type, m) \ + ((type *)(void *)((char *)(void *) \ + (1 ? (ptr) : &((type *)NULL)->m) - MY_offsetof(type, m))) + +#define Z7_container_of_CONST(ptr, type, m) \ + ((const type *)(const void *)((const char *)(const void *) \ + (1 ? 
(ptr) : &((type *)NULL)->m) - MY_offsetof(type, m))) +/* +#define Z7_container_of_NON_CONST_FROM_CONST(ptr, type, m) \ + ((type *)(void *)(const void *)((const char *)(const void *) \ + (1 ? (ptr) : &((type *)NULL)->m) - MY_offsetof(type, m))) +*/ #endif -#define CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) ((type *)(ptr)) +#define Z7_CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) ((type *)(void *)(ptr)) -/* -#define CONTAINER_FROM_VTBL(ptr, type, m) CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) -*/ -#define CONTAINER_FROM_VTBL(ptr, type, m) MY_container_of(ptr, type, m) +// #define Z7_CONTAINER_FROM_VTBL(ptr, type, m) Z7_CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) +#define Z7_CONTAINER_FROM_VTBL(ptr, type, m) Z7_container_of(ptr, type, m) +// #define Z7_CONTAINER_FROM_VTBL(ptr, type, m) Z7_container_of_NON_CONST_FROM_CONST(ptr, type, m) + +#define Z7_CONTAINER_FROM_VTBL_CONST(ptr, type, m) Z7_container_of_CONST(ptr, type, m) -#define CONTAINER_FROM_VTBL_CLS(ptr, type, m) CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) +#define Z7_CONTAINER_FROM_VTBL_CLS(ptr, type, m) Z7_CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) /* -#define CONTAINER_FROM_VTBL_CLS(ptr, type, m) CONTAINER_FROM_VTBL(ptr, type, m) +#define Z7_CONTAINER_FROM_VTBL_CLS(ptr, type, m) Z7_CONTAINER_FROM_VTBL(ptr, type, m) */ +#if defined (__clang__) || defined(__GNUC__) +#define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") +#define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL \ + _Pragma("GCC diagnostic pop") +#else +#define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL +#define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL +#endif + +#define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(ptr, type, m, p) \ + Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \ + type *p = Z7_CONTAINER_FROM_VTBL(ptr, type, m); \ + Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL + +#define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(type) \ + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(pp, type, vt, p) + +// #define ZIP7_DECLARE_HANDLE(name) typedef void *name; +#define Z7_DECLARE_HANDLE(name) struct name##_dummy{int unused;}; typedef struct name##_dummy *name; + + +#define Z7_memset_0_ARRAY(a) memset((a), 0, sizeof(a)) + +#ifndef Z7_ARRAY_SIZE +#define Z7_ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])) +#endif #ifdef _WIN32 @@ -370,6 +576,22 @@ struct ISzAlloc #endif +#define k_PropVar_TimePrec_0 0 +#define k_PropVar_TimePrec_Unix 1 +#define k_PropVar_TimePrec_DOS 2 +#define k_PropVar_TimePrec_HighPrec 3 +#define k_PropVar_TimePrec_Base 16 +#define k_PropVar_TimePrec_100ns (k_PropVar_TimePrec_Base + 7) +#define k_PropVar_TimePrec_1ns (k_PropVar_TimePrec_Base + 9) + EXTERN_C_END #endif + +/* +#ifndef Z7_ST +#ifdef _7ZIP_ST +#define Z7_ST +#endif +#endif +*/ diff --git a/src/sdk/C/7zVersion.h b/src/sdk/C/7zVersion.h index c176823..b6142e9 100644 --- a/src/sdk/C/7zVersion.h +++ b/src/sdk/C/7zVersion.h @@ -1,7 +1,7 @@ -#define MY_VER_MAJOR 19 -#define MY_VER_MINOR 00 +#define MY_VER_MAJOR 25 +#define MY_VER_MINOR 1 #define MY_VER_BUILD 0 -#define MY_VERSION_NUMBERS "19.00" +#define MY_VERSION_NUMBERS "25.01" #define MY_VERSION MY_VERSION_NUMBERS #ifdef MY_CPU_NAME @@ -10,12 +10,12 @@ #define MY_VERSION_CPU MY_VERSION #endif -#define MY_DATE "2019-02-21" +#define MY_DATE "2025-08-03" #undef MY_COPYRIGHT #undef MY_VERSION_COPYRIGHT_DATE #define MY_AUTHOR_NAME "Igor Pavlov" #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" -#define MY_COPYRIGHT_CR "Copyright (c) 1999-2018 Igor Pavlov" +#define MY_COPYRIGHT_CR "Copyright (c) 1999-2025 Igor Pavlov" #ifdef USE_COPYRIGHT_CR #define 
MY_COPYRIGHT MY_COPYRIGHT_CR diff --git a/src/sdk/C/7zVersion.rc b/src/sdk/C/7zVersion.rc index e520995..6ed26de 100644 --- a/src/sdk/C/7zVersion.rc +++ b/src/sdk/C/7zVersion.rc @@ -1,55 +1,55 @@ -#define MY_VS_FFI_FILEFLAGSMASK 0x0000003FL -#define MY_VOS_NT_WINDOWS32 0x00040004L -#define MY_VOS_CE_WINDOWS32 0x00050004L - -#define MY_VFT_APP 0x00000001L -#define MY_VFT_DLL 0x00000002L - -// #include - -#ifndef MY_VERSION -#include "7zVersion.h" -#endif - -#define MY_VER MY_VER_MAJOR,MY_VER_MINOR,MY_VER_BUILD,0 - -#ifdef DEBUG -#define DBG_FL VS_FF_DEBUG -#else -#define DBG_FL 0 -#endif - -#define MY_VERSION_INFO(fileType, descr, intName, origName) \ -LANGUAGE 9, 1 \ -1 VERSIONINFO \ - FILEVERSION MY_VER \ - PRODUCTVERSION MY_VER \ - FILEFLAGSMASK MY_VS_FFI_FILEFLAGSMASK \ - FILEFLAGS DBG_FL \ - FILEOS MY_VOS_NT_WINDOWS32 \ - FILETYPE fileType \ - FILESUBTYPE 0x0L \ -BEGIN \ - BLOCK "StringFileInfo" \ - BEGIN \ - BLOCK "040904b0" \ - BEGIN \ - VALUE "CompanyName", "Igor Pavlov" \ - VALUE "FileDescription", descr \ - VALUE "FileVersion", MY_VERSION \ - VALUE "InternalName", intName \ - VALUE "LegalCopyright", MY_COPYRIGHT \ - VALUE "OriginalFilename", origName \ - VALUE "ProductName", "7-Zip" \ - VALUE "ProductVersion", MY_VERSION \ - END \ - END \ - BLOCK "VarFileInfo" \ - BEGIN \ - VALUE "Translation", 0x409, 1200 \ - END \ -END - -#define MY_VERSION_INFO_APP(descr, intName) MY_VERSION_INFO(MY_VFT_APP, descr, intName, intName ".exe") - -#define MY_VERSION_INFO_DLL(descr, intName) MY_VERSION_INFO(MY_VFT_DLL, descr, intName, intName ".dll") +#define MY_VS_FFI_FILEFLAGSMASK 0x0000003FL +#define MY_VOS_NT_WINDOWS32 0x00040004L +#define MY_VOS_CE_WINDOWS32 0x00050004L + +#define MY_VFT_APP 0x00000001L +#define MY_VFT_DLL 0x00000002L + +// #include + +#ifndef MY_VERSION +#include "7zVersion.h" +#endif + +#define MY_VER MY_VER_MAJOR,MY_VER_MINOR,MY_VER_BUILD,0 + +#ifdef DEBUG +#define DBG_FL VS_FF_DEBUG +#else +#define DBG_FL 0 +#endif + +#define MY_VERSION_INFO(fileType, descr, intName, origName) \ +LANGUAGE 9, 1 \ +1 VERSIONINFO \ + FILEVERSION MY_VER \ + PRODUCTVERSION MY_VER \ + FILEFLAGSMASK MY_VS_FFI_FILEFLAGSMASK \ + FILEFLAGS DBG_FL \ + FILEOS MY_VOS_NT_WINDOWS32 \ + FILETYPE fileType \ + FILESUBTYPE 0x0L \ +BEGIN \ + BLOCK "StringFileInfo" \ + BEGIN \ + BLOCK "040904b0" \ + BEGIN \ + VALUE "CompanyName", "Igor Pavlov" \ + VALUE "FileDescription", descr \ + VALUE "FileVersion", MY_VERSION \ + VALUE "InternalName", intName \ + VALUE "LegalCopyright", MY_COPYRIGHT \ + VALUE "OriginalFilename", origName \ + VALUE "ProductName", "7-Zip" \ + VALUE "ProductVersion", MY_VERSION \ + END \ + END \ + BLOCK "VarFileInfo" \ + BEGIN \ + VALUE "Translation", 0x409, 1200 \ + END \ +END + +#define MY_VERSION_INFO_APP(descr, intName) MY_VERSION_INFO(MY_VFT_APP, descr, intName, intName ".exe") + +#define MY_VERSION_INFO_DLL(descr, intName) MY_VERSION_INFO(MY_VFT_DLL, descr, intName, intName ".dll") diff --git a/src/sdk/C/7zWindows.h b/src/sdk/C/7zWindows.h new file mode 100644 index 0000000..42c6db8 --- /dev/null +++ b/src/sdk/C/7zWindows.h @@ -0,0 +1,101 @@ +/* 7zWindows.h -- StdAfx +2023-04-02 : Igor Pavlov : Public domain */ + +#ifndef ZIP7_INC_7Z_WINDOWS_H +#define ZIP7_INC_7Z_WINDOWS_H + +#ifdef _WIN32 + +#if defined(__clang__) +# pragma clang diagnostic push +#endif + +#if defined(_MSC_VER) + +#pragma warning(push) +#pragma warning(disable : 4668) // '_WIN32_WINNT' is not defined as a preprocessor macro, replacing with '0' for '#if/#elif' + +#if _MSC_VER == 1900 +// for old kit10 versions +// 
#pragma warning(disable : 4255) // winuser.h(13979): warning C4255: 'GetThreadDpiAwarenessContext': +#endif +// win10 Windows Kit: +#endif // _MSC_VER + +#if defined(_MSC_VER) && _MSC_VER <= 1200 && !defined(_WIN64) +// for msvc6 without sdk2003 +#define RPC_NO_WINDOWS_H +#endif + +#if defined(__MINGW32__) || defined(__MINGW64__) +// #if defined(__GNUC__) && !defined(__clang__) +#include +#else +#include +#endif +// #include +// #include + +// but if precompiled with clang-cl then we need +// #include +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +#if defined(__clang__) +# pragma clang diagnostic pop +#endif + +#if defined(_MSC_VER) && _MSC_VER <= 1200 && !defined(_WIN64) +#ifndef _W64 + +typedef long LONG_PTR, *PLONG_PTR; +typedef unsigned long ULONG_PTR, *PULONG_PTR; +typedef ULONG_PTR DWORD_PTR, *PDWORD_PTR; + +#define Z7_OLD_WIN_SDK +#endif // _W64 +#endif // _MSC_VER == 1200 + +#ifdef Z7_OLD_WIN_SDK + +#ifndef INVALID_FILE_ATTRIBUTES +#define INVALID_FILE_ATTRIBUTES ((DWORD)-1) +#endif +#ifndef INVALID_SET_FILE_POINTER +#define INVALID_SET_FILE_POINTER ((DWORD)-1) +#endif +#ifndef FILE_SPECIAL_ACCESS +#define FILE_SPECIAL_ACCESS (FILE_ANY_ACCESS) +#endif + +// ShlObj.h: +// #define BIF_NEWDIALOGSTYLE 0x0040 + +#pragma warning(disable : 4201) +// #pragma warning(disable : 4115) + +#undef VARIANT_TRUE +#define VARIANT_TRUE ((VARIANT_BOOL)-1) +#endif + +#endif // Z7_OLD_WIN_SDK + +#ifdef UNDER_CE +#undef VARIANT_TRUE +#define VARIANT_TRUE ((VARIANT_BOOL)-1) +#endif + + +#if defined(_MSC_VER) +#if _MSC_VER >= 1400 && _MSC_VER <= 1600 + // BaseTsd.h(148) : 'HandleToULong' : unreferenced inline function has been removed + // string.h + // #pragma warning(disable : 4514) +#endif +#endif + + +/* #include "7zTypes.h" */ + +#endif diff --git a/src/sdk/C/Aes.c b/src/sdk/C/Aes.c index 1cdd0e7..abc5d24 100644 --- a/src/sdk/C/Aes.c +++ b/src/sdk/C/Aes.c @@ -1,12 +1,21 @@ /* Aes.c -- AES encryption / decryption -2017-01-24 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" -#include "Aes.h" #include "CpuArch.h" +#include "Aes.h" +AES_CODE_FUNC g_AesCbc_Decode; +#ifndef Z7_SFX +AES_CODE_FUNC g_AesCbc_Encode; +AES_CODE_FUNC g_AesCtr_Code; +UInt32 g_Aes_SupportedFunctions_Flags; +#endif + +MY_ALIGN(64) static UInt32 T[256 * 4]; +MY_ALIGN(64) static const Byte Sbox[256] = { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, @@ -25,23 +34,12 @@ static const Byte Sbox[256] = { 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; -void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks); - -void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *ivAes, Byte *data, size_t numBlocks); - -AES_CODE_FUNC g_AesCbc_Encode; -AES_CODE_FUNC g_AesCbc_Decode; -AES_CODE_FUNC g_AesCtr_Code; +MY_ALIGN(64) static UInt32 D[256 * 4]; +MY_ALIGN(64) static Byte InvS[256]; -static const Byte Rcon[11] = { 0x00, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 
}; - #define xtime(x) ((((x) << 1) ^ (((x) & 0x80) != 0 ? 0x1B : 0)) & 0xFF) #define Ui32(a0, a1, a2, a3) ((UInt32)(a0) | ((UInt32)(a1) << 8) | ((UInt32)(a2) << 16) | ((UInt32)(a3) << 24)) @@ -57,6 +55,66 @@ static const Byte Rcon[11] = { 0x00, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0 #define DD(x) (D + (x << 8)) +// #define Z7_SHOW_AES_STATUS + +#ifdef MY_CPU_X86_OR_AMD64 + + #if defined(__INTEL_COMPILER) + #if (__INTEL_COMPILER >= 1110) + #define USE_HW_AES + #if (__INTEL_COMPILER >= 1900) + #define USE_HW_VAES + #endif + #endif + #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400) + #define USE_HW_AES + #if defined(__clang__) && (__clang_major__ >= 8) \ + || defined(__GNUC__) && (__GNUC__ >= 8) + #define USE_HW_VAES + #endif + #elif defined(_MSC_VER) + #define USE_HW_AES + #define USE_HW_VAES + #endif + +#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) + + #if defined(__ARM_FEATURE_AES) \ + || defined(__ARM_FEATURE_CRYPTO) + #define USE_HW_AES + #else + #if defined(MY_CPU_ARM64) \ + || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ + || defined(Z7_MSC_VER_ORIGINAL) + #if defined(__ARM_FP) && \ + ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) \ + ) \ + || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) + #if defined(MY_CPU_ARM64) \ + || !defined(Z7_CLANG_VERSION) \ + || defined(__ARM_NEON) && \ + (Z7_CLANG_VERSION < 170000 || \ + Z7_CLANG_VERSION > 170001) + #define USE_HW_AES + #endif + #endif + #endif + #endif +#endif + +#ifdef USE_HW_AES +// #pragma message("=== Aes.c USE_HW_AES === ") +#ifdef Z7_SHOW_AES_STATUS +#include +#define PRF(x) x +#else +#define PRF(x) +#endif +#endif + + void AesGenTables(void) { unsigned i; @@ -66,23 +124,23 @@ void AesGenTables(void) for (i = 0; i < 256; i++) { { - UInt32 a1 = Sbox[i]; - UInt32 a2 = xtime(a1); - UInt32 a3 = a2 ^ a1; + const UInt32 a1 = Sbox[i]; + const UInt32 a2 = xtime(a1); + const UInt32 a3 = a2 ^ a1; TT(0)[i] = Ui32(a2, a1, a1, a3); TT(1)[i] = Ui32(a3, a2, a1, a1); TT(2)[i] = Ui32(a1, a3, a2, a1); TT(3)[i] = Ui32(a1, a1, a3, a2); } { - UInt32 a1 = InvS[i]; - UInt32 a2 = xtime(a1); - UInt32 a4 = xtime(a2); - UInt32 a8 = xtime(a4); - UInt32 a9 = a8 ^ a1; - UInt32 aB = a8 ^ a2 ^ a1; - UInt32 aD = a8 ^ a4 ^ a1; - UInt32 aE = a8 ^ a4 ^ a2; + const UInt32 a1 = InvS[i]; + const UInt32 a2 = xtime(a1); + const UInt32 a4 = xtime(a2); + const UInt32 a8 = xtime(a4); + const UInt32 a9 = a8 ^ a1; + const UInt32 aB = a8 ^ a2 ^ a1; + const UInt32 aD = a8 ^ a4 ^ a1; + const UInt32 aE = a8 ^ a4 ^ a2; DD(0)[i] = Ui32(aE, a9, aD, aB); DD(1)[i] = Ui32(aB, aE, a9, aD); DD(2)[i] = Ui32(aD, aB, aE, a9); @@ -90,18 +148,50 @@ void AesGenTables(void) } } - g_AesCbc_Encode = AesCbc_Encode; - g_AesCbc_Decode = AesCbc_Decode; - g_AesCtr_Code = AesCtr_Code; + { + AES_CODE_FUNC d = AesCbc_Decode; + #ifndef Z7_SFX + AES_CODE_FUNC e = AesCbc_Encode; + AES_CODE_FUNC c = AesCtr_Code; + UInt32 flags = 0; + #endif - #ifdef MY_CPU_X86_OR_AMD64 - if (CPU_Is_Aes_Supported()) + #ifdef USE_HW_AES + if (CPU_IsSupported_AES()) { - g_AesCbc_Encode = AesCbc_Encode_Intel; - g_AesCbc_Decode = AesCbc_Decode_Intel; - g_AesCtr_Code = AesCtr_Code_Intel; + // #pragma message ("AES HW") + PRF(printf("\n===AES HW\n")); + d = AesCbc_Decode_HW; + + #ifndef Z7_SFX + e = AesCbc_Encode_HW; + c = AesCtr_Code_HW; + flags = k_Aes_SupportedFunctions_HW; + #endif + + #ifdef MY_CPU_X86_OR_AMD64 + #ifdef USE_HW_VAES + if (CPU_IsSupported_VAES_AVX2()) + { + 
PRF(printf("\n===vaes avx2\n")); + d = AesCbc_Decode_HW_256; + #ifndef Z7_SFX + c = AesCtr_Code_HW_256; + flags |= k_Aes_SupportedFunctions_HW_256; + #endif + } + #endif + #endif } #endif + + g_AesCbc_Decode = d; + #ifndef Z7_SFX + g_AesCbc_Encode = e; + g_AesCtr_Code = c; + g_Aes_SupportedFunctions_Flags = flags; + #endif + } } @@ -140,10 +230,13 @@ void AesGenTables(void) #define FD(i, x) InvS[gb(x, m[(i - x) & 3])] #define FD4(i) dest[i] = Ui32(FD(i, 0), FD(i, 1), FD(i, 2), FD(i, 3)) ^ w[i]; -void MY_FAST_CALL Aes_SetKey_Enc(UInt32 *w, const Byte *key, unsigned keySize) +void Z7_FASTCALL Aes_SetKey_Enc(UInt32 *w, const Byte *key, unsigned keySize) { - unsigned i, wSize; - wSize = keySize + 28; + unsigned i, m; + const UInt32 *wLim; + UInt32 t; + UInt32 rcon = 1; + keySize /= 4; w[0] = ((UInt32)keySize / 2) + 3; w += 4; @@ -151,19 +244,29 @@ void MY_FAST_CALL Aes_SetKey_Enc(UInt32 *w, const Byte *key, unsigned keySize) for (i = 0; i < keySize; i++, key += 4) w[i] = GetUi32(key); - for (; i < wSize; i++) + t = w[(size_t)keySize - 1]; + wLim = w + (size_t)keySize * 3 + 28; + m = 0; + do { - UInt32 t = w[(size_t)i - 1]; - unsigned rem = i % keySize; - if (rem == 0) - t = Ui32(Sbox[gb1(t)] ^ Rcon[i / keySize], Sbox[gb2(t)], Sbox[gb3(t)], Sbox[gb0(t)]); - else if (keySize > 6 && rem == 4) + if (m == 0) + { + t = Ui32(Sbox[gb1(t)] ^ rcon, Sbox[gb2(t)], Sbox[gb3(t)], Sbox[gb0(t)]); + rcon <<= 1; + if (rcon & 0x100) + rcon = 0x1b; + m = keySize; + } + else if (m == 4 && keySize > 6) t = Ui32(Sbox[gb0(t)], Sbox[gb1(t)], Sbox[gb2(t)], Sbox[gb3(t)]); - w[i] = w[i - keySize] ^ t; + m--; + t ^= w[0]; + w[keySize] = t; } + while (++w != wLim); } -void MY_FAST_CALL Aes_SetKey_Dec(UInt32 *w, const Byte *key, unsigned keySize) +void Z7_FASTCALL Aes_SetKey_Dec(UInt32 *w, const Byte *key, unsigned keySize) { unsigned i, num; Aes_SetKey_Enc(w, key, keySize); @@ -184,6 +287,7 @@ void MY_FAST_CALL Aes_SetKey_Dec(UInt32 *w, const Byte *key, unsigned keySize) src and dest are pointers to 4 UInt32 words. 
src and dest can point to same block */ +// Z7_FORCE_INLINE static void Aes_Encode(const UInt32 *w, UInt32 *dest, const UInt32 *src) { UInt32 s[4]; @@ -197,16 +301,20 @@ static void Aes_Encode(const UInt32 *w, UInt32 *dest, const UInt32 *src) w += 4; for (;;) { - HT16(m, s, 0); + HT16(m, s, 0) if (--numRounds2 == 0) break; - HT16(s, m, 4); + HT16(s, m, 4) w += 8; } w += 4; - FT4(0); FT4(1); FT4(2); FT4(3); + FT4(0) + FT4(1) + FT4(2) + FT4(3) } +Z7_FORCE_INLINE static void Aes_Decode(const UInt32 *w, UInt32 *dest, const UInt32 *src) { UInt32 s[4]; @@ -220,12 +328,15 @@ static void Aes_Decode(const UInt32 *w, UInt32 *dest, const UInt32 *src) for (;;) { w -= 8; - HD16(m, s, 4); + HD16(m, s, 4) if (--numRounds2 == 0) break; - HD16(s, m, 0); + HD16(s, m, 0) } - FD4(0); FD4(1); FD4(2); FD4(3); + FD4(0) + FD4(1) + FD4(2) + FD4(3) } void AesCbc_Init(UInt32 *p, const Byte *iv) @@ -235,7 +346,7 @@ void AesCbc_Init(UInt32 *p, const Byte *iv) p[i] = GetUi32(iv + i * 4); } -void MY_FAST_CALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks) +void Z7_FASTCALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks) { for (; numBlocks != 0; numBlocks--, data += AES_BLOCK_SIZE) { @@ -246,14 +357,14 @@ void MY_FAST_CALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks) Aes_Encode(p + 4, p, p); - SetUi32(data, p[0]); - SetUi32(data + 4, p[1]); - SetUi32(data + 8, p[2]); - SetUi32(data + 12, p[3]); + SetUi32(data, p[0]) + SetUi32(data + 4, p[1]) + SetUi32(data + 8, p[2]) + SetUi32(data + 12, p[3]) } } -void MY_FAST_CALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks) +void Z7_FASTCALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks) { UInt32 in[4], out[4]; for (; numBlocks != 0; numBlocks--, data += AES_BLOCK_SIZE) @@ -265,10 +376,10 @@ void MY_FAST_CALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks) Aes_Decode(p + 4, out, in); - SetUi32(data, p[0] ^ out[0]); - SetUi32(data + 4, p[1] ^ out[1]); - SetUi32(data + 8, p[2] ^ out[2]); - SetUi32(data + 12, p[3] ^ out[3]); + SetUi32(data, p[0] ^ out[0]) + SetUi32(data + 4, p[1] ^ out[1]) + SetUi32(data + 8, p[2] ^ out[2]) + SetUi32(data + 12, p[3] ^ out[3]) p[0] = in[0]; p[1] = in[1]; @@ -277,7 +388,7 @@ void MY_FAST_CALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks) } } -void MY_FAST_CALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks) +void Z7_FASTCALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks) { for (; numBlocks != 0; numBlocks--) { @@ -291,16 +402,28 @@ void MY_FAST_CALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks) for (i = 0; i < 4; i++, data += 4) { - UInt32 t = temp[i]; + const UInt32 t = temp[i]; #ifdef MY_CPU_LE_UNALIGN - *((UInt32 *)data) ^= t; + *((UInt32 *)(void *)data) ^= t; #else - data[0] ^= (t & 0xFF); - data[1] ^= ((t >> 8) & 0xFF); - data[2] ^= ((t >> 16) & 0xFF); - data[3] ^= ((t >> 24)); + data[0] = (Byte)(data[0] ^ (t & 0xFF)); + data[1] = (Byte)(data[1] ^ ((t >> 8) & 0xFF)); + data[2] = (Byte)(data[2] ^ ((t >> 16) & 0xFF)); + data[3] = (Byte)(data[3] ^ ((t >> 24))); #endif } } } + +#undef xtime +#undef Ui32 +#undef gb0 +#undef gb1 +#undef gb2 +#undef gb3 +#undef gb +#undef TT +#undef DD +#undef USE_HW_AES +#undef PRF diff --git a/src/sdk/C/Aes.h b/src/sdk/C/Aes.h index 64979b5..7f0182a 100644 --- a/src/sdk/C/Aes.h +++ b/src/sdk/C/Aes.h @@ -1,8 +1,8 @@ /* Aes.h -- AES encryption / decryption -2013-01-18 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ -#ifndef __AES_H -#define __AES_H +#ifndef ZIP7_INC_AES_H +#define ZIP7_INC_AES_H #include "7zTypes.h" @@ 
-20,18 +20,40 @@ void AesGenTables(void); /* aes - 16-byte aligned pointer to keyMode+roundKeys sequence */ /* keySize = 16 or 24 or 32 (bytes) */ -typedef void (MY_FAST_CALL *AES_SET_KEY_FUNC)(UInt32 *aes, const Byte *key, unsigned keySize); -void MY_FAST_CALL Aes_SetKey_Enc(UInt32 *aes, const Byte *key, unsigned keySize); -void MY_FAST_CALL Aes_SetKey_Dec(UInt32 *aes, const Byte *key, unsigned keySize); +typedef void (Z7_FASTCALL *AES_SET_KEY_FUNC)(UInt32 *aes, const Byte *key, unsigned keySize); +void Z7_FASTCALL Aes_SetKey_Enc(UInt32 *aes, const Byte *key, unsigned keySize); +void Z7_FASTCALL Aes_SetKey_Dec(UInt32 *aes, const Byte *key, unsigned keySize); /* ivAes - 16-byte aligned pointer to iv+keyMode+roundKeys sequence: UInt32[AES_NUM_IVMRK_WORDS] */ void AesCbc_Init(UInt32 *ivAes, const Byte *iv); /* iv size is AES_BLOCK_SIZE */ + /* data - 16-byte aligned pointer to data */ /* numBlocks - the number of 16-byte blocks in data array */ -typedef void (MY_FAST_CALL *AES_CODE_FUNC)(UInt32 *ivAes, Byte *data, size_t numBlocks); -extern AES_CODE_FUNC g_AesCbc_Encode; +typedef void (Z7_FASTCALL *AES_CODE_FUNC)(UInt32 *ivAes, Byte *data, size_t numBlocks); + extern AES_CODE_FUNC g_AesCbc_Decode; +#ifndef Z7_SFX +extern AES_CODE_FUNC g_AesCbc_Encode; extern AES_CODE_FUNC g_AesCtr_Code; +#define k_Aes_SupportedFunctions_HW (1 << 2) +#define k_Aes_SupportedFunctions_HW_256 (1 << 3) +extern UInt32 g_Aes_SupportedFunctions_Flags; +#endif + + +#define Z7_DECLARE_AES_CODE_FUNC(funcName) \ + void Z7_FASTCALL funcName(UInt32 *ivAes, Byte *data, size_t numBlocks); + +Z7_DECLARE_AES_CODE_FUNC (AesCbc_Encode) +Z7_DECLARE_AES_CODE_FUNC (AesCbc_Decode) +Z7_DECLARE_AES_CODE_FUNC (AesCtr_Code) + +Z7_DECLARE_AES_CODE_FUNC (AesCbc_Encode_HW) +Z7_DECLARE_AES_CODE_FUNC (AesCbc_Decode_HW) +Z7_DECLARE_AES_CODE_FUNC (AesCtr_Code_HW) + +Z7_DECLARE_AES_CODE_FUNC (AesCbc_Decode_HW_256) +Z7_DECLARE_AES_CODE_FUNC (AesCtr_Code_HW_256) EXTERN_C_END diff --git a/src/sdk/C/AesOpt.c b/src/sdk/C/AesOpt.c index 9571c46..b281807 100644 --- a/src/sdk/C/AesOpt.c +++ b/src/sdk/C/AesOpt.c @@ -1,184 +1,1002 @@ -/* AesOpt.c -- Intel's AES -2017-06-08 : Igor Pavlov : Public domain */ +/* AesOpt.c -- AES optimized code for x86 AES hardware instructions +Igor Pavlov : Public domain */ #include "Precomp.h" +#include "Aes.h" #include "CpuArch.h" #ifdef MY_CPU_X86_OR_AMD64 -#if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729) -#define USE_INTEL_AES -#endif -#endif + + #if defined(__INTEL_COMPILER) + #if (__INTEL_COMPILER >= 1110) + #define USE_INTEL_AES + #if (__INTEL_COMPILER >= 1900) + #define USE_INTEL_VAES + #endif + #endif + #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400) + #define USE_INTEL_AES + #if !defined(__AES__) + #define ATTRIB_AES __attribute__((__target__("aes"))) + #endif + #if defined(__clang__) && (__clang_major__ >= 8) \ + || defined(__GNUC__) && (__GNUC__ >= 8) + #define USE_INTEL_VAES + #if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__) + #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2"))) + #endif + #endif + #elif defined(_MSC_VER) + #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729) + #define USE_INTEL_AES + #if (_MSC_VER >= 1910) + #define USE_INTEL_VAES + #endif + #endif + #ifndef USE_INTEL_AES + #define Z7_USE_AES_HW_STUB + #endif + #ifndef USE_INTEL_VAES + #define Z7_USE_VAES_HW_STUB + #endif + #endif + + #ifndef USE_INTEL_AES + // #define Z7_USE_AES_HW_STUB // for debug + #endif 
+ #ifndef USE_INTEL_VAES + // #define Z7_USE_VAES_HW_STUB // for debug + #endif + #ifdef USE_INTEL_AES #include -void MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks) +#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB) +#define AES_TYPE_keys UInt32 +#define AES_TYPE_data Byte +// #define AES_TYPE_keys __m128i +// #define AES_TYPE_data __m128i +#endif + +#ifndef ATTRIB_AES + #define ATTRIB_AES +#endif + +#define AES_FUNC_START(name) \ + void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks) + // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks) + +#define AES_FUNC_START2(name) \ +AES_FUNC_START (name); \ +ATTRIB_AES \ +AES_FUNC_START (name) + +#define MM_OP(op, dest, src) dest = op(dest, src); +#define MM_OP_m(op, src) MM_OP(op, m, src) + +#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) + +#if 1 +// use aligned SSE load/store for data. +// It is required for our Aes functions, that data is aligned for 16-bytes. +// So we can use this branch of code. +// and compiler can use fused load-op SSE instructions: +// xorps xmm0, XMMWORD PTR [rdx] +#define LOAD_128(pp) (*(__m128i *)(void *)(pp)) +#define STORE_128(pp, _v) *(__m128i *)(void *)(pp) = _v +// use aligned SSE load/store for data. Alternative code with direct access +// #define LOAD_128(pp) _mm_load_si128(pp) +// #define STORE_128(pp, _v) _mm_store_si128(pp, _v) +#else +// use unaligned load/store for data: movdqu XMMWORD PTR [rdx] +#define LOAD_128(pp) _mm_loadu_si128(pp) +#define STORE_128(pp, _v) _mm_storeu_si128(pp, _v) +#endif + +AES_FUNC_START2 (AesCbc_Encode_HW) { + if (numBlocks == 0) + return; + { + __m128i *p = (__m128i *)(void *)ivAes; + __m128i *data = (__m128i *)(void *)data8; __m128i m = *p; - for (; numBlocks != 0; numBlocks--, data++) + const __m128i k0 = p[2]; + const __m128i k1 = p[3]; + const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; + do { - UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; - const __m128i *w = p + 3; - m = _mm_xor_si128(m, *data); - m = _mm_xor_si128(m, p[2]); + UInt32 r = numRounds2; + const __m128i *w = p + 4; + __m128i temp = LOAD_128(data); + MM_XOR (temp, k0) + MM_XOR (m, temp) + MM_OP_m (_mm_aesenc_si128, k1) do { - m = _mm_aesenc_si128(m, w[0]); - m = _mm_aesenc_si128(m, w[1]); + MM_OP_m (_mm_aesenc_si128, w[0]) + MM_OP_m (_mm_aesenc_si128, w[1]) w += 2; } - while (--numRounds2 != 0); - m = _mm_aesenc_si128(m, w[0]); - m = _mm_aesenclast_si128(m, w[1]); - *data = m; + while (--r); + MM_OP_m (_mm_aesenclast_si128, w[0]) + STORE_128(data, m); + data++; } + while (--numBlocks); *p = m; + } } -#define NUM_WAYS 3 -#define AES_OP_W(op, n) { \ - const __m128i t = w[n]; \ - m0 = op(m0, t); \ - m1 = op(m1, t); \ - m2 = op(m2, t); \ - } +#define WOP_1(op) +#define WOP_2(op) WOP_1 (op) op (m1, 1) +#define WOP_3(op) WOP_2 (op) op (m2, 2) +#define WOP_4(op) WOP_3 (op) op (m3, 3) +#ifdef MY_CPU_AMD64 +#define WOP_5(op) WOP_4 (op) op (m4, 4) +#define WOP_6(op) WOP_5 (op) op (m5, 5) +#define WOP_7(op) WOP_6 (op) op (m6, 6) +#define WOP_8(op) WOP_7 (op) op (m7, 7) +#endif +/* +#define WOP_9(op) WOP_8 (op) op (m8, 8); +#define WOP_10(op) WOP_9 (op) op (m9, 9); +#define WOP_11(op) WOP_10(op) op (m10, 10); +#define WOP_12(op) WOP_11(op) op (m11, 11); +#define WOP_13(op) WOP_12(op) op (m12, 12); +#define WOP_14(op) WOP_13(op) op (m13, 13); +*/ + +#ifdef MY_CPU_AMD64 + #define NUM_WAYS 8 + #define WOP_M1 WOP_8 +#else + #define NUM_WAYS 4 + #define WOP_M1 WOP_4 +#endif + +#define WOP(op) op (m0, 0) WOP_M1(op) + +#define 
DECLARE_VAR(reg, ii) __m128i reg; +#define LOAD_data_ii(ii) LOAD_128(data + (ii)) +#define LOAD_data( reg, ii) reg = LOAD_data_ii(ii); +#define STORE_data( reg, ii) STORE_128(data + (ii), reg); +#if (NUM_WAYS > 1) +#define XOR_data_M1(reg, ii) MM_XOR (reg, LOAD_128(data + (ii- 1))) +#endif + +#define MM_OP_key(op, reg) MM_OP(op, reg, key); -#define AES_DEC(n) AES_OP_W(_mm_aesdec_si128, n) -#define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n) -#define AES_ENC(n) AES_OP_W(_mm_aesenc_si128, n) -#define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n) +#define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg) +#define AES_DEC_LAST( reg, ii) MM_OP_key (_mm_aesdeclast_si128, reg) +#define AES_ENC( reg, ii) MM_OP_key (_mm_aesenc_si128, reg) +#define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg) +#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) -void MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks) +#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; +#define CTR_END( reg, ii) STORE_128(data + (ii), _mm_xor_si128(reg, \ + LOAD_128 (data + (ii)))); +#define WOP_KEY(op, n) { \ + const __m128i key = w[n]; \ + WOP(op) } + +#define WIDE_LOOP_START \ + dataEnd = data + numBlocks; \ + if (numBlocks >= NUM_WAYS) \ + { dataEnd -= NUM_WAYS; do { \ + +#define WIDE_LOOP_END \ + data += NUM_WAYS; \ + } while (data <= dataEnd); \ + dataEnd += NUM_WAYS; } \ + +#define SINGLE_LOOP \ + for (; data < dataEnd; data++) + + + +#ifdef USE_INTEL_VAES + +#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) +#define AVX_DECLARE_VAR(reg, ii) __m256i reg; + +#if 1 +// use unaligned AVX load/store for data. +// It is required for our Aes functions, that data is aligned for 16-bytes. +// But we need 32-bytes reading. +// So we use intrinsics for unaligned AVX load/store. +// notes for _mm256_storeu_si256: +// msvc2022: uses vmovdqu and keeps the order of instruction sequence. +// new gcc11 uses vmovdqu +// old gcc9 could use pair of instructions: +// vmovups %xmm7, -224(%rax) +// vextracti128 $0x1, %ymm7, -208(%rax) +#define AVX_LOAD(p) _mm256_loadu_si256((const __m256i *)(const void *)(p)) +#define AVX_STORE(p, _v) _mm256_storeu_si256((__m256i *)(void *)(p), _v); +#else +// use aligned AVX load/store for data. +// for debug: we can use this branch, if we are sure that data is aligned for 32-bytes. +// msvc2022 uses vmovdqu still +// gcc uses vmovdqa (that requires 32-bytes alignment) +#define AVX_LOAD(p) (*(const __m256i *)(const void *)(p)) +#define AVX_STORE(p, _v) (*(__m256i *)(void *)(p)) = _v; +#endif + +#define AVX_LOAD_data( reg, ii) reg = AVX_LOAD((const __m256i *)(const void *)data + (ii)); +#define AVX_STORE_data( reg, ii) AVX_STORE((__m256i *)(void *)data + (ii), reg) +/* +AVX_XOR_data_M1() needs unaligned memory load, even if (data) +is aligned for 256-bits, because we read 32-bytes chunk that +crosses (data) position: from (data - 16bytes) to (data + 16bytes). 
+*/ +#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii))) + +#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) +#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) +#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) +#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) +#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) +#define AVX_CTR_START(reg, ii) \ + MM_OP (_mm256_add_epi64, ctr2, two) \ + reg = _mm256_xor_si256(ctr2, key); + +#define AVX_CTR_END(reg, ii) \ + AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \ + AVX_LOAD ((__m256i *)(void *)data + (ii)))); + +#define AVX_WOP_KEY(op, n) { \ + const __m256i key = w[n]; \ + WOP(op) } + +#define NUM_AES_KEYS_MAX 15 + +#define WIDE_LOOP_START_AVX(OP) \ + dataEnd = data + numBlocks; \ + if (numBlocks >= NUM_WAYS * 2) \ + { __m256i keys[NUM_AES_KEYS_MAX]; \ + OP \ + { UInt32 ii; for (ii = 0; ii < numRounds; ii++) \ + keys[ii] = _mm256_broadcastsi128_si256(p[ii]); } \ + dataEnd -= NUM_WAYS * 2; \ + do { \ + +#define WIDE_LOOP_END_AVX(OP) \ + data += NUM_WAYS * 2; \ + } while (data <= dataEnd); \ + dataEnd += NUM_WAYS * 2; \ + OP \ + _mm256_zeroupper(); \ + } \ + +/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, + MSVC still can insert vzeroupper instruction. */ + +#endif + + + +AES_FUNC_START2 (AesCbc_Decode_HW) { + __m128i *p = (__m128i *)(void *)ivAes; + __m128i *data = (__m128i *)(void *)data8; __m128i iv = *p; - for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS) + const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1; + const __m128i *dataEnd; + p += 2; + + WIDE_LOOP_START { - UInt32 numRounds2 = *(const UInt32 *)(p + 1); - const __m128i *w = p + numRounds2 * 2; - __m128i m0, m1, m2; + const __m128i *w = wStart; + WOP (DECLARE_VAR) + WOP (LOAD_data) + WOP_KEY (AES_XOR, 1) + do { - const __m128i t = w[2]; - m0 = _mm_xor_si128(t, data[0]); - m1 = _mm_xor_si128(t, data[1]); - m2 = _mm_xor_si128(t, data[2]); + WOP_KEY (AES_DEC, 0) + + w--; } - numRounds2--; + while (w != p); + WOP_KEY (AES_DEC_LAST, 0) + + MM_XOR (m0, iv) + WOP_M1 (XOR_data_M1) + LOAD_data(iv, NUM_WAYS - 1) + WOP (STORE_data) + } + WIDE_LOOP_END + + SINGLE_LOOP + { + const __m128i *w = wStart - 1; + __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); + do { - AES_DEC(1) - AES_DEC(0) + MM_OP_m (_mm_aesdec_si128, w[1]) + MM_OP_m (_mm_aesdec_si128, w[0]) w -= 2; } - while (--numRounds2 != 0); - AES_DEC(1) - AES_DEC_LAST(0) + while (w != p); + MM_OP_m (_mm_aesdec_si128, w[1]) + MM_OP_m (_mm_aesdeclast_si128, w[0]) + MM_XOR (m, iv) + LOAD_data(iv, 0) + STORE_data(m, 0) + } + + p[-2] = iv; +} + + +AES_FUNC_START2 (AesCtr_Code_HW) +{ + __m128i *p = (__m128i *)(void *)ivAes; + __m128i *data = (__m128i *)(void *)data8; + __m128i ctr = *p; + const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; + const __m128i *dataEnd; + const __m128i one = _mm_cvtsi32_si128(1); + p += 2; + + WIDE_LOOP_START + { + const __m128i *w = p; + UInt32 r = numRoundsMinus2; + WOP (DECLARE_VAR) + WOP (CTR_START) + WOP_KEY (AES_XOR, 0) + w += 1; + do { - __m128i t; - t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t; - t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t; - t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t; + WOP_KEY (AES_ENC, 0) + w += 1; } + while (--r); + WOP_KEY (AES_ENC_LAST, 0) + WOP (CTR_END) } - for (; numBlocks != 0; 
numBlocks--, data++) + WIDE_LOOP_END + + SINGLE_LOOP { - UInt32 numRounds2 = *(const UInt32 *)(p + 1); - const __m128i *w = p + numRounds2 * 2; - __m128i m = _mm_xor_si128(w[2], *data); - numRounds2--; + UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1; + const __m128i *w = p; + __m128i m; + MM_OP (_mm_add_epi64, ctr, one) + m = _mm_xor_si128 (ctr, p[0]); + w += 1; do { - m = _mm_aesdec_si128(m, w[1]); - m = _mm_aesdec_si128(m, w[0]); - w -= 2; + MM_OP_m (_mm_aesenc_si128, w[0]) + MM_OP_m (_mm_aesenc_si128, w[1]) + w += 2; } - while (--numRounds2 != 0); - m = _mm_aesdec_si128(m, w[1]); - m = _mm_aesdeclast_si128(m, w[0]); + while (--numRounds2); + MM_OP_m (_mm_aesenc_si128, w[0]) + MM_OP_m (_mm_aesenclast_si128, w[1]) + CTR_END (m, 0) + } + + p[-2] = ctr; +} + + - m = _mm_xor_si128(m, iv); - iv = *data; - *data = m; +#ifdef USE_INTEL_VAES + +/* +GCC before 2013-Jun: + : + #ifdef __AVX__ + #include + #endif +GCC after 2013-Jun: + : + #include +CLANG 3.8+: +{ + : + #if !defined(_MSC_VER) || defined(__AVX__) + #include + #endif + + if (the compiler is clang for Windows and if global arch is not set for __AVX__) + [ if (defined(_MSC_VER) && !defined(__AVX__)) ] + { + doesn't include + and we have 2 ways to fix it: + 1) we can define required __AVX__ before + or + 2) we can include after } - *p = iv; } -void MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks) +If we include manually for GCC/CLANG, it's +required that must be included before . +*/ + +/* +#if defined(__clang__) && defined(_MSC_VER) +#define __AVX__ +#define __AVX2__ +#define __VAES__ +#endif +*/ + +#include +#if defined(__clang__) && defined(_MSC_VER) + #if !defined(__AVX__) + #include + #endif + #if !defined(__AVX2__) + #include + #endif + #if !defined(__VAES__) + #include + #endif +#endif // __clang__ && _MSC_VER + +#ifndef ATTRIB_VAES + #define ATTRIB_VAES +#endif + +#define VAES_FUNC_START2(name) \ +AES_FUNC_START (name); \ +ATTRIB_VAES \ +AES_FUNC_START (name) + +VAES_FUNC_START2 (AesCbc_Decode_HW_256) { - __m128i ctr = *p; - __m128i one; - one.m128i_u64[0] = 1; - one.m128i_u64[1] = 0; - for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS) + __m128i *p = (__m128i *)(void *)ivAes; + __m128i *data = (__m128i *)(void *)data8; + __m128i iv = *p; + const __m128i *dataEnd; + const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; + p += 2; + + WIDE_LOOP_START_AVX(;) { - UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; - const __m128i *w = p; - __m128i m0, m1, m2; + const __m256i *w = keys + numRounds - 2; + + WOP (AVX_DECLARE_VAR) + WOP (AVX_LOAD_data) + AVX_WOP_KEY (AVX_AES_XOR, 1) + + do { - const __m128i t = w[2]; - ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t); - ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t); - ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t); + AVX_WOP_KEY (AVX_AES_DEC, 0) + w--; } - w += 3; + while (w != keys); + AVX_WOP_KEY (AVX_AES_DEC_LAST, 0) + + AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0))) + WOP_M1 (AVX_XOR_data_M1) + LOAD_data (iv, NUM_WAYS * 2 - 1) + WOP (AVX_STORE_data) + } + WIDE_LOOP_END_AVX(;) + + SINGLE_LOOP + { + const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2; + __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); do { - AES_ENC(0) - AES_ENC(1) - w += 2; + MM_OP_m (_mm_aesdec_si128, w[1]) + MM_OP_m (_mm_aesdec_si128, w[0]) + w -= 2; + } + while (w != p); + MM_OP_m (_mm_aesdec_si128, w[1]) + MM_OP_m (_mm_aesdeclast_si128, w[0]) + + MM_XOR (m, iv) + LOAD_data(iv, 0) + STORE_data(m, 0) + } 
+ + p[-2] = iv; +} + + +/* +SSE2: _mm_cvtsi32_si128 : movd +AVX: _mm256_setr_m128i : vinsertf128 +AVX2: _mm256_add_epi64 : vpaddq ymm, ymm, ymm + _mm256_extracti128_si256 : vextracti128 + _mm256_broadcastsi128_si256 : vbroadcasti128 +*/ + +#define AVX_CTR_LOOP_START \ + ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \ + two = _mm256_setr_m128i(one, one); \ + two = _mm256_add_epi64(two, two); \ + +// two = _mm256_setr_epi64x(2, 0, 2, 0); + +#define AVX_CTR_LOOP_ENC \ + ctr = _mm256_extracti128_si256 (ctr2, 1); \ + +VAES_FUNC_START2 (AesCtr_Code_HW_256) +{ + __m128i *p = (__m128i *)(void *)ivAes; + __m128i *data = (__m128i *)(void *)data8; + __m128i ctr = *p; + const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; + const __m128i *dataEnd; + const __m128i one = _mm_cvtsi32_si128(1); + __m256i ctr2, two; + p += 2; + + WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START) + { + const __m256i *w = keys; + UInt32 r = numRounds - 2; + WOP (AVX_DECLARE_VAR) + AVX_WOP_KEY (AVX_CTR_START, 0) + + w += 1; + do + { + AVX_WOP_KEY (AVX_AES_ENC, 0) + w += 1; } - while (--numRounds2 != 0); - AES_ENC(0) - AES_ENC_LAST(1) - data[0] = _mm_xor_si128(data[0], m0); - data[1] = _mm_xor_si128(data[1], m1); - data[2] = _mm_xor_si128(data[2], m2); + while (--r); + AVX_WOP_KEY (AVX_AES_ENC_LAST, 0) + + WOP (AVX_CTR_END) } - for (; numBlocks != 0; numBlocks--, data++) + WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC) + + SINGLE_LOOP { - UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; + UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1; const __m128i *w = p; __m128i m; - ctr = _mm_add_epi64(ctr, one); - m = _mm_xor_si128(ctr, p[2]); - w += 3; + MM_OP (_mm_add_epi64, ctr, one) + m = _mm_xor_si128 (ctr, p[0]); + w += 1; do { - m = _mm_aesenc_si128(m, w[0]); - m = _mm_aesenc_si128(m, w[1]); + MM_OP_m (_mm_aesenc_si128, w[0]) + MM_OP_m (_mm_aesenc_si128, w[1]) w += 2; } - while (--numRounds2 != 0); - m = _mm_aesenc_si128(m, w[0]); - m = _mm_aesenclast_si128(m, w[1]); - *data = _mm_xor_si128(*data, m); + while (--numRounds2); + MM_OP_m (_mm_aesenc_si128, w[0]) + MM_OP_m (_mm_aesenclast_si128, w[1]) + CTR_END (m, 0) } - *p = ctr; + + p[-2] = ctr; } +#endif // USE_INTEL_VAES + +#else // USE_INTEL_AES + +/* no USE_INTEL_AES */ + +#if defined(Z7_USE_AES_HW_STUB) +// We can compile this file with another C compiler, +// or we can compile asm version. +// So we can generate real code instead of this stub function. +// #if defined(_MSC_VER) +#pragma message("AES HW_SW stub was used") +// #endif + +#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB) +#define AES_TYPE_keys UInt32 +#define AES_TYPE_data Byte +#endif + +#define AES_FUNC_START(name) \ + void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \ + +#define AES_COMPAT_STUB(name) \ + AES_FUNC_START(name); \ + AES_FUNC_START(name ## _HW) \ + { name(p, data, numBlocks); } + +AES_COMPAT_STUB (AesCbc_Encode) +AES_COMPAT_STUB (AesCbc_Decode) +AES_COMPAT_STUB (AesCtr_Code) +#endif // Z7_USE_AES_HW_STUB + +#endif // USE_INTEL_AES + + +#ifndef USE_INTEL_VAES +#if defined(Z7_USE_VAES_HW_STUB) +// #if defined(_MSC_VER) +#pragma message("VAES HW_SW stub was used") +// #endif + +#define VAES_COMPAT_STUB(name) \ + void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ + void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \ + { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); } + +VAES_COMPAT_STUB (AesCbc_Decode_HW) +VAES_COMPAT_STUB (AesCtr_Code_HW) +#endif +#endif // ! 
USE_INTEL_VAES + + + + +#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) + + #if defined(__ARM_FEATURE_AES) \ + || defined(__ARM_FEATURE_CRYPTO) + #define USE_HW_AES + #else + #if defined(MY_CPU_ARM64) \ + || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ + || defined(Z7_MSC_VER_ORIGINAL) + #if defined(__ARM_FP) && \ + ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) \ + ) \ + || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) + #if defined(MY_CPU_ARM64) \ + || !defined(Z7_CLANG_VERSION) \ + || defined(__ARM_NEON) && \ + (Z7_CLANG_VERSION < 170000 || \ + Z7_CLANG_VERSION > 170001) + #define USE_HW_AES + #endif + #endif + #endif + #endif + +#ifdef USE_HW_AES + +// #pragma message("=== AES HW === ") +// __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_AES + +#if defined(__clang__) || defined(__GNUC__) +#if !defined(__ARM_FEATURE_AES) && \ + !defined(__ARM_FEATURE_CRYPTO) + #ifdef MY_CPU_ARM64 +#if defined(__clang__) + #define ATTRIB_AES __attribute__((__target__("crypto"))) #else + #define ATTRIB_AES __attribute__((__target__("+crypto"))) +#endif + #else +#if defined(__clang__) + #define ATTRIB_AES __attribute__((__target__("armv8-a,aes"))) +#else + #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) +#endif + #endif +#endif +#else + // _MSC_VER + // for arm32 + #define _ARM_USE_NEW_NEON_INTRINSICS +#endif + +#ifndef ATTRIB_AES + #define ATTRIB_AES +#endif + +#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) +#include +#else +/* + clang-17.0.1: error : Cannot select: intrinsic %llvm.arm.neon.aese + clang + 3.8.1 : __ARM_NEON : defined(__ARM_FEATURE_CRYPTO) + 7.0.1 : __ARM_NEON : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO) + 11.?.0 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO) + 13.0.1 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES) + 16 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 +*/ +#if defined(__clang__) && __clang_major__ < 16 +#if !defined(__ARM_FEATURE_AES) && \ + !defined(__ARM_FEATURE_CRYPTO) +// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ") + Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1 +// #if defined(__clang__) && __clang_major__ < 13 + #define __ARM_FEATURE_CRYPTO 1 +// #else + #define __ARM_FEATURE_AES 1 +// #endif + Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif +#endif // clang + +#if defined(__clang__) + +#if defined(__ARM_ARCH) && __ARM_ARCH < 8 + Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +// #pragma message("#define __ARM_ARCH 8") + #undef __ARM_ARCH + #define __ARM_ARCH 8 + Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif + +#endif // clang + +#include + +#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \ + defined(__ARM_FEATURE_CRYPTO) && \ + defined(__ARM_FEATURE_AES) +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #undef __ARM_FEATURE_CRYPTO + #undef __ARM_FEATURE_AES + #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") +#endif + +#endif // Z7_MSC_VER_ORIGINAL + +typedef uint8x16_t v128; -void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks); +#define AES_FUNC_START(name) \ + void Z7_FASTCALL 
name(UInt32 *ivAes, Byte *data8, size_t numBlocks) + // void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks) -void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks) +#define AES_FUNC_START2(name) \ +AES_FUNC_START (name); \ +ATTRIB_AES \ +AES_FUNC_START (name) + +#define MM_OP(op, dest, src) dest = op(dest, src); +#define MM_OP_m(op, src) MM_OP(op, m, src) +#define MM_OP1_m(op) m = op(m); + +#define MM_XOR( dest, src) MM_OP(veorq_u8, dest, src) +#define MM_XOR_m( src) MM_XOR(m, src) + +#define AES_E_m(k) MM_OP_m (vaeseq_u8, k) +#define AES_E_MC_m(k) AES_E_m (k) MM_OP1_m(vaesmcq_u8) + + +AES_FUNC_START2 (AesCbc_Encode_HW) { - AesCbc_Encode(p, data, numBlocks); + if (numBlocks == 0) + return; + { + v128 * const p = (v128 *)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; + v128 m = *p; + const UInt32 numRounds2 = *(const UInt32 *)(p + 1); + const v128 *w = p + (size_t)numRounds2 * 2; + const v128 k0 = p[2]; + const v128 k1 = p[3]; + const v128 k2 = p[4]; + const v128 k3 = p[5]; + const v128 k4 = p[6]; + const v128 k5 = p[7]; + const v128 k6 = p[8]; + const v128 k7 = p[9]; + const v128 k8 = p[10]; + const v128 k9 = p[11]; + const v128 k_z4 = w[-2]; + const v128 k_z3 = w[-1]; + const v128 k_z2 = w[0]; + const v128 k_z1 = w[1]; + const v128 k_z0 = w[2]; + // we don't use optimization veorq_u8(*data, k_z0) that can reduce one cycle, + // because gcc/clang compilers are not good for that optimization. + do + { + MM_XOR_m (*data) + AES_E_MC_m (k0) + AES_E_MC_m (k1) + AES_E_MC_m (k2) + AES_E_MC_m (k3) + AES_E_MC_m (k4) + AES_E_MC_m (k5) + if (numRounds2 >= 6) + { + AES_E_MC_m (k6) + AES_E_MC_m (k7) + if (numRounds2 != 6) + { + AES_E_MC_m (k8) + AES_E_MC_m (k9) + } + } + AES_E_MC_m (k_z4) + AES_E_MC_m (k_z3) + AES_E_MC_m (k_z2) + AES_E_m (k_z1) + MM_XOR_m (k_z0) + *data++ = m; + } + while (--numBlocks); + *p = m; + } } -void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks) + +#define WOP_1(op) +#define WOP_2(op) WOP_1 (op) op (m1, 1) +#define WOP_3(op) WOP_2 (op) op (m2, 2) +#define WOP_4(op) WOP_3 (op) op (m3, 3) +#define WOP_5(op) WOP_4 (op) op (m4, 4) +#define WOP_6(op) WOP_5 (op) op (m5, 5) +#define WOP_7(op) WOP_6 (op) op (m6, 6) +#define WOP_8(op) WOP_7 (op) op (m7, 7) + + #define NUM_WAYS 8 + #define WOP_M1 WOP_8 + +#define WOP(op) op (m0, 0) WOP_M1(op) + +#define DECLARE_VAR(reg, ii) v128 reg; +#define LOAD_data( reg, ii) reg = data[ii]; +#define STORE_data( reg, ii) data[ii] = reg; +#if (NUM_WAYS > 1) +#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) +#endif + +#define MM_OP_key(op, reg) MM_OP (op, reg, key) + +#define AES_D_m(k) MM_OP_m (vaesdq_u8, k) +#define AES_D_IMC_m(k) AES_D_m (k) MM_OP1_m (vaesimcq_u8) + +#define AES_XOR( reg, ii) MM_OP_key (veorq_u8, reg) +#define AES_D( reg, ii) MM_OP_key (vaesdq_u8, reg) +#define AES_E( reg, ii) MM_OP_key (vaeseq_u8, reg) + +#define AES_D_IMC( reg, ii) AES_D (reg, ii) reg = vaesimcq_u8(reg); +#define AES_E_MC( reg, ii) AES_E (reg, ii) reg = vaesmcq_u8(reg); + +#define CTR_START(reg, ii) MM_OP (vaddq_u64, ctr, one) reg = vreinterpretq_u8_u64(ctr); +#define CTR_END( reg, ii) MM_XOR (data[ii], reg) + +#define WOP_KEY(op, n) { \ + const v128 key = w[n]; \ + WOP(op) } + +#define WIDE_LOOP_START \ + dataEnd = data + numBlocks; \ + if (numBlocks >= NUM_WAYS) \ + { dataEnd -= NUM_WAYS; do { \ + +#define WIDE_LOOP_END \ + data += NUM_WAYS; \ + } while (data <= dataEnd); \ + dataEnd += NUM_WAYS; } \ + +#define SINGLE_LOOP \ + for (; data < dataEnd; data++) + + +AES_FUNC_START2 
(AesCbc_Decode_HW) { - AesCbc_Decode(p, data, numBlocks); + v128 *p = (v128 *)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; + v128 iv = *p; + const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2; + const v128 *dataEnd; + p += 2; + + WIDE_LOOP_START + { + const v128 *w = wStart; + WOP (DECLARE_VAR) + WOP (LOAD_data) + WOP_KEY (AES_D_IMC, 2) + do + { + WOP_KEY (AES_D_IMC, 1) + WOP_KEY (AES_D_IMC, 0) + w -= 2; + } + while (w != p); + WOP_KEY (AES_D, 1) + WOP_KEY (AES_XOR, 0) + MM_XOR (m0, iv) + WOP_M1 (XOR_data_M1) + LOAD_data(iv, NUM_WAYS - 1) + WOP (STORE_data) + } + WIDE_LOOP_END + + SINGLE_LOOP + { + const v128 *w = wStart; + v128 m; LOAD_data(m, 0) + AES_D_IMC_m (w[2]) + do + { + AES_D_IMC_m (w[1]) + AES_D_IMC_m (w[0]) + w -= 2; + } + while (w != p); + AES_D_m (w[1]) + MM_XOR_m (w[0]) + MM_XOR_m (iv) + LOAD_data(iv, 0) + STORE_data(m, 0) + } + + p[-2] = iv; } -void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks) + +AES_FUNC_START2 (AesCtr_Code_HW) { - AesCtr_Code(p, data, numBlocks); + v128 *p = (v128 *)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; + uint64x2_t ctr = vreinterpretq_u64_u8(*p); + const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2; + const v128 *dataEnd; +// the bug in clang: +// __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); +#if defined(__clang__) && (__clang_major__ <= 9) +#pragma GCC diagnostic ignored "-Wvector-conversion" +#endif + const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0); + p += 2; + + WIDE_LOOP_START + { + const v128 *w = p; + WOP (DECLARE_VAR) + WOP (CTR_START) + do + { + WOP_KEY (AES_E_MC, 0) + WOP_KEY (AES_E_MC, 1) + w += 2; + } + while (w != wEnd); + WOP_KEY (AES_E_MC, 0) + WOP_KEY (AES_E, 1) + WOP_KEY (AES_XOR, 2) + WOP (CTR_END) + } + WIDE_LOOP_END + + SINGLE_LOOP + { + const v128 *w = p; + v128 m; + CTR_START (m, 0) + do + { + AES_E_MC_m (w[0]) + AES_E_MC_m (w[1]) + w += 2; + } + while (w != wEnd); + AES_E_MC_m (w[0]) + AES_E_m (w[1]) + MM_XOR_m (w[2]) + CTR_END (m, 0) + } + + p[-2] = vreinterpretq_u8_u64(ctr); } -#endif +#endif // USE_HW_AES + +#endif // MY_CPU_ARM_OR_ARM64 + +#undef NUM_WAYS +#undef WOP_M1 +#undef WOP +#undef DECLARE_VAR +#undef LOAD_data +#undef STORE_data +#undef USE_INTEL_AES +#undef USE_HW_AES diff --git a/src/sdk/C/Alloc.c b/src/sdk/C/Alloc.c index bcede4b..63e1a12 100644 --- a/src/sdk/C/Alloc.c +++ b/src/sdk/C/Alloc.c @@ -1,38 +1,53 @@ /* Alloc.c -- Memory allocation functions -2018-04-27 : Igor Pavlov : Public domain */ +2024-02-18 : Igor Pavlov : Public domain */ #include "Precomp.h" -#include - #ifdef _WIN32 -#include +#include "7zWindows.h" #endif #include #include "Alloc.h" -/* #define _SZ_ALLOC_DEBUG */ +#if defined(Z7_LARGE_PAGES) && defined(_WIN32) && \ + (!defined(Z7_WIN32_WINNT_MIN) || Z7_WIN32_WINNT_MIN < 0x0502) // < Win2003 (xp-64) + #define Z7_USE_DYN_GetLargePageMinimum +#endif -/* use _SZ_ALLOC_DEBUG to debug alloc/free operations */ -#ifdef _SZ_ALLOC_DEBUG +// for debug: +#if 0 +#if defined(__CHERI__) && defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) +// #pragma message("=== Z7_ALLOC_NO_OFFSET_ALLOCATOR === ") +#define Z7_ALLOC_NO_OFFSET_ALLOCATOR +#endif +#endif +// #define SZ_ALLOC_DEBUG +/* #define SZ_ALLOC_DEBUG */ + +/* use SZ_ALLOC_DEBUG to debug alloc/free operations */ +#ifdef SZ_ALLOC_DEBUG + +#include #include -int g_allocCount = 0; -int g_allocCountMid = 0; -int g_allocCountBig = 0; +static int g_allocCount = 0; +#ifdef _WIN32 +static int g_allocCountMid = 0; +static int g_allocCountBig = 0; 
+#endif #define CONVERT_INT_TO_STR(charType, tempSize) \ - unsigned char temp[tempSize]; unsigned i = 0; \ - while (val >= 10) { temp[i++] = (unsigned char)('0' + (unsigned)(val % 10)); val /= 10; } \ + char temp[tempSize]; unsigned i = 0; \ + while (val >= 10) { temp[i++] = (char)('0' + (unsigned)(val % 10)); val /= 10; } \ *s++ = (charType)('0' + (unsigned)val); \ while (i != 0) { i--; *s++ = temp[i]; } \ *s = 0; static void ConvertUInt64ToString(UInt64 val, char *s) { - CONVERT_INT_TO_STR(char, 24); + CONVERT_INT_TO_STR(char, 24) } #define GET_HEX_CHAR(t) ((char)(((t < 10) ? ('0' + t) : ('A' + (t - 10))))) @@ -77,7 +92,7 @@ static void PrintAligned(const char *s, size_t align) Print(s); } -static void PrintLn() +static void PrintLn(void) { Print("\n"); } @@ -89,10 +104,10 @@ static void PrintHex(UInt64 v, size_t align) PrintAligned(s, align); } -static void PrintDec(UInt64 v, size_t align) +static void PrintDec(int v, size_t align) { char s[32]; - ConvertUInt64ToString(v, s); + ConvertUInt64ToString((unsigned)v, s); PrintAligned(s, align); } @@ -102,12 +117,19 @@ static void PrintAddr(void *p) } -#define PRINT_ALLOC(name, cnt, size, ptr) \ +#define PRINT_REALLOC(name, cnt, size, ptr) { \ + Print(name " "); \ + if (!ptr) PrintDec(cnt++, 10); \ + PrintHex(size, 10); \ + PrintAddr(ptr); \ + PrintLn(); } + +#define PRINT_ALLOC(name, cnt, size, ptr) { \ Print(name " "); \ PrintDec(cnt++, 10); \ PrintHex(size, 10); \ PrintAddr(ptr); \ - PrintLn(); + PrintLn(); } #define PRINT_FREE(name, cnt, ptr) if (ptr) { \ Print(name " "); \ @@ -117,26 +139,45 @@ static void PrintAddr(void *p) #else +#ifdef _WIN32 #define PRINT_ALLOC(name, cnt, size, ptr) +#endif #define PRINT_FREE(name, cnt, ptr) #define Print(s) #define PrintLn() +#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR #define PrintHex(v, align) -#define PrintDec(v, align) +#endif #define PrintAddr(p) #endif +/* +by specification: + malloc(non_NULL, 0) : returns NULL or a unique pointer value that can later be successfully passed to free() + realloc(NULL, size) : the call is equivalent to malloc(size) + realloc(non_NULL, 0) : the call is equivalent to free(ptr) + +in main compilers: + malloc(0) : returns non_NULL + realloc(NULL, 0) : returns non_NULL + realloc(non_NULL, 0) : returns NULL +*/ + void *MyAlloc(size_t size) { if (size == 0) return NULL; - #ifdef _SZ_ALLOC_DEBUG + // PRINT_ALLOC("Alloc ", g_allocCount, size, NULL) + #ifdef SZ_ALLOC_DEBUG { void *p = malloc(size); - PRINT_ALLOC("Alloc ", g_allocCount, size, p); + if (p) + { + PRINT_ALLOC("Alloc ", g_allocCount, size, p) + } return p; } #else @@ -146,65 +187,107 @@ void *MyAlloc(size_t size) void MyFree(void *address) { - PRINT_FREE("Free ", g_allocCount, address); + PRINT_FREE("Free ", g_allocCount, address) free(address); } +void *MyRealloc(void *address, size_t size) +{ + if (size == 0) + { + MyFree(address); + return NULL; + } + // PRINT_REALLOC("Realloc ", g_allocCount, size, address) + #ifdef SZ_ALLOC_DEBUG + { + void *p = realloc(address, size); + if (p) + { + PRINT_REALLOC("Realloc ", g_allocCount, size, address) + } + return p; + } + #else + return realloc(address, size); + #endif +} + + #ifdef _WIN32 void *MidAlloc(size_t size) { if (size == 0) return NULL; - - PRINT_ALLOC("Alloc-Mid", g_allocCountMid, size, NULL); - + #ifdef SZ_ALLOC_DEBUG + { + void *p = VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE); + if (p) + { + PRINT_ALLOC("Alloc-Mid", g_allocCountMid, size, p) + } + return p; + } + #else return VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE); + #endif } void 
MidFree(void *address) { - PRINT_FREE("Free-Mid", g_allocCountMid, address); + PRINT_FREE("Free-Mid", g_allocCountMid, address) if (!address) return; VirtualFree(address, 0, MEM_RELEASE); } -#ifndef MEM_LARGE_PAGES -#undef _7ZIP_LARGE_PAGES +#ifdef Z7_LARGE_PAGES + +#ifdef MEM_LARGE_PAGES + #define MY_MEM_LARGE_PAGES MEM_LARGE_PAGES +#else + #define MY_MEM_LARGE_PAGES 0x20000000 #endif -#ifdef _7ZIP_LARGE_PAGES +extern +SIZE_T g_LargePageSize; SIZE_T g_LargePageSize = 0; -typedef SIZE_T (WINAPI *GetLargePageMinimumP)(); -#endif +typedef SIZE_T (WINAPI *Func_GetLargePageMinimum)(VOID); -void SetLargePageSize() +void SetLargePageSize(void) { - #ifdef _7ZIP_LARGE_PAGES SIZE_T size; - GetLargePageMinimumP largePageMinimum = (GetLargePageMinimumP) - GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "GetLargePageMinimum"); - if (!largePageMinimum) +#ifdef Z7_USE_DYN_GetLargePageMinimum +Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION + + const + Func_GetLargePageMinimum fn = + (Func_GetLargePageMinimum) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "GetLargePageMinimum"); + if (!fn) return; - size = largePageMinimum(); + size = fn(); +#else + size = GetLargePageMinimum(); +#endif if (size == 0 || (size & (size - 1)) != 0) return; g_LargePageSize = size; - #endif } +#endif // Z7_LARGE_PAGES void *BigAlloc(size_t size) { if (size == 0) return NULL; - PRINT_ALLOC("Alloc-Big", g_allocCountBig, size, NULL); - - #ifdef _7ZIP_LARGE_PAGES + PRINT_ALLOC("Alloc-Big", g_allocCountBig, size, NULL) + + #ifdef Z7_LARGE_PAGES { SIZE_T ps = g_LargePageSize; if (ps != 0 && ps <= (1 << 30) && size > (ps / 2)) @@ -214,56 +297,43 @@ void *BigAlloc(size_t size) size2 = (size + ps) & ~ps; if (size2 >= size) { - void *res = VirtualAlloc(NULL, size2, MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE); - if (res) - return res; + void *p = VirtualAlloc(NULL, size2, MEM_COMMIT | MY_MEM_LARGE_PAGES, PAGE_READWRITE); + if (p) + { + PRINT_ALLOC("Alloc-BM ", g_allocCountMid, size2, p) + return p; + } } } } #endif - return VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE); + return MidAlloc(size); } void BigFree(void *address) { - PRINT_FREE("Free-Big", g_allocCountBig, address); - - if (!address) - return; - VirtualFree(address, 0, MEM_RELEASE); + PRINT_FREE("Free-Big", g_allocCountBig, address) + MidFree(address); } -#endif +#endif // _WIN32 -static void *SzAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return MyAlloc(size); } -static void SzFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); MyFree(address); } +static void *SzAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p) return MyAlloc(size); } +static void SzFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p) MyFree(address); } const ISzAlloc g_Alloc = { SzAlloc, SzFree }; -static void *SzMidAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return MidAlloc(size); } -static void SzMidFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); MidFree(address); } +#ifdef _WIN32 +static void *SzMidAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p) return MidAlloc(size); } +static void SzMidFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p) MidFree(address); } +static void *SzBigAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p) return BigAlloc(size); } +static void SzBigFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p) BigFree(address); } const ISzAlloc g_MidAlloc = { SzMidAlloc, SzMidFree }; - -static void *SzBigAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return BigAlloc(size); } -static void SzBigFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); 
BigFree(address); } const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; - - -/* - uintptr_t : C99 (optional) - : unsupported in VS6 -*/ - -#ifdef _WIN32 - typedef UINT_PTR UIntPtr; -#else - /* - typedef uintptr_t UIntPtr; - */ - typedef ptrdiff_t UIntPtr; #endif +#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR #define ADJUST_ALLOC_SIZE 0 /* @@ -274,19 +344,43 @@ const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; MyAlloc() can return address that is NOT multiple of sizeof(void *). */ - /* -#define MY_ALIGN_PTR_DOWN(p, align) ((void *)((char *)(p) - ((size_t)(UIntPtr)(p) & ((align) - 1)))) + uintptr_t : C99 (optional) + : unsupported in VS6 */ -#define MY_ALIGN_PTR_DOWN(p, align) ((void *)((((UIntPtr)(p)) & ~((UIntPtr)(align) - 1)))) - -#define MY_ALIGN_PTR_UP_PLUS(p, align) MY_ALIGN_PTR_DOWN(((char *)(p) + (align) + ADJUST_ALLOC_SIZE), align) +typedef + #ifdef _WIN32 + UINT_PTR + #elif 1 + uintptr_t + #else + ptrdiff_t + #endif + MY_uintptr_t; + +#if 0 \ + || (defined(__CHERI__) \ + || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ > 8)) +// for 128-bit pointers (cheri): +#define MY_ALIGN_PTR_DOWN(p, align) \ + ((void *)((char *)(p) - ((size_t)(MY_uintptr_t)(p) & ((align) - 1)))) +#else +#define MY_ALIGN_PTR_DOWN(p, align) \ + ((void *)((((MY_uintptr_t)(p)) & ~((MY_uintptr_t)(align) - 1)))) +#endif +#endif -#if (_POSIX_C_SOURCE >= 200112L) && !defined(_WIN32) +#if !defined(_WIN32) \ + && (defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) \ + || defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L)) #define USE_posix_memalign #endif +#ifndef USE_posix_memalign +#define MY_ALIGN_PTR_UP_PLUS(p, align) MY_ALIGN_PTR_DOWN(((char *)(p) + (align) + ADJUST_ALLOC_SIZE), align) +#endif + /* This posix_memalign() is for test purposes only. We also need special Free() function instead of free(), @@ -319,14 +413,13 @@ static int posix_memalign(void **ptr, size_t align, size_t size) #define ALLOC_ALIGN_SIZE ((size_t)1 << 7) -static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) +void *z7_AlignedAlloc(size_t size) { - #ifndef USE_posix_memalign +#ifndef USE_posix_memalign void *p; void *pAligned; size_t newSize; - UNUSED_VAR(pp); /* also we can allocate additional dummy ALLOC_ALIGN_SIZE bytes after aligned block to prevent cache line sharing with another allocated blocks */ @@ -351,10 +444,9 @@ static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) return pAligned; - #else +#else void *p; - UNUSED_VAR(pp); if (posix_memalign(&p, ALLOC_ALIGN_SIZE, size)) return NULL; @@ -363,19 +455,37 @@ static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) return p; - #endif +#endif +} + + +void z7_AlignedFree(void *address) +{ +#ifndef USE_posix_memalign + if (address) + MyFree(((void **)address)[-1]); +#else + free(address); +#endif +} + + +static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) +{ + UNUSED_VAR(pp) + return z7_AlignedAlloc(size); } static void SzAlignedFree(ISzAllocPtr pp, void *address) { - UNUSED_VAR(pp); - #ifndef USE_posix_memalign + UNUSED_VAR(pp) +#ifndef USE_posix_memalign if (address) MyFree(((void **)address)[-1]); - #else +#else free(address); - #endif +#endif } @@ -383,17 +493,45 @@ const ISzAlloc g_AlignedAlloc = { SzAlignedAlloc, SzAlignedFree }; -#define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *)) - /* we align ptr to support cases where CAlignOffsetAlloc::offset is not multiply of sizeof(void *) */ -#define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1] -/* -#define REAL_BLOCK_PTR_VAR(p) ((void **)(p))[-1] -*/ +#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR +#if 1 + 
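
The aligned allocator above (z7_AlignedAlloc / z7_AlignedFree on the path without posix_memalign) over-allocates, rounds the pointer up, and stores the original malloc() result just below the aligned block so the free path can recover it. A self-contained sketch of that scheme, using a fixed 128-byte alignment to match ALLOC_ALIGN_SIZE; the names below are illustrative, not the SDK's:

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define EXAMPLE_ALIGN 128u

    static void *example_aligned_alloc(size_t size)
    {
        /* room for the alignment slack plus one stored base pointer */
        void *base = malloc(size + EXAMPLE_ALIGN + sizeof(void *));
        uintptr_t raw;
        void *aligned;
        if (!base)
            return NULL;
        raw = (uintptr_t)base + sizeof(void *);
        aligned = (void *)((raw + EXAMPLE_ALIGN - 1) & ~(uintptr_t)(EXAMPLE_ALIGN - 1));
        ((void **)aligned)[-1] = base;  /* remember where the block really starts */
        return aligned;
    }

    static void example_aligned_free(void *aligned)
    {
        if (aligned)
            free(((void **)aligned)[-1]);
    }

    int main(void)
    {
        void *p = example_aligned_alloc(1000);
        assert(((uintptr_t)p & (EXAMPLE_ALIGN - 1)) == 0);
        example_aligned_free(p);
        return 0;
    }
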
#define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *)) + #define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1] +#else + // we can use this simplified code, + // if (CAlignOffsetAlloc::offset == (k * sizeof(void *)) + #define REAL_BLOCK_PTR_VAR(p) (((void **)(p))[-1]) +#endif +#endif + + +#if 0 +#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR +#include +static void PrintPtr(const char *s, const void *p) +{ + const Byte *p2 = (const Byte *)&p; + unsigned i; + printf("%s %p ", s, p); + for (i = sizeof(p); i != 0;) + { + i--; + printf("%02x", p2[i]); + } + printf("\n"); +} +#endif +#endif + static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) { - CAlignOffsetAlloc *p = CONTAINER_FROM_VTBL(pp, CAlignOffsetAlloc, vt); +#if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) + UNUSED_VAR(pp) + return z7_AlignedAlloc(size); +#else + const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); void *adr; void *pAligned; size_t newSize; @@ -421,6 +559,12 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) pAligned = (char *)MY_ALIGN_PTR_DOWN((char *)adr + alignSize - p->offset + extra + ADJUST_ALLOC_SIZE, alignSize) + p->offset; +#if 0 + printf("\nalignSize = %6x, offset=%6x, size=%8x \n", (unsigned)alignSize, (unsigned)p->offset, (unsigned)size); + PrintPtr("base", adr); + PrintPtr("alig", pAligned); +#endif + PrintLn(); Print("- Aligned: "); Print(" size="); PrintHex(size, 8); @@ -432,19 +576,25 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) REAL_BLOCK_PTR_VAR(pAligned) = adr; return pAligned; +#endif } static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address) { +#if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) + UNUSED_VAR(pp) + z7_AlignedFree(address); +#else if (address) { - CAlignOffsetAlloc *p = CONTAINER_FROM_VTBL(pp, CAlignOffsetAlloc, vt); + const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); PrintLn(); Print("- Aligned Free: "); PrintLn(); ISzAlloc_Free(p->baseAlloc, REAL_BLOCK_PTR_VAR(address)); } +#endif } diff --git a/src/sdk/C/Alloc.h b/src/sdk/C/Alloc.h index 6482376..01bf6b7 100644 --- a/src/sdk/C/Alloc.h +++ b/src/sdk/C/Alloc.h @@ -1,37 +1,62 @@ /* Alloc.h -- Memory allocation functions -2018-02-19 : Igor Pavlov : Public domain */ +2024-01-22 : Igor Pavlov : Public domain */ -#ifndef __COMMON_ALLOC_H -#define __COMMON_ALLOC_H +#ifndef ZIP7_INC_ALLOC_H +#define ZIP7_INC_ALLOC_H #include "7zTypes.h" EXTERN_C_BEGIN +/* + MyFree(NULL) : is allowed, as free(NULL) + MyAlloc(0) : returns NULL : but malloc(0) is allowed to return NULL or non_NULL + MyRealloc(NULL, 0) : returns NULL : but realloc(NULL, 0) is allowed to return NULL or non_NULL +MyRealloc() is similar to realloc() for the following cases: + MyRealloc(non_NULL, 0) : returns NULL and always calls MyFree(ptr) + MyRealloc(NULL, non_ZERO) : returns NULL, if allocation failed + MyRealloc(non_NULL, non_ZERO) : returns NULL, if reallocation failed +*/ + void *MyAlloc(size_t size); void MyFree(void *address); +void *MyRealloc(void *address, size_t size); + +void *z7_AlignedAlloc(size_t size); +void z7_AlignedFree(void *p); #ifdef _WIN32 -void SetLargePageSize(); +#ifdef Z7_LARGE_PAGES +void SetLargePageSize(void); +#endif void *MidAlloc(size_t size); void MidFree(void *address); void *BigAlloc(size_t size); void BigFree(void *address); +/* #define Z7_BIG_ALLOC_IS_ZERO_FILLED */ + #else -#define MidAlloc(size) MyAlloc(size) -#define MidFree(address) MyFree(address) -#define BigAlloc(size) MyAlloc(size) -#define BigFree(address) 
MyFree(address) +#define MidAlloc(size) z7_AlignedAlloc(size) +#define MidFree(address) z7_AlignedFree(address) +#define BigAlloc(size) z7_AlignedAlloc(size) +#define BigFree(address) z7_AlignedFree(address) #endif extern const ISzAlloc g_Alloc; + +#ifdef _WIN32 extern const ISzAlloc g_BigAlloc; extern const ISzAlloc g_MidAlloc; +#else +#define g_BigAlloc g_AlignedAlloc +#define g_MidAlloc g_AlignedAlloc +#endif + extern const ISzAlloc g_AlignedAlloc; diff --git a/src/sdk/C/Bcj2.c b/src/sdk/C/Bcj2.c index 9a0046a..7cb57ad 100644 --- a/src/sdk/C/Bcj2.c +++ b/src/sdk/C/Bcj2.c @@ -1,29 +1,24 @@ /* Bcj2.c -- BCJ2 Decoder (Converter for x86 code) -2018-04-28 : Igor Pavlov : Public domain */ +2023-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" #include "Bcj2.h" #include "CpuArch.h" -#define CProb UInt16 - #define kTopValue ((UInt32)1 << 24) -#define kNumModelBits 11 -#define kBitModelTotal (1 << kNumModelBits) +#define kNumBitModelTotalBits 11 +#define kBitModelTotal (1 << kNumBitModelTotalBits) #define kNumMoveBits 5 -#define _IF_BIT_0 ttt = *prob; bound = (p->range >> kNumModelBits) * ttt; if (p->code < bound) -#define _UPDATE_0 p->range = bound; *prob = (CProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); -#define _UPDATE_1 p->range -= bound; p->code -= bound; *prob = (CProb)(ttt - (ttt >> kNumMoveBits)); +// UInt32 bcj2_stats[256 + 2][2]; void Bcj2Dec_Init(CBcj2Dec *p) { unsigned i; - - p->state = BCJ2_DEC_STATE_OK; + p->state = BCJ2_STREAM_RC; // BCJ2_DEC_STATE_OK; p->ip = 0; - p->temp[3] = 0; + p->temp = 0; p->range = 0; p->code = 0; for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++) @@ -32,217 +27,248 @@ void Bcj2Dec_Init(CBcj2Dec *p) SRes Bcj2Dec_Decode(CBcj2Dec *p) { + UInt32 v = p->temp; + // const Byte *src; if (p->range <= 5) { - p->state = BCJ2_DEC_STATE_OK; + UInt32 code = p->code; + p->state = BCJ2_DEC_STATE_ERROR; /* for case if we return SZ_ERROR_DATA; */ for (; p->range != 5; p->range++) { - if (p->range == 1 && p->code != 0) + if (p->range == 1 && code != 0) return SZ_ERROR_DATA; - if (p->bufs[BCJ2_STREAM_RC] == p->lims[BCJ2_STREAM_RC]) { p->state = BCJ2_STREAM_RC; return SZ_OK; } - - p->code = (p->code << 8) | *(p->bufs[BCJ2_STREAM_RC])++; + code = (code << 8) | *(p->bufs[BCJ2_STREAM_RC])++; + p->code = code; } - - if (p->code == 0xFFFFFFFF) + if (code == 0xffffffff) return SZ_ERROR_DATA; - - p->range = 0xFFFFFFFF; + p->range = 0xffffffff; } - else if (p->state >= BCJ2_DEC_STATE_ORIG_0) + // else { - while (p->state <= BCJ2_DEC_STATE_ORIG_3) + unsigned state = p->state; + // we check BCJ2_IS_32BIT_STREAM() here instead of check in the main loop + if (BCJ2_IS_32BIT_STREAM(state)) { - Byte *dest = p->dest; - if (dest == p->destLim) + const Byte *cur = p->bufs[state]; + if (cur == p->lims[state]) return SZ_OK; - *dest = p->temp[(size_t)p->state - BCJ2_DEC_STATE_ORIG_0]; - p->state++; - p->dest = dest + 1; + p->bufs[state] = cur + 4; + { + const UInt32 ip = p->ip + 4; + v = GetBe32a(cur) - ip; + p->ip = ip; + } + state = BCJ2_DEC_STATE_ORIG_0; } - } - - /* - if (BCJ2_IS_32BIT_STREAM(p->state)) - { - const Byte *cur = p->bufs[p->state]; - if (cur == p->lims[p->state]) - return SZ_OK; - p->bufs[p->state] = cur + 4; - + if ((unsigned)(state - BCJ2_DEC_STATE_ORIG_0) < 4) { - UInt32 val; - Byte *dest; - SizeT rem; - - p->ip += 4; - val = GetBe32(cur) - p->ip; - dest = p->dest; - rem = p->destLim - dest; - if (rem < 4) + Byte *dest = p->dest; + for (;;) { - SizeT i; - SetUi32(p->temp, val); - for (i = 0; i < rem; i++) - dest[i] = p->temp[i]; - p->dest = dest 
+ rem; - p->state = BCJ2_DEC_STATE_ORIG_0 + (unsigned)rem; - return SZ_OK; + if (dest == p->destLim) + { + p->state = state; + p->temp = v; + return SZ_OK; + } + *dest++ = (Byte)v; + p->dest = dest; + if (++state == BCJ2_DEC_STATE_ORIG_3 + 1) + break; + v >>= 8; } - SetUi32(dest, val); - p->temp[3] = (Byte)(val >> 24); - p->dest = dest + 4; - p->state = BCJ2_DEC_STATE_OK; } } - */ + // src = p->bufs[BCJ2_STREAM_MAIN]; for (;;) { + /* if (BCJ2_IS_32BIT_STREAM(p->state)) p->state = BCJ2_DEC_STATE_OK; else + */ { if (p->range < kTopValue) { if (p->bufs[BCJ2_STREAM_RC] == p->lims[BCJ2_STREAM_RC]) { p->state = BCJ2_STREAM_RC; + p->temp = v; return SZ_OK; } p->range <<= 8; p->code = (p->code << 8) | *(p->bufs[BCJ2_STREAM_RC])++; } - { const Byte *src = p->bufs[BCJ2_STREAM_MAIN]; const Byte *srcLim; - Byte *dest; - SizeT num = p->lims[BCJ2_STREAM_MAIN] - src; - - if (num == 0) + Byte *dest = p->dest; { - p->state = BCJ2_STREAM_MAIN; - return SZ_OK; + const SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - src); + SizeT num = (SizeT)(p->destLim - dest); + if (num >= rem) + num = rem; + #define NUM_ITERS 4 + #if (NUM_ITERS & (NUM_ITERS - 1)) == 0 + num &= ~((SizeT)NUM_ITERS - 1); // if (NUM_ITERS == (1 << x)) + #else + num -= num % NUM_ITERS; // if (NUM_ITERS != (1 << x)) + #endif + srcLim = src + num; } - - dest = p->dest; - if (num > (SizeT)(p->destLim - dest)) + + #define NUM_SHIFT_BITS 24 + #define ONE_ITER(indx) { \ + const unsigned b = src[indx]; \ + *dest++ = (Byte)b; \ + v = (v << NUM_SHIFT_BITS) | b; \ + if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \ + if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \ + ((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \ + /* ++dest */; /* v = b; */ } + + if (src != srcLim) + for (;;) { - num = p->destLim - dest; - if (num == 0) - { - p->state = BCJ2_DEC_STATE_ORIG; - return SZ_OK; - } + /* The dependency chain of 2-cycle for (v) calculation is not big problem here. + But we can remove dependency chain with v = b in the end of loop. */ + ONE_ITER(0) + #if (NUM_ITERS > 1) + ONE_ITER(1) + #if (NUM_ITERS > 2) + ONE_ITER(2) + #if (NUM_ITERS > 3) + ONE_ITER(3) + #if (NUM_ITERS > 4) + ONE_ITER(4) + #if (NUM_ITERS > 5) + ONE_ITER(5) + #if (NUM_ITERS > 6) + ONE_ITER(6) + #if (NUM_ITERS > 7) + ONE_ITER(7) + #endif + #endif + #endif + #endif + #endif + #endif + #endif + + src += NUM_ITERS; + if (src == srcLim) + break; } - - srcLim = src + num; - if (p->temp[3] == 0x0F && (src[0] & 0xF0) == 0x80) - *dest = src[0]; - else for (;;) + if (src == srcLim) + #if (NUM_ITERS > 1) + for (;;) + #endif { - Byte b = *src; - *dest = b; - if (b != 0x0F) + #if (NUM_ITERS > 1) + if (src == p->lims[BCJ2_STREAM_MAIN] || dest == p->destLim) + #endif { - if ((b & 0xFE) == 0xE8) - break; - dest++; - if (++src != srcLim) - continue; - break; + const SizeT num = (SizeT)(src - p->bufs[BCJ2_STREAM_MAIN]); + p->bufs[BCJ2_STREAM_MAIN] = src; + p->dest = dest; + p->ip += (UInt32)num; + /* state BCJ2_STREAM_MAIN has more priority than BCJ2_STATE_ORIG */ + p->state = + src == p->lims[BCJ2_STREAM_MAIN] ? 
+ (unsigned)BCJ2_STREAM_MAIN : + (unsigned)BCJ2_DEC_STATE_ORIG; + p->temp = v; + return SZ_OK; } - dest++; - if (++src == srcLim) - break; - if ((*src & 0xF0) != 0x80) - continue; - *dest = *src; - break; + #if (NUM_ITERS > 1) + ONE_ITER(0) + src++; + #endif } - - num = src - p->bufs[BCJ2_STREAM_MAIN]; - - if (src == srcLim) + { - p->temp[3] = src[-1]; - p->bufs[BCJ2_STREAM_MAIN] = src; + const SizeT num = (SizeT)(dest - p->dest); + p->dest = dest; // p->dest += num; + p->bufs[BCJ2_STREAM_MAIN] += num; // = src; p->ip += (UInt32)num; - p->dest += num; - p->state = - p->bufs[BCJ2_STREAM_MAIN] == - p->lims[BCJ2_STREAM_MAIN] ? - (unsigned)BCJ2_STREAM_MAIN : - (unsigned)BCJ2_DEC_STATE_ORIG; - return SZ_OK; } - { UInt32 bound, ttt; - CProb *prob; - Byte b = src[0]; - Byte prev = (Byte)(num == 0 ? p->temp[3] : src[-1]); - - p->temp[3] = b; - p->bufs[BCJ2_STREAM_MAIN] = src + 1; - num++; - p->ip += (UInt32)num; - p->dest += num; - - prob = p->probs + (unsigned)(b == 0xE8 ? 2 + (unsigned)prev : (b == 0xE9 ? 1 : 0)); - - _IF_BIT_0 + CBcj2Prob *prob; // unsigned index; + /* + prob = p->probs + (unsigned)((Byte)v == 0xe8 ? + 2 + (Byte)(v >> 8) : + ((v >> 5) & 1)); // ((Byte)v < 0xe8 ? 0 : 1)); + */ { - _UPDATE_0 + const unsigned c = ((v + 0x17) >> 6) & 1; + prob = p->probs + (unsigned) + (((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1)); + // (Byte) + // 8x->0 : e9->1 : xxe8->xx+2 + // 8x->0x100 : e9->0x101 : xxe8->xx + // (((0x100 - (e & ~v)) & (0x100 | (v >> 8))) + (e & v)); + // (((0x101 + (~e | v)) & (0x100 | (v >> 8))) + (e & v)); + } + ttt = *prob; + bound = (p->range >> kNumBitModelTotalBits) * ttt; + if (p->code < bound) + { + // bcj2_stats[prob - p->probs][0]++; + p->range = bound; + *prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); continue; } - _UPDATE_1 - + { + // bcj2_stats[prob - p->probs][1]++; + p->range -= bound; + p->code -= bound; + *prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits)); + } } } } - { - UInt32 val; - unsigned cj = (p->temp[3] == 0xE8) ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP; + /* (v == 0xe8 ? 0 : 1) uses setcc instruction with additional zero register usage in x64 MSVC. */ + // const unsigned cj = ((Byte)v == 0xe8) ? 
BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP; + const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL; const Byte *cur = p->bufs[cj]; Byte *dest; SizeT rem; - if (cur == p->lims[cj]) { p->state = cj; break; } - - val = GetBe32(cur); + v = GetBe32a(cur); p->bufs[cj] = cur + 4; - - p->ip += 4; - val -= p->ip; + { + const UInt32 ip = p->ip + 4; + v -= ip; + p->ip = ip; + } dest = p->dest; - rem = p->destLim - dest; - + rem = (SizeT)(p->destLim - dest); if (rem < 4) { - p->temp[0] = (Byte)val; if (rem > 0) dest[0] = (Byte)val; val >>= 8; - p->temp[1] = (Byte)val; if (rem > 1) dest[1] = (Byte)val; val >>= 8; - p->temp[2] = (Byte)val; if (rem > 2) dest[2] = (Byte)val; val >>= 8; - p->temp[3] = (Byte)val; + if ((unsigned)rem > 0) { dest[0] = (Byte)v; v >>= 8; + if ((unsigned)rem > 1) { dest[1] = (Byte)v; v >>= 8; + if ((unsigned)rem > 2) { dest[2] = (Byte)v; v >>= 8; }}} + p->temp = v; p->dest = dest + rem; p->state = BCJ2_DEC_STATE_ORIG_0 + (unsigned)rem; break; } - - SetUi32(dest, val); - p->temp[3] = (Byte)(val >> 24); + SetUi32(dest, v) + v >>= 24; p->dest = dest + 4; } } @@ -252,6 +278,13 @@ SRes Bcj2Dec_Decode(CBcj2Dec *p) p->range <<= 8; p->code = (p->code << 8) | *(p->bufs[BCJ2_STREAM_RC])++; } - return SZ_OK; } + +#undef NUM_ITERS +#undef ONE_ITER +#undef NUM_SHIFT_BITS +#undef kTopValue +#undef kNumBitModelTotalBits +#undef kBitModelTotal +#undef kNumMoveBits diff --git a/src/sdk/C/Bcj2.h b/src/sdk/C/Bcj2.h index 8824080..4575545 100644 --- a/src/sdk/C/Bcj2.h +++ b/src/sdk/C/Bcj2.h @@ -1,8 +1,8 @@ -/* Bcj2.h -- BCJ2 Converter for x86 code -2014-11-10 : Igor Pavlov : Public domain */ +/* Bcj2.h -- BCJ2 converter for x86 code (Branch CALL/JUMP variant2) +2023-03-02 : Igor Pavlov : Public domain */ -#ifndef __BCJ2_H -#define __BCJ2_H +#ifndef ZIP7_INC_BCJ2_H +#define ZIP7_INC_BCJ2_H #include "7zTypes.h" @@ -26,37 +26,68 @@ enum BCJ2_DEC_STATE_ORIG_3, BCJ2_DEC_STATE_ORIG, - BCJ2_DEC_STATE_OK + BCJ2_DEC_STATE_ERROR /* after detected data error */ }; enum { BCJ2_ENC_STATE_ORIG = BCJ2_NUM_STREAMS, - BCJ2_ENC_STATE_OK + BCJ2_ENC_STATE_FINISHED /* it's state after fully encoded stream */ }; -#define BCJ2_IS_32BIT_STREAM(s) ((s) == BCJ2_STREAM_CALL || (s) == BCJ2_STREAM_JUMP) +/* #define BCJ2_IS_32BIT_STREAM(s) ((s) == BCJ2_STREAM_CALL || (s) == BCJ2_STREAM_JUMP) */ +#define BCJ2_IS_32BIT_STREAM(s) ((unsigned)((unsigned)(s) - (unsigned)BCJ2_STREAM_CALL) < 2) /* CBcj2Dec / CBcj2Enc bufs sizes: BUF_SIZE(n) = lims[n] - bufs[n] -bufs sizes for BCJ2_STREAM_CALL and BCJ2_STREAM_JUMP must be mutliply of 4: +bufs sizes for BCJ2_STREAM_CALL and BCJ2_STREAM_JUMP must be multiply of 4: (BUF_SIZE(BCJ2_STREAM_CALL) & 3) == 0 (BUF_SIZE(BCJ2_STREAM_JUMP) & 3) == 0 */ +// typedef UInt32 CBcj2Prob; +typedef UInt16 CBcj2Prob; + +/* +BCJ2 encoder / decoder internal requirements: + - If last bytes of stream contain marker (e8/e8/0f8x), then + there is also encoded symbol (0 : no conversion) in RC stream. + - One case of overlapped instructions is supported, + if last byte of converted instruction is (0f) and next byte is (8x): + marker [xx xx xx 0f] 8x + then the pair (0f 8x) is treated as marker. 
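
The markers referred to here are the x86 branch opcodes that BCJ2 rewrites: e8 (CALL rel32), e9 (JMP rel32), and the two-byte 0f 8x (Jcc rel32) forms. A standalone restatement of the detection test, to make the byte-level rule concrete (the e8/e9 trick is the same one used in the decoder loop; the function name is illustrative):

    #include <stdio.h>

    // does the byte pair (prev, b) end a convertible branch opcode?
    static int is_branch_marker(unsigned prev, unsigned b)
    {
        if (((b + (0x100 - 0xe8)) & 0xfe) == 0)     // b == 0xe8 or b == 0xe9
            return 1;
        return prev == 0x0f && (b & 0xf0) == 0x80;  // 0f 8x : two-byte Jcc
    }

    int main(void)
    {
        printf("%d %d %d %d\n",
            is_branch_marker(0x00, 0xe8),   // 1
            is_branch_marker(0x00, 0xe9),   // 1
            is_branch_marker(0x0f, 0x85),   // 1 (jnz rel32)
            is_branch_marker(0x00, 0x90));  // 0
        return 0;
    }
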
+*/ + +/* ---------- BCJ2 Decoder ---------- */ + /* CBcj2Dec: -dest is allowed to overlap with bufs[BCJ2_STREAM_MAIN], with the following conditions: +(dest) is allowed to overlap with bufs[BCJ2_STREAM_MAIN], with the following conditions: bufs[BCJ2_STREAM_MAIN] >= dest && - bufs[BCJ2_STREAM_MAIN] - dest >= tempReserv + + bufs[BCJ2_STREAM_MAIN] - dest >= BUF_SIZE(BCJ2_STREAM_CALL) + BUF_SIZE(BCJ2_STREAM_JUMP) - tempReserv = 0 : for first call of Bcj2Dec_Decode - tempReserv = 4 : for any other calls of Bcj2Dec_Decode - overlap with offset = 1 is not allowed + reserve = bufs[BCJ2_STREAM_MAIN] - dest - + ( BUF_SIZE(BCJ2_STREAM_CALL) + + BUF_SIZE(BCJ2_STREAM_JUMP) ) + and additional conditions: + if (it's first call of Bcj2Dec_Decode() after Bcj2Dec_Init()) + { + (reserve != 1) : if (ver < v23.00) + } + else // if there are more than one calls of Bcj2Dec_Decode() after Bcj2Dec_Init()) + { + (reserve >= 6) : if (ver < v23.00) + (reserve >= 4) : if (ver >= v23.00) + We need that (reserve) because after first call of Bcj2Dec_Decode(), + CBcj2Dec::temp can contain up to 4 bytes for writing to (dest). + } + (reserve == 0) is allowed, if we decode full stream via single call of Bcj2Dec_Decode(). + (reserve == 0) also is allowed in case of multi-call, if we use fixed buffers, + and (reserve) is calculated from full (final) sizes of all streams before first call. */ typedef struct @@ -68,21 +99,65 @@ typedef struct unsigned state; /* BCJ2_STREAM_MAIN has more priority than BCJ2_STATE_ORIG */ - UInt32 ip; - Byte temp[4]; + UInt32 ip; /* property of starting base for decoding */ + UInt32 temp; /* Byte temp[4]; */ UInt32 range; UInt32 code; - UInt16 probs[2 + 256]; + CBcj2Prob probs[2 + 256]; } CBcj2Dec; + +/* Note: + Bcj2Dec_Init() sets (CBcj2Dec::ip = 0) + if (ip != 0) property is required, the caller must set CBcj2Dec::ip after Bcj2Dec_Init() +*/ void Bcj2Dec_Init(CBcj2Dec *p); -/* Returns: SZ_OK or SZ_ERROR_DATA */ + +/* Bcj2Dec_Decode(): + returns: + SZ_OK + SZ_ERROR_DATA : if data in 5 starting bytes of BCJ2_STREAM_RC stream are not correct +*/ SRes Bcj2Dec_Decode(CBcj2Dec *p); -#define Bcj2Dec_IsFinished(_p_) ((_p_)->code == 0) +/* To check that decoding was finished you can compare + sizes of processed streams with sizes known from another sources. + You must do at least one mandatory check from the two following options: + - the check for size of processed output (ORIG) stream. + - the check for size of processed input (MAIN) stream. + additional optional checks: + - the checks for processed sizes of all input streams (MAIN, CALL, JUMP, RC) + - the checks Bcj2Dec_IsMaybeFinished*() + also before actual decoding you can check that the + following condition is met for stream sizes: + ( size(ORIG) == size(MAIN) + size(CALL) + size(JUMP) ) +*/ +/* (state == BCJ2_STREAM_MAIN) means that decoder is ready for + additional input data in BCJ2_STREAM_MAIN stream. + Note that (state == BCJ2_STREAM_MAIN) is allowed for non-finished decoding. +*/ +#define Bcj2Dec_IsMaybeFinished_state_MAIN(_p_) ((_p_)->state == BCJ2_STREAM_MAIN) +/* if the stream decoding was finished correctly, then range decoder + part of CBcj2Dec also was finished, and then (CBcj2Dec::code == 0). + Note that (CBcj2Dec::code == 0) is allowed for non-finished decoding. +*/ +#define Bcj2Dec_IsMaybeFinished_code(_p_) ((_p_)->code == 0) + +/* use Bcj2Dec_IsMaybeFinished() only as additional check + after at least one mandatory check from the two following options: + - the check for size of processed output (ORIG) stream. 
+ - the check for size of processed input (MAIN) stream. +*/ +#define Bcj2Dec_IsMaybeFinished(_p_) ( \ + Bcj2Dec_IsMaybeFinished_state_MAIN(_p_) && \ + Bcj2Dec_IsMaybeFinished_code(_p_)) + + + +/* ---------- BCJ2 Encoder ---------- */ typedef enum { @@ -91,6 +166,91 @@ typedef enum BCJ2_ENC_FINISH_MODE_END_STREAM } EBcj2Enc_FinishMode; +/* + BCJ2_ENC_FINISH_MODE_CONTINUE: + process non finished encoding. + It notifies the encoder that additional further calls + can provide more input data (src) than provided by current call. + In that case the CBcj2Enc encoder still can move (src) pointer + up to (srcLim), but CBcj2Enc encoder can store some of the last + processed bytes (up to 4 bytes) from src to internal CBcj2Enc::temp[] buffer. + at return: + (CBcj2Enc::src will point to position that includes + processed data and data copied to (temp[]) buffer) + That data from (temp[]) buffer will be used in further calls. + + BCJ2_ENC_FINISH_MODE_END_BLOCK: + finish encoding of current block (ended at srcLim) without RC flushing. + at return: if (CBcj2Enc::state == BCJ2_ENC_STATE_ORIG) && + CBcj2Enc::src == CBcj2Enc::srcLim) + : it shows that block encoding was finished. And the encoder is + ready for new (src) data or for stream finish operation. + finished block means + { + CBcj2Enc has completed block encoding up to (srcLim). + (1 + 4 bytes) or (2 + 4 bytes) CALL/JUMP cortages will + not cross block boundary at (srcLim). + temporary CBcj2Enc buffer for (ORIG) src data is empty. + 3 output uncompressed streams (MAIN, CALL, JUMP) were flushed. + RC stream was not flushed. And RC stream will cross block boundary. + } + Note: some possible implementation of BCJ2 encoder could + write branch marker (e8/e8/0f8x) in one call of Bcj2Enc_Encode(), + and it could calculate symbol for RC in another call of Bcj2Enc_Encode(). + BCJ2 encoder uses ip/fileIp/fileSize/relatLimit values to calculate RC symbol. + And these CBcj2Enc variables can have different values in different Bcj2Enc_Encode() calls. + So caller must finish each block with BCJ2_ENC_FINISH_MODE_END_BLOCK + to ensure that RC symbol is calculated and written in proper block. + + BCJ2_ENC_FINISH_MODE_END_STREAM + finish encoding of stream (ended at srcLim) fully including RC flushing. + at return: if (CBcj2Enc::state == BCJ2_ENC_STATE_FINISHED) + : it shows that stream encoding was finished fully, + and all output streams were flushed fully. + also Bcj2Enc_IsFinished() can be called. +*/ + + +/* + 32-bit relative offset in JUMP/CALL commands is + - (mod 4 GiB) for 32-bit x86 code + - signed Int32 for 64-bit x86-64 code + BCJ2 encoder also does internal relative to absolute address conversions. + And there are 2 possible ways to do it: + before v23: we used 32-bit variables and (mod 4 GiB) conversion + since v23: we use 64-bit variables and (signed Int32 offset) conversion. + The absolute address condition for conversion in v23: + ((UInt64)((Int64)ip64 - (Int64)fileIp64 + 5 + (Int32)offset) < (UInt64)fileSize64) + note that if (fileSize64 > 2 GiB). there is difference between + old (mod 4 GiB) way (v22) and new (signed Int32 offset) way (v23). + And new (v23) way is more suitable to encode 64-bit x86-64 code for (fileSize64 > 2 GiB) cases. 
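
To make that arithmetic concrete, the v23 condition can be restated as a tiny checker (the function name and sample numbers are made up; the formula is the one quoted above, with fileSize as the plain file size rather than the fileSize64_minus1 field):

    #include <stdint.h>
    #include <stdio.h>

    // would the encoder convert a CALL/JUMP at byte position ip, in a file
    // that starts at fileIp and is fileSize bytes long, whose signed 32-bit
    // relative offset is rel?
    static int would_convert(uint64_t ip, uint64_t fileIp, uint64_t fileSize, int32_t rel)
    {
        // +5 = length of the e8/e9 instruction: 1 opcode byte + 4 offset bytes
        return (uint64_t)((int64_t)ip - (int64_t)fileIp + 5 + rel) < fileSize;
    }

    int main(void)
    {
        printf("%d\n", would_convert(0x1000, 0, 0x100000, 0x200));     // target inside the file: 1
        printf("%d\n", would_convert(0x1000, 0, 0x100000, -0x20000));  // target before the file: 0
        return 0;
    }
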
+*/ + +/* +// for old (v22) way for conversion: +typedef UInt32 CBcj2Enc_ip_unsigned; +typedef Int32 CBcj2Enc_ip_signed; +#define BCJ2_ENC_FileSize_MAX ((UInt32)1 << 31) +*/ +typedef UInt64 CBcj2Enc_ip_unsigned; +typedef Int64 CBcj2Enc_ip_signed; + +/* maximum size of file that can be used for conversion condition */ +#define BCJ2_ENC_FileSize_MAX ((CBcj2Enc_ip_unsigned)0 - 2) + +/* default value of fileSize64_minus1 variable that means + that absolute address limitation will not be used */ +#define BCJ2_ENC_FileSizeField_UNLIMITED ((CBcj2Enc_ip_unsigned)0 - 1) + +/* calculate value that later can be set to CBcj2Enc::fileSize64_minus1 */ +#define BCJ2_ENC_GET_FileSizeField_VAL_FROM_FileSize(fileSize) \ + ((CBcj2Enc_ip_unsigned)(fileSize) - 1) + +/* set CBcj2Enc::fileSize64_minus1 variable from size of file */ +#define Bcj2Enc_SET_FileSize(p, fileSize) \ + (p)->fileSize64_minus1 = BCJ2_ENC_GET_FileSizeField_VAL_FROM_FileSize(fileSize); + + typedef struct { Byte *bufs[BCJ2_NUM_STREAMS]; @@ -101,45 +261,71 @@ typedef struct unsigned state; EBcj2Enc_FinishMode finishMode; - Byte prevByte; + Byte context; + Byte flushRem; + Byte isFlushState; Byte cache; UInt32 range; UInt64 low; UInt64 cacheSize; + + // UInt32 context; // for marker version, it can include marker flag. - UInt32 ip; - - /* 32-bit ralative offset in JUMP/CALL commands is - - (mod 4 GB) in 32-bit mode - - signed Int32 in 64-bit mode - We use (mod 4 GB) check for fileSize. - Use fileSize up to 2 GB, if you want to support 32-bit and 64-bit code conversion. */ - UInt32 fileIp; - UInt32 fileSize; /* (fileSize <= ((UInt32)1 << 31)), 0 means no_limit */ - UInt32 relatLimit; /* (relatLimit <= ((UInt32)1 << 31)), 0 means desable_conversion */ + /* (ip64) and (fileIp64) correspond to virtual source stream position + that doesn't include data in temp[] */ + CBcj2Enc_ip_unsigned ip64; /* current (ip) position */ + CBcj2Enc_ip_unsigned fileIp64; /* start (ip) position of current file */ + CBcj2Enc_ip_unsigned fileSize64_minus1; /* size of current file (for conversion limitation) */ + UInt32 relatLimit; /* (relatLimit <= ((UInt32)1 << 31)) : 0 means disable_conversion */ + // UInt32 relatExcludeBits; UInt32 tempTarget; - unsigned tempPos; - Byte temp[4 * 2]; - - unsigned flushPos; - - UInt16 probs[2 + 256]; + unsigned tempPos; /* the number of bytes that were copied to temp[] buffer + (tempPos <= 4) outside of Bcj2Enc_Encode() */ + // Byte temp[4]; // for marker version + Byte temp[8]; + CBcj2Prob probs[2 + 256]; } CBcj2Enc; void Bcj2Enc_Init(CBcj2Enc *p); -void Bcj2Enc_Encode(CBcj2Enc *p); -#define Bcj2Enc_Get_InputData_Size(p) ((SizeT)((p)->srcLim - (p)->src) + (p)->tempPos) -#define Bcj2Enc_IsFinished(p) ((p)->flushPos == 5) +/* +Bcj2Enc_Encode(): at exit: + p->State < BCJ2_NUM_STREAMS : we need more buffer space for output stream + (bufs[p->State] == lims[p->State]) + p->State == BCJ2_ENC_STATE_ORIG : we need more data in input src stream + (src == srcLim) + p->State == BCJ2_ENC_STATE_FINISHED : after fully encoded stream +*/ +void Bcj2Enc_Encode(CBcj2Enc *p); -#define BCJ2_RELAT_LIMIT_NUM_BITS 26 -#define BCJ2_RELAT_LIMIT ((UInt32)1 << BCJ2_RELAT_LIMIT_NUM_BITS) +/* Bcj2Enc encoder can look ahead for up 4 bytes of source stream. + CBcj2Enc::tempPos : is the number of bytes that were copied from input stream to temp[] buffer. + (CBcj2Enc::src) after Bcj2Enc_Encode() is starting position after + fully processed data and after data copied to temp buffer. 
+ So if the caller needs to get real number of fully processed input + bytes (without look ahead data in temp buffer), + the caller must subtruct (CBcj2Enc::tempPos) value from processed size + value that is calculated based on current (CBcj2Enc::src): + cur_processed_pos = Calc_Big_Processed_Pos(enc.src)) - + Bcj2Enc_Get_AvailInputSize_in_Temp(&enc); +*/ +/* get the size of input data that was stored in temp[] buffer: */ +#define Bcj2Enc_Get_AvailInputSize_in_Temp(p) ((p)->tempPos) -/* limit for CBcj2Enc::fileSize variable */ -#define BCJ2_FileSize_MAX ((UInt32)1 << 31) +#define Bcj2Enc_IsFinished(p) ((p)->flushRem == 0) + +/* Note : the decoder supports overlapping of marker (0f 80). + But we can eliminate such overlapping cases by setting + the limit for relative offset conversion as + CBcj2Enc::relatLimit <= (0x0f << 24) == (240 MiB) +*/ +/* default value for CBcj2Enc::relatLimit */ +#define BCJ2_ENC_RELAT_LIMIT_DEFAULT ((UInt32)0x0f << 24) +#define BCJ2_ENC_RELAT_LIMIT_MAX ((UInt32)1 << 31) +// #define BCJ2_RELAT_EXCLUDE_NUM_BITS 5 EXTERN_C_END diff --git a/src/sdk/C/Bcj2Enc.c b/src/sdk/C/Bcj2Enc.c index bfbeb8e..79460bb 100644 --- a/src/sdk/C/Bcj2Enc.c +++ b/src/sdk/C/Bcj2Enc.c @@ -1,60 +1,62 @@ -/* Bcj2Enc.c -- BCJ2 Encoder (Converter for x86 code) -2019-02-02 : Igor Pavlov : Public domain */ +/* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2) +2023-04-02 : Igor Pavlov : Public domain */ #include "Precomp.h" /* #define SHOW_STAT */ - #ifdef SHOW_STAT #include -#define PRF(x) x +#define PRF2(s) printf("%s ip=%8x tempPos=%d src= %8x\n", s, (unsigned)p->ip64, p->tempPos, (unsigned)(p->srcLim - p->src)); #else -#define PRF(x) +#define PRF2(s) #endif -#include - #include "Bcj2.h" #include "CpuArch.h" -#define CProb UInt16 - #define kTopValue ((UInt32)1 << 24) -#define kNumModelBits 11 -#define kBitModelTotal (1 << kNumModelBits) +#define kNumBitModelTotalBits 11 +#define kBitModelTotal (1 << kNumBitModelTotalBits) #define kNumMoveBits 5 void Bcj2Enc_Init(CBcj2Enc *p) { unsigned i; - - p->state = BCJ2_ENC_STATE_OK; + p->state = BCJ2_ENC_STATE_ORIG; p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE; - - p->prevByte = 0; - + p->context = 0; + p->flushRem = 5; + p->isFlushState = 0; p->cache = 0; - p->range = 0xFFFFFFFF; + p->range = 0xffffffff; p->low = 0; p->cacheSize = 1; - - p->ip = 0; - - p->fileIp = 0; - p->fileSize = 0; - p->relatLimit = BCJ2_RELAT_LIMIT; - + p->ip64 = 0; + p->fileIp64 = 0; + p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED; + p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT; + // p->relatExcludeBits = 0; p->tempPos = 0; - - p->flushPos = 0; - for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++) p->probs[i] = kBitModelTotal >> 1; } -static BoolInt MY_FAST_CALL RangeEnc_ShiftLow(CBcj2Enc *p) +// Z7_NO_INLINE +Z7_FORCE_INLINE +static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p) { - if ((UInt32)p->low < (UInt32)0xFF000000 || (UInt32)(p->low >> 32) != 0) + const UInt32 low = (UInt32)p->low; + const unsigned high = (unsigned) + #if defined(Z7_MSC_VER_ORIGINAL) \ + && defined(MY_CPU_X86) \ + && defined(MY_CPU_LE) \ + && !defined(MY_CPU_64BIT) + // we try to rid of __aullshr() call in MSVS-x86 + (((const UInt32 *)&p->low)[1]); // [1] : for little-endian only + #else + (p->low >> 32); + #endif + if (low < (UInt32)0xff000000 || high != 0) { Byte *buf = p->bufs[BCJ2_STREAM_RC]; do @@ -65,247 +67,440 @@ static BoolInt MY_FAST_CALL RangeEnc_ShiftLow(CBcj2Enc *p) p->bufs[BCJ2_STREAM_RC] = buf; return True; } - *buf++ = (Byte)(p->cache + 
(Byte)(p->low >> 32)); - p->cache = 0xFF; + *buf++ = (Byte)(p->cache + high); + p->cache = 0xff; } while (--p->cacheSize); p->bufs[BCJ2_STREAM_RC] = buf; - p->cache = (Byte)((UInt32)p->low >> 24); + p->cache = (Byte)(low >> 24); } p->cacheSize++; - p->low = (UInt32)p->low << 8; + p->low = low << 8; return False; } -static void Bcj2Enc_Encode_2(CBcj2Enc *p) -{ - if (BCJ2_IS_32BIT_STREAM(p->state)) + +/* +We can use 2 alternative versions of code: +1) non-marker version: + Byte CBcj2Enc::context + Byte temp[8]; + Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer. + Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction + with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call. + +2) marker version: + UInt32 CBcj2Enc::context + Byte CBcj2Enc::temp[4]; + MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains finded marker. + it's allowed that + one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest, + and another call of Bcj2Enc_Encode_2() does offset conversion. + So different values of (fileIp) and (fileSize) are possible + in these different Bcj2Enc_Encode_2() calls. + +Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop. +So we use non-marker version. +*/ + +/* + Corner cases with overlap in multi-block. + before v23: there was one corner case, where converted instruction + could start in one sub-stream and finish in next sub-stream. + If multi-block (solid) encoding is used, + and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream. + and (0f) is last byte of previous sub-stream + and (8x) is first byte of current sub-stream + then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder. + BCJ2 encoder can converts 32-bit offset for that (0f 8x) cortage, + if that offset meets limit requirements. + If encoder allows 32-bit offset conversion for such overlap case, + then the data in 3 uncompressed BCJ2 streams for some sub-stream + can depend from data of previous sub-stream. + That corner case is not big problem, and it's rare case. + Since v23.00 we do additional check to prevent conversions in such overlap cases. +*/ + +/* + Bcj2Enc_Encode_2() output variables at exit: { - Byte *cur = p->bufs[p->state]; - if (cur == p->lims[p->state]) - return; - SetBe32(cur, p->tempTarget); - p->bufs[p->state] = cur + 4; + if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG)) + { + it means that encoder needs more input data. + if (p->srcLim == p->src) at exit, then + { + (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM) + all input data were read and processed, and we are ready for + new input data. + } + else + { + (p->srcLim != p->src) + (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE) + The encoder have found e8/e9/0f_8x marker, + and p->src points to last byte of that marker, + Bcj2Enc_Encode_2() needs more input data to get totally + 5 bytes (last byte of marker and 32-bit branch offset) + as continuous array starting from p->src. + (p->srcLim - p->src < 5) requirement is met after exit. + So non-processed resedue from p->src to p->srcLim is always less than 5 bytes. 
+ } + } } +*/ - p->state = BCJ2_ENC_STATE_ORIG; - - for (;;) +Z7_NO_INLINE +static void Bcj2Enc_Encode_2(CBcj2Enc *p) +{ + if (!p->isFlushState) { - if (p->range < kTopValue) + const Byte *src; + UInt32 v; { - if (RangeEnc_ShiftLow(p)) - return; - p->range <<= 8; + const unsigned state = p->state; + if (BCJ2_IS_32BIT_STREAM(state)) + { + Byte *cur = p->bufs[state]; + if (cur == p->lims[state]) + return; + SetBe32a(cur, p->tempTarget) + p->bufs[state] = cur + 4; + } } + p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit + src = p->src; + v = p->context; + + // #define WRITE_CONTEXT p->context = v; // for marker version + #define WRITE_CONTEXT p->context = (Byte)v; + #define WRITE_CONTEXT_AND_SRC p->src = src; WRITE_CONTEXT + for (;;) { + // const Byte *src; + // UInt32 v; + CBcj2Enc_ip_unsigned ip; + if (p->range < kTopValue) + { + // to reduce register pressure and code size: we save and restore local variables. + WRITE_CONTEXT_AND_SRC + if (Bcj2_RangeEnc_ShiftLow(p)) + return; + p->range <<= 8; + src = p->src; + v = p->context; + } + // src = p->src; + // #define MARKER_FLAG ((UInt32)1 << 17) + // if ((v & MARKER_FLAG) == 0) // for marker version { - const Byte *src = p->src; const Byte *srcLim; - Byte *dest; - SizeT num = p->srcLim - src; - - if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE) + Byte *dest = p->bufs[BCJ2_STREAM_MAIN]; { - if (num <= 4) - return; - num -= 4; + const SizeT remSrc = (SizeT)(p->srcLim - src); + SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest); + if (rem >= remSrc) + rem = remSrc; + srcLim = src + rem; } - else if (num == 0) - break; - - dest = p->bufs[BCJ2_STREAM_MAIN]; - if (num > (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest)) + /* p->context contains context of previous byte: + bits [0 : 7] : src[-1], if (src) was changed in this call + bits [8 : 31] : are undefined for non-marker version + */ + // v = p->context; + #define NUM_SHIFT_BITS 24 + #define CONV_FLAG ((UInt32)1 << 16) + #define ONE_ITER { \ + b = src[0]; \ + *dest++ = (Byte)b; \ + v = (v << NUM_SHIFT_BITS) | b; \ + if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \ + if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \ + ((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \ + src++; if (src == srcLim) { break; } } + + if (src != srcLim) + for (;;) { - num = p->lims[BCJ2_STREAM_MAIN] - dest; - if (num == 0) - { - p->state = BCJ2_STREAM_MAIN; - return; - } + /* clang can generate ineffective code with setne instead of two jcc instructions. + we can use 2 iterations and external (unsigned b) to avoid that ineffective code genaration. 
*/ + unsigned b; + ONE_ITER + ONE_ITER } - - srcLim = src + num; + + ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]); + p->bufs[BCJ2_STREAM_MAIN] = dest; + p->ip64 = ip; - if (p->prevByte == 0x0F && (src[0] & 0xF0) == 0x80) - *dest = src[0]; - else for (;;) + if (src == srcLim) { - Byte b = *src; - *dest = b; - if (b != 0x0F) + WRITE_CONTEXT_AND_SRC + if (src != p->srcLim) { - if ((b & 0xFE) == 0xE8) - break; - dest++; - if (++src != srcLim) - continue; - break; + p->state = BCJ2_STREAM_MAIN; + return; } - dest++; - if (++src == srcLim) - break; - if ((*src & 0xF0) != 0x80) - continue; - *dest = *src; + /* (p->src == p->srcLim) + (p->state == BCJ2_ENC_STATE_ORIG) */ + if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM) + return; + /* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM */ + // (p->flushRem == 5); + p->isFlushState = 1; break; } - - num = src - p->src; - - if (src == srcLim) - { - p->prevByte = src[-1]; - p->bufs[BCJ2_STREAM_MAIN] = dest; - p->src = src; - p->ip += (UInt32)num; - continue; - } - + src++; + // p->src = src; + } + // ip = p->ip; // for marker version + /* marker was found */ + /* (v) contains marker that was found: + bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7] + : value of src[-2] : xx/xx/0f + bits [0 : 7] : value of src[-1] : e8/e9/8x + */ + { { - Byte context = (Byte)(num == 0 ? p->prevByte : src[-1]); - BoolInt needConvert; - - p->bufs[BCJ2_STREAM_MAIN] = dest + 1; - p->ip += (UInt32)num + 1; - src++; - - needConvert = False; - + #if NUM_SHIFT_BITS != 24 + v &= ~(UInt32)CONV_FLAG; + #endif + // UInt32 relat = 0; if ((SizeT)(p->srcLim - src) >= 4) { - UInt32 relatVal = GetUi32(src); - if ((p->fileSize == 0 || (UInt32)(p->ip + 4 + relatVal - p->fileIp) < p->fileSize) - && ((relatVal + p->relatLimit) >> 1) < p->relatLimit) - needConvert = True; + /* + if (relat != 0 || (Byte)v != 0xe8) + BoolInt isBigOffset = True; + */ + const UInt32 relat = GetUi32(src); + /* + #define EXCLUDE_FLAG ((UInt32)1 << 4) + #define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0) + if (p->relatExcludeBits != 0) + { + const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1); + isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0); + } + // isBigOffset = False; // for debug + */ + ip -= p->fileIp64; + // Use the following if check, if (ip) is 64-bit: + if (ip > (((v + 0x20) >> 5) & 1)) // 23.00 : we eliminate milti-block overlap for (Of 80) and (e8/e9) + if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1) + if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit) + v |= CONV_FLAG; } - + else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE) { - UInt32 bound; - unsigned ttt; - Byte b = src[-1]; - CProb *prob = p->probs + (unsigned)(b == 0xE8 ? 2 + (unsigned)context : (b == 0xE9 ? 1 : 0)); - - ttt = *prob; - bound = (p->range >> kNumModelBits) * ttt; - - if (!needConvert) + // (p->srcLim - src < 4) + // /* + // for non-marker version + p->ip64--; // p->ip = ip - 1; + p->bufs[BCJ2_STREAM_MAIN]--; + src--; + v >>= NUM_SHIFT_BITS; + // (0 < p->srcLim - p->src <= 4) + // */ + // v |= MARKER_FLAG; // for marker version + /* (p->state == BCJ2_ENC_STATE_ORIG) */ + WRITE_CONTEXT_AND_SRC + return; + } + { + const unsigned c = ((v + 0x17) >> 6) & 1; + CBcj2Prob *prob = p->probs + (unsigned) + (((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1)); + /* + ((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) : + ((Byte)v < 0xe8 ? 
0 : 1)); // ((v >> 5) & 1)); + */ + const unsigned ttt = *prob; + const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt; + if ((v & CONV_FLAG) == 0) { + // static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy); + // v = (Byte)v; // for marker version p->range = bound; - *prob = (CProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); - p->src = src; - p->prevByte = b; + *prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); + // WRITE_CONTEXT_AND_SRC continue; } - p->low += bound; p->range -= bound; - *prob = (CProb)(ttt - (ttt >> kNumMoveBits)); - + *prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits)); + } + // p->context = src[3]; + { + // const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP); + const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL; + ip = p->ip64; + v = GetUi32(src); // relat + ip += 4; + p->ip64 = ip; + src += 4; + // p->src = src; { - UInt32 relatVal = GetUi32(src); - UInt32 absVal; - p->ip += 4; - absVal = p->ip + relatVal; - p->prevByte = src[3]; - src += 4; - p->src = src; + const UInt32 absol = (UInt32)ip + v; + Byte *cur = p->bufs[cj]; + v >>= 24; + // WRITE_CONTEXT + if (cur == p->lims[cj]) { - unsigned cj = (b == 0xE8) ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP; - Byte *cur = p->bufs[cj]; - if (cur == p->lims[cj]) - { - p->state = cj; - p->tempTarget = absVal; - return; - } - SetBe32(cur, absVal); - p->bufs[cj] = cur + 4; + p->state = cj; + p->tempTarget = absol; + WRITE_CONTEXT_AND_SRC + return; } + SetBe32a(cur, absol) + p->bufs[cj] = cur + 4; } } } } - } + } // end of loop } - if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM) - return; - - for (; p->flushPos < 5; p->flushPos++) - if (RangeEnc_ShiftLow(p)) + for (; p->flushRem != 0; p->flushRem--) + if (Bcj2_RangeEnc_ShiftLow(p)) return; - p->state = BCJ2_ENC_STATE_OK; + p->state = BCJ2_ENC_STATE_FINISHED; } +/* +BCJ2 encoder needs look ahead for up to 4 bytes in (src) buffer. +So base function Bcj2Enc_Encode_2() + in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with + (p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim) +Bcj2Enc_Encode() solves that look ahead problem by using p->temp[] buffer. + so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(), + then (p->src == p->srcLim). + And the caller's code is simpler with Bcj2Enc_Encode(). +*/ + +Z7_NO_INLINE void Bcj2Enc_Encode(CBcj2Enc *p) { - PRF(printf("\n")); - PRF(printf("---- ip = %8d tempPos = %8d src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src)); - + PRF2("\n----") if (p->tempPos != 0) { + /* extra: number of bytes that were copied from (src) to (temp) buffer in this call */ unsigned extra = 0; - + /* We will touch only minimal required number of bytes in input (src) stream. + So we will add input bytes from (src) stream to temp[] with step of 1 byte. + We don't add new bytes to temp[] before Bcj2Enc_Encode_2() call + in first loop iteration because + - previous call of Bcj2Enc_Encode() could use another (finishMode), + - previous call could finish with (p->state != BCJ2_ENC_STATE_ORIG). + the case with full temp[] buffer (p->tempPos == 4) is possible here. 
+ */ for (;;) { + // (0 < p->tempPos <= 5) // in non-marker version + /* p->src : the current src data position including extra bytes + that were copied to temp[] buffer in this call */ const Byte *src = p->src; const Byte *srcLim = p->srcLim; - EBcj2Enc_FinishMode finishMode = p->finishMode; - - p->src = p->temp; - p->srcLim = p->temp + p->tempPos; + const EBcj2Enc_FinishMode finishMode = p->finishMode; if (src != srcLim) + { + /* if there are some src data after the data copied to temp[], + then we use MODE_CONTINUE for temp data */ p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE; - - PRF(printf(" ip = %8d tempPos = %8d src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src)); - + } + p->src = p->temp; + p->srcLim = p->temp + p->tempPos; + PRF2(" ") Bcj2Enc_Encode_2(p); - { - unsigned num = (unsigned)(p->src - p->temp); - unsigned tempPos = p->tempPos - num; + const unsigned num = (unsigned)(p->src - p->temp); + const unsigned tempPos = p->tempPos - num; unsigned i; p->tempPos = tempPos; for (i = 0; i < tempPos; i++) - p->temp[i] = p->temp[(size_t)i + num]; - + p->temp[i] = p->temp[(SizeT)i + num]; + // tempPos : number of bytes in temp buffer p->src = src; p->srcLim = srcLim; p->finishMode = finishMode; - - if (p->state != BCJ2_ENC_STATE_ORIG || src == srcLim) + if (p->state != BCJ2_ENC_STATE_ORIG) + { + // (p->tempPos <= 4) // in non-marker version + /* if (the reason of exit from Bcj2Enc_Encode_2() + is not BCJ2_ENC_STATE_ORIG), + then we exit from Bcj2Enc_Encode() with same reason */ + // optional code begin : we rollback (src) and tempPos, if it's possible: + if (extra >= tempPos) + extra = tempPos; + p->src = src - extra; + p->tempPos = tempPos - extra; + // optional code end : rollback of (src) and tempPos return; - + } + /* (p->tempPos <= 4) + (p->state == BCJ2_ENC_STATE_ORIG) + so encoder needs more data than in temp[] */ + if (src == srcLim) + return; // src buffer has no more input data. + /* (src != srcLim) + so we can provide more input data from src for Bcj2Enc_Encode_2() */ if (extra >= tempPos) { - p->src = src - tempPos; + /* (extra >= tempPos) means that temp buffer contains + only data from src buffer of this call. + So now we can encode without temp buffer */ + p->src = src - tempPos; // rollback (src) p->tempPos = 0; break; } - - p->temp[tempPos] = src[0]; + // we append one additional extra byte from (src) to temp[] buffer: + p->temp[tempPos] = *src; p->tempPos = tempPos + 1; + // (0 < p->tempPos <= 5) // in non-marker version p->src = src + 1; extra++; } } } - PRF(printf("++++ ip = %8d tempPos = %8d src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src)); - + PRF2("++++") + // (p->tempPos == 0) Bcj2Enc_Encode_2(p); + PRF2("====") if (p->state == BCJ2_ENC_STATE_ORIG) { const Byte *src = p->src; - unsigned rem = (unsigned)(p->srcLim - src); - unsigned i; - for (i = 0; i < rem; i++) - p->temp[i] = src[i]; - p->tempPos = rem; - p->src = src + rem; + const Byte *srcLim = p->srcLim; + const unsigned rem = (unsigned)(srcLim - src); + /* (rem <= 4) here. + if (p->src != p->srcLim), then + - we copy non-processed bytes from (p->src) to temp[] buffer, + - we set p->src equal to p->srcLim. 
+ */ + if (rem) + { + unsigned i = 0; + p->src = srcLim; + p->tempPos = rem; + // (0 < p->tempPos <= 4) + do + p->temp[i] = src[i]; + while (++i != rem); + } + // (p->tempPos <= 4) + // (p->src == p->srcLim) } } + +#undef PRF2 +#undef CONV_FLAG +#undef MARKER_FLAG +#undef WRITE_CONTEXT +#undef WRITE_CONTEXT_AND_SRC +#undef ONE_ITER +#undef NUM_SHIFT_BITS +#undef kTopValue +#undef kNumBitModelTotalBits +#undef kBitModelTotal +#undef kNumMoveBits diff --git a/src/sdk/C/Bra.c b/src/sdk/C/Bra.c index aed17e3..e61edf8 100644 --- a/src/sdk/C/Bra.c +++ b/src/sdk/C/Bra.c @@ -1,230 +1,709 @@ -/* Bra.c -- Converters for RISC code -2017-04-04 : Igor Pavlov : Public domain */ +/* Bra.c -- Branch converters for RISC code +2024-01-20 : Igor Pavlov : Public domain */ #include "Precomp.h" -#include "CpuArch.h" #include "Bra.h" +#include "RotateDefs.h" +#include "CpuArch.h" + +#if defined(MY_CPU_SIZEOF_POINTER) \ + && ( MY_CPU_SIZEOF_POINTER == 4 \ + || MY_CPU_SIZEOF_POINTER == 8) + #define BR_CONV_USE_OPT_PC_PTR +#endif + +#ifdef BR_CONV_USE_OPT_PC_PTR +#define BR_PC_INIT pc -= (UInt32)(SizeT)p; +#define BR_PC_GET (pc + (UInt32)(SizeT)p) +#else +#define BR_PC_INIT pc += (UInt32)size; +#define BR_PC_GET (pc - (UInt32)(SizeT)(lim - p)) +// #define BR_PC_INIT +// #define BR_PC_GET (pc + (UInt32)(SizeT)(p - data)) +#endif + +#define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c; +// #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c; + +#define Z7_BRANCH_CONV(name) z7_ ## name + +#define Z7_BRANCH_FUNC_MAIN(name) \ +static \ +Z7_FORCE_INLINE \ +Z7_ATTRIB_NO_VECTOR \ +Byte *Z7_BRANCH_CONV(name)(Byte *p, SizeT size, UInt32 pc, int encoding) + +#define Z7_BRANCH_FUNC_IMP(name, m, encoding) \ +Z7_NO_INLINE \ +Z7_ATTRIB_NO_VECTOR \ +Byte *m(name)(Byte *data, SizeT size, UInt32 pc) \ + { return Z7_BRANCH_CONV(name)(data, size, pc, encoding); } \ + +#ifdef Z7_EXTRACT_ONLY +#define Z7_BRANCH_FUNCS_IMP(name) \ + Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0) +#else +#define Z7_BRANCH_FUNCS_IMP(name) \ + Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0) \ + Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_ENC_2, 1) +#endif -SizeT ARM_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) +#if defined(__clang__) +#define BR_EXTERNAL_FOR +#define BR_NEXT_ITERATION continue; +#else +#define BR_EXTERNAL_FOR for (;;) +#define BR_NEXT_ITERATION break; +#endif + +#if defined(__clang__) && (__clang_major__ >= 8) \ + || defined(__GNUC__) && (__GNUC__ >= 1000) \ + // GCC is not good for __builtin_expect() here + /* || defined(_MSC_VER) && (_MSC_VER >= 1920) */ + // #define Z7_unlikely [[unlikely]] + // #define Z7_LIKELY(x) (__builtin_expect((x), 1)) + #define Z7_UNLIKELY(x) (__builtin_expect((x), 0)) + // #define Z7_likely [[likely]] +#else + // #define Z7_LIKELY(x) (x) + #define Z7_UNLIKELY(x) (x) + // #define Z7_likely +#endif + + +Z7_BRANCH_FUNC_MAIN(BranchConv_ARM64) { - Byte *p; + // Byte *p = data; const Byte *lim; - size &= ~(size_t)3; - ip += 4; - p = data; - lim = data + size; + const UInt32 flag = (UInt32)1 << (24 - 4); + const UInt32 mask = ((UInt32)1 << 24) - (flag << 1); + size &= ~(SizeT)3; + // if (size == 0) return p; + lim = p + size; + BR_PC_INIT + pc -= 4; // because (p) will point to next instruction + + BR_EXTERNAL_FOR + { + // Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (;;) + { + UInt32 v; + if Z7_UNLIKELY(p == lim) + return p; + v = GetUi32a(p); + p += 4; + if Z7_UNLIKELY(((v - 0x94000000) & 0xfc000000) == 0) + { + UInt32 c = BR_PC_GET >> 2; + BR_CONVERT_VAL(v, c) + v &= 
0x03ffffff; + v |= 0x94000000; + SetUi32a(p - 4, v) + BR_NEXT_ITERATION + } + // v = rotlFixed(v, 8); v += (flag << 8) - 0x90; if Z7_UNLIKELY((v & ((mask << 8) + 0x9f)) == 0) + v -= 0x90000000; if Z7_UNLIKELY((v & 0x9f000000) == 0) + { + UInt32 z, c; + // v = rotrFixed(v, 8); + v += flag; if Z7_UNLIKELY(v & mask) continue; + z = (v & 0xffffffe0) | (v >> 26); + c = (BR_PC_GET >> (12 - 3)) & ~(UInt32)7; + BR_CONVERT_VAL(z, c) + v &= 0x1f; + v |= 0x90000000; + v |= z << 26; + v |= 0x00ffffe0 & ((z & (((flag << 1) - 1))) - flag); + SetUi32a(p - 4, v) + } + } + } +} +Z7_BRANCH_FUNCS_IMP(BranchConv_ARM64) - if (encoding) +Z7_BRANCH_FUNC_MAIN(BranchConv_ARM) +{ + // Byte *p = data; + const Byte *lim; + size &= ~(SizeT)3; + lim = p + size; + BR_PC_INIT + /* in ARM: branch offset is relative to the +2 instructions from current instruction. + (p) will point to next instruction */ + pc += 8 - 4; + for (;;) { for (;;) { - if (p >= lim) - return p - data; - p += 4; - if (p[-1] == 0xEB) - break; + if Z7_UNLIKELY(p >= lim) { return p; } p += 4; if Z7_UNLIKELY(p[-1] == 0xeb) break; + if Z7_UNLIKELY(p >= lim) { return p; } p += 4; if Z7_UNLIKELY(p[-1] == 0xeb) break; } { - UInt32 v = GetUi32(p - 4); - v <<= 2; - v += ip + (UInt32)(p - data); - v >>= 2; - v &= 0x00FFFFFF; - v |= 0xEB000000; - SetUi32(p - 4, v); + UInt32 v = GetUi32a(p - 4); + UInt32 c = BR_PC_GET >> 2; + BR_CONVERT_VAL(v, c) + v &= 0x00ffffff; + v |= 0xeb000000; + SetUi32a(p - 4, v) } } +} +Z7_BRANCH_FUNCS_IMP(BranchConv_ARM) + +Z7_BRANCH_FUNC_MAIN(BranchConv_PPC) +{ + // Byte *p = data; + const Byte *lim; + size &= ~(SizeT)3; + lim = p + size; + BR_PC_INIT + pc -= 4; // because (p) will point to next instruction + for (;;) { + UInt32 v; for (;;) { - if (p >= lim) - return p - data; + if Z7_UNLIKELY(p == lim) + return p; + // v = GetBe32a(p); + v = *(UInt32 *)(void *)p; p += 4; - if (p[-1] == 0xEB) - break; + // if ((v & 0xfc000003) == 0x48000001) break; + // if ((p[-4] & 0xFC) == 0x48 && (p[-1] & 3) == 1) break; + if Z7_UNLIKELY( + ((v - Z7_CONV_BE_TO_NATIVE_CONST32(0x48000001)) + & Z7_CONV_BE_TO_NATIVE_CONST32(0xfc000003)) == 0) break; } { - UInt32 v = GetUi32(p - 4); - v <<= 2; - v -= ip + (UInt32)(p - data); - v >>= 2; - v &= 0x00FFFFFF; - v |= 0xEB000000; - SetUi32(p - 4, v); + v = Z7_CONV_NATIVE_TO_BE_32(v); + { + UInt32 c = BR_PC_GET; + BR_CONVERT_VAL(v, c) + } + v &= 0x03ffffff; + v |= 0x48000000; + SetBe32a(p - 4, v) } } } +Z7_BRANCH_FUNCS_IMP(BranchConv_PPC) + +#ifdef Z7_CPU_FAST_ROTATE_SUPPORTED +#define BR_SPARC_USE_ROTATE +#endif -SizeT ARMT_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) +Z7_BRANCH_FUNC_MAIN(BranchConv_SPARC) { - Byte *p; + // Byte *p = data; const Byte *lim; - size &= ~(size_t)1; - p = data; - lim = data + size - 4; - - if (encoding) - + const UInt32 flag = (UInt32)1 << 22; + size &= ~(SizeT)3; + lim = p + size; + BR_PC_INIT + pc -= 4; // because (p) will point to next instruction for (;;) { - UInt32 b1; + UInt32 v; for (;;) { - UInt32 b3; - if (p > lim) - return p - data; - b1 = p[1]; - b3 = p[3]; - p += 2; - b1 ^= 8; - if ((b3 & b1) >= 0xF8) + if Z7_UNLIKELY(p == lim) + return p; + /* // the code without GetBe32a(): + { const UInt32 v = GetUi16a(p) & 0xc0ff; p += 4; if (v == 0x40 || v == 0xc07f) break; } + */ + v = GetBe32a(p); + p += 4; + #ifdef BR_SPARC_USE_ROTATE + v = rotlFixed(v, 2); + v += (flag << 2) - 1; + if Z7_UNLIKELY((v & (3 - (flag << 3))) == 0) + #else + v += (UInt32)5 << 29; + v ^= (UInt32)7 << 29; + v += flag; + if Z7_UNLIKELY((v & (0 - (flag << 1))) == 0) + #endif break; } { - 
UInt32 v = - ((UInt32)b1 << 19) - + (((UInt32)p[1] & 0x7) << 8) - + (((UInt32)p[-2] << 11)) - + (p[0]); - - p += 2; + // UInt32 v = GetBe32a(p - 4); + #ifndef BR_SPARC_USE_ROTATE + v <<= 2; + #endif { - UInt32 cur = (ip + (UInt32)(p - data)) >> 1; - v += cur; + UInt32 c = BR_PC_GET; + BR_CONVERT_VAL(v, c) } - - p[-4] = (Byte)(v >> 11); - p[-3] = (Byte)(0xF0 | ((v >> 19) & 0x7)); - p[-2] = (Byte)v; - p[-1] = (Byte)(0xF8 | (v >> 8)); + v &= (flag << 3) - 1; + #ifdef BR_SPARC_USE_ROTATE + v -= (flag << 2) - 1; + v = rotrFixed(v, 2); + #else + v -= (flag << 2); + v >>= 2; + v |= (UInt32)1 << 30; + #endif + SetBe32a(p - 4, v) } } +} +Z7_BRANCH_FUNCS_IMP(BranchConv_SPARC) + + +Z7_BRANCH_FUNC_MAIN(BranchConv_ARMT) +{ + // Byte *p = data; + Byte *lim; + size &= ~(SizeT)1; + // if (size == 0) return p; + if (size <= 2) return p; + size -= 2; + lim = p + size; + BR_PC_INIT + /* in ARM: branch offset is relative to the +2 instructions from current instruction. + (p) will point to the +2 instructions from current instruction */ + // pc += 4 - 4; + // if (encoding) pc -= 0xf800 << 1; else pc += 0xf800 << 1; + // #define ARMT_TAIL_PROC { goto armt_tail; } + #define ARMT_TAIL_PROC { return p; } - for (;;) + do { - UInt32 b1; + /* in MSVC 32-bit x86 compilers: + UInt32 version : it loads value from memory with movzx + Byte version : it loads value to 8-bit register (AL/CL) + movzx version is slightly faster in some cpus + */ + unsigned b1; + // Byte / unsigned + b1 = p[1]; + // optimized version to reduce one (p >= lim) check: + // unsigned a1 = p[1]; b1 = p[3]; p += 2; if Z7_LIKELY((b1 & (a1 ^ 8)) < 0xf8) for (;;) { - UInt32 b3; - if (p > lim) - return p - data; - b1 = p[1]; - b3 = p[3]; - p += 2; - b1 ^= 8; - if ((b3 & b1) >= 0xF8) - break; + unsigned b3; // Byte / UInt32 + /* (Byte)(b3) normalization can use low byte computations in MSVC. + It gives smaller code, and no loss of speed in some compilers/cpus. + But new MSVC 32-bit x86 compilers use more slow load + from memory to low byte register in that case. + So we try to use full 32-bit computations for faster code. + */ + // if (p >= lim) { ARMT_TAIL_PROC } b3 = b1 + 8; b1 = p[3]; p += 2; if ((b3 & b1) >= 0xf8) break; + if Z7_UNLIKELY(p >= lim) { ARMT_TAIL_PROC } b3 = p[3]; p += 2; if Z7_UNLIKELY((b3 & (b1 ^ 8)) >= 0xf8) break; + if Z7_UNLIKELY(p >= lim) { ARMT_TAIL_PROC } b1 = p[3]; p += 2; if Z7_UNLIKELY((b1 & (b3 ^ 8)) >= 0xf8) break; } { + /* we can adjust pc for (0xf800) to rid of (& 0x7FF) operation. + But gcc/clang for arm64 can use bfi instruction for full code here */ UInt32 v = - ((UInt32)b1 << 19) + ((UInt32)GetUi16a(p - 2) << 11) | + ((UInt32)GetUi16a(p) & 0x7FF); + /* + UInt32 v = + ((UInt32)p[1 - 2] << 19) + (((UInt32)p[1] & 0x7) << 8) + (((UInt32)p[-2] << 11)) + (p[0]); - + */ p += 2; { - UInt32 cur = (ip + (UInt32)(p - data)) >> 1; - v -= cur; + UInt32 c = BR_PC_GET >> 1; + BR_CONVERT_VAL(v, c) } - + SetUi16a(p - 4, (UInt16)(((v >> 11) & 0x7ff) | 0xf000)) + SetUi16a(p - 2, (UInt16)(v | 0xf800)) /* - SetUi16(p - 4, (UInt16)(((v >> 11) & 0x7FF) | 0xF000)); - SetUi16(p - 2, (UInt16)(v | 0xF800)); - */ - p[-4] = (Byte)(v >> 11); - p[-3] = (Byte)(0xF0 | ((v >> 19) & 0x7)); + p[-3] = (Byte)(0xf0 | ((v >> 19) & 0x7)); p[-2] = (Byte)v; - p[-1] = (Byte)(0xF8 | (v >> 8)); + p[-1] = (Byte)(0xf8 | (v >> 8)); + */ } } + while (p < lim); + return p; + // armt_tail: + // if ((Byte)((lim[1] & 0xf8)) != 0xf0) { lim += 2; } return lim; + // return (Byte *)(lim + ((Byte)((lim[1] ^ 0xf0) & 0xf8) == 0 ? 
0 : 2)); + // return (Byte *)(lim + (((lim[1] ^ ~0xfu) & ~7u) == 0 ? 0 : 2)); + // return (Byte *)(lim + 2 - (((((unsigned)lim[1] ^ 8) + 8) >> 7) & 2)); } +Z7_BRANCH_FUNCS_IMP(BranchConv_ARMT) -SizeT PPC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) +// #define BR_IA64_NO_INLINE + +Z7_BRANCH_FUNC_MAIN(BranchConv_IA64) { - Byte *p; + // Byte *p = data; const Byte *lim; - size &= ~(size_t)3; - ip -= 4; - p = data; - lim = data + size; - + size &= ~(SizeT)15; + lim = p + size; + pc -= 1 << 4; + pc >>= 4 - 1; + // pc -= 1 << 1; + for (;;) { + unsigned m; for (;;) { - if (p >= lim) - return p - data; - p += 4; - /* if ((v & 0xFC000003) == 0x48000001) */ - if ((p[-4] & 0xFC) == 0x48 && (p[-1] & 3) == 1) + if Z7_UNLIKELY(p == lim) + return p; + m = (unsigned)((UInt32)0x334b0000 >> (*p & 0x1e)); + p += 16; + pc += 1 << 1; + if (m &= 3) break; } { - UInt32 v = GetBe32(p - 4); - if (encoding) - v += ip + (UInt32)(p - data); - else - v -= ip + (UInt32)(p - data); - v &= 0x03FFFFFF; - v |= 0x48000000; - SetBe32(p - 4, v); + p += (ptrdiff_t)m * 5 - 20; // negative value is expected here. + do + { + const UInt32 t = + #if defined(MY_CPU_X86_OR_AMD64) + // we use 32-bit load here to reduce code size on x86: + GetUi32(p); + #else + GetUi16(p); + #endif + UInt32 z = GetUi32(p + 1) >> m; + p += 5; + if (((t >> m) & (0x70 << 1)) == 0 + && ((z - (0x5000000 << 1)) & (0xf000000 << 1)) == 0) + { + UInt32 v = (UInt32)((0x8fffff << 1) | 1) & z; + z ^= v; + #ifdef BR_IA64_NO_INLINE + v |= (v & ((UInt32)1 << (23 + 1))) >> 3; + { + UInt32 c = pc; + BR_CONVERT_VAL(v, c) + } + v &= (0x1fffff << 1) | 1; + #else + { + if (encoding) + { + // pc &= ~(0xc00000 << 1); // we just need to clear at least 2 bits + pc &= (0x1fffff << 1) | 1; + v += pc; + } + else + { + // pc |= 0xc00000 << 1; // we need to set at least 2 bits + pc |= ~(UInt32)((0x1fffff << 1) | 1); + v -= pc; + } + } + v &= ~(UInt32)(0x600000 << 1); + #endif + v += (0x700000 << 1); + v &= (0x8fffff << 1) | 1; + z |= v; + z <<= m; + SetUi32(p + 1 - 5, z) + } + m++; + } + while (m &= 3); // while (m < 4); } } } +Z7_BRANCH_FUNCS_IMP(BranchConv_IA64) + + +#define BR_CONVERT_VAL_ENC(v) v += BR_PC_GET; +#define BR_CONVERT_VAL_DEC(v) v -= BR_PC_GET; +#if 1 && defined(MY_CPU_LE_UNALIGN) + #define RISCV_USE_UNALIGNED_LOAD +#endif -SizeT SPARC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) +#ifdef RISCV_USE_UNALIGNED_LOAD + #define RISCV_GET_UI32(p) GetUi32(p) + #define RISCV_SET_UI32(p, v) { SetUi32(p, v) } +#else + #define RISCV_GET_UI32(p) \ + ((UInt32)GetUi16a(p) + \ + ((UInt32)GetUi16a((p) + 2) << 16)) + #define RISCV_SET_UI32(p, v) { \ + SetUi16a(p, (UInt16)(v)) \ + SetUi16a((p) + 2, (UInt16)(v >> 16)) } +#endif + +#if 1 && defined(MY_CPU_LE) + #define RISCV_USE_16BIT_LOAD +#endif + +#ifdef RISCV_USE_16BIT_LOAD + #define RISCV_LOAD_VAL(p) GetUi16a(p) +#else + #define RISCV_LOAD_VAL(p) (*(p)) +#endif + +#define RISCV_INSTR_SIZE 2 +#define RISCV_STEP_1 (4 + RISCV_INSTR_SIZE) +#define RISCV_STEP_2 4 +#define RISCV_REG_VAL (2 << 7) +#define RISCV_CMD_VAL 3 +#if 1 + // for code size optimization: + #define RISCV_DELTA_7F 0x7f +#else + #define RISCV_DELTA_7F 0 +#endif + +#define RISCV_CHECK_1(v, b) \ + (((((b) - RISCV_CMD_VAL) ^ ((v) << 8)) & (0xf8000 + RISCV_CMD_VAL)) == 0) + +#if 1 + #define RISCV_CHECK_2(v, r) \ + ((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL | 8)) \ + << 18) \ + < ((r) & 0x1d)) +#else + // this branch gives larger code, because + // compilers generate larger code for big constants. 
+ #define RISCV_CHECK_2(v, r) \ + ((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \ + & ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \ + < ((r) & 0x1d)) +#endif + + +#define RISCV_SCAN_LOOP \ + Byte *lim; \ + size &= ~(SizeT)(RISCV_INSTR_SIZE - 1); \ + if (size <= 6) return p; \ + size -= 6; \ + lim = p + size; \ + BR_PC_INIT \ + for (;;) \ + { \ + UInt32 a, v; \ + /* Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE */ \ + for (;;) \ + { \ + if Z7_UNLIKELY(p >= lim) { return p; } \ + a = (RISCV_LOAD_VAL(p) ^ 0x10u) + 1; \ + if ((a & 0x77) == 0) break; \ + a = (RISCV_LOAD_VAL(p + RISCV_INSTR_SIZE) ^ 0x10u) + 1; \ + p += RISCV_INSTR_SIZE * 2; \ + if ((a & 0x77) == 0) \ + { \ + p -= RISCV_INSTR_SIZE; \ + if Z7_UNLIKELY(p >= lim) { return p; } \ + break; \ + } \ + } +// (xx6f ^ 10) + 1 = xx7f + 1 = xx80 : JAL +// (xxef ^ 10) + 1 = xxff + 1 = xx00 + 100 : JAL +// (xx17 ^ 10) + 1 = xx07 + 1 = xx08 : AUIPC +// (xx97 ^ 10) + 1 = xx87 + 1 = xx88 : AUIPC + +Byte * Z7_BRANCH_CONV_ENC(RISCV)(Byte *p, SizeT size, UInt32 pc) { - Byte *p; - const Byte *lim; - size &= ~(size_t)3; - ip -= 4; - p = data; - lim = data + size; + RISCV_SCAN_LOOP + v = a; + a = RISCV_GET_UI32(p); +#ifndef RISCV_USE_16BIT_LOAD + v += (UInt32)p[1] << 8; +#endif - for (;;) - { - for (;;) + if ((v & 8) == 0) // JAL { - if (p >= lim) - return p - data; - /* - v = GetBe32(p); - p += 4; - m = v + ((UInt32)5 << 29); - m ^= (UInt32)7 << 29; - m += (UInt32)1 << 22; - if ((m & ((UInt32)0x1FF << 23)) == 0) - break; - */ + if ((v - (0x100 /* - RISCV_DELTA_7F */)) & 0xd80) + { + p += RISCV_INSTR_SIZE; + continue; + } + { + v = ((a & 1u << 31) >> 11) + | ((a & 0x3ff << 21) >> 20) + | ((a & 1 << 20) >> 9) + | (a & 0xff << 12); + BR_CONVERT_VAL_ENC(v) + // ((v & 1) == 0) + // v: bits [1 : 20] contain offset bits +#if 0 && defined(RISCV_USE_UNALIGNED_LOAD) + a &= 0xfff; + a |= ((UInt32)(v << 23)) + | ((UInt32)(v << 7) & ((UInt32)0xff << 16)) + | ((UInt32)(v >> 5) & ((UInt32)0xf0 << 8)); + RISCV_SET_UI32(p, a) +#else // aligned +#if 0 + SetUi16a(p, (UInt16)(((v >> 5) & 0xf000) | (a & 0xfff))) +#else + p[1] = (Byte)(((v >> 13) & 0xf0) | ((a >> 8) & 0xf)); +#endif + +#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) + v <<= 15; + v = Z7_BSWAP32(v); + SetUi16a(p + 2, (UInt16)v) +#else + p[2] = (Byte)(v >> 9); + p[3] = (Byte)(v >> 1); +#endif +#endif // aligned + } p += 4; - if ((p[-4] == 0x40 && (p[-3] & 0xC0) == 0) || - (p[-4] == 0x7F && (p[-3] >= 0xC0))) - break; + continue; + } // JAL + + { + // AUIPC + if (v & 0xe80) // (not x0) and (not x2) + { + const UInt32 b = RISCV_GET_UI32(p + 4); + if (RISCV_CHECK_1(v, b)) + { + { + const UInt32 temp = (b << 12) | (0x17 + RISCV_REG_VAL); + RISCV_SET_UI32(p, temp) + } + a &= 0xfffff000; + { +#if 1 + const int t = -1 >> 1; + if (t != -1) + a += (b >> 20) - ((b >> 19) & 0x1000); // arithmetic right shift emulation + else +#endif + a += (UInt32)((Int32)b >> 20); // arithmetic right shift (sign-extension). 
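          /* Both branches add the same sign-extended top-12-bit value of (b);
             the first one only emulates the arithmetic shift for compilers where
             (-1 >> 1) is not -1. Worked example for b = 0xfff00000:
               (b >> 20) - ((b >> 19) & 0x1000) = 0xfff - 0x1000 = 0xffffffff
               (UInt32)((Int32)b >> 20)         = (UInt32)(-1)   = 0xffffffff */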
+ } + BR_CONVERT_VAL_ENC(a) +#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) + a = Z7_BSWAP32(a); + RISCV_SET_UI32(p + 4, a) +#else + SetBe32(p + 4, a) +#endif + p += 8; + } + else + p += RISCV_STEP_1; + } + else + { + UInt32 r = a >> 27; + if (RISCV_CHECK_2(v, r)) + { + v = RISCV_GET_UI32(p + 4); + r = (r << 7) + 0x17 + (v & 0xfffff000); + a = (a >> 12) | (v << 20); + RISCV_SET_UI32(p, r) + RISCV_SET_UI32(p + 4, a) + p += 8; + } + else + p += RISCV_STEP_2; + } } + } // for +} + + +Byte * Z7_BRANCH_CONV_DEC(RISCV)(Byte *p, SizeT size, UInt32 pc) +{ + RISCV_SCAN_LOOP +#ifdef RISCV_USE_16BIT_LOAD + if ((a & 8) == 0) { - UInt32 v = GetBe32(p - 4); - v <<= 2; - if (encoding) - v += ip + (UInt32)(p - data); +#else + v = a; + a += (UInt32)p[1] << 8; + if ((v & 8) == 0) + { +#endif + // JAL + a -= 0x100 - RISCV_DELTA_7F; + if (a & 0xd80) + { + p += RISCV_INSTR_SIZE; + continue; + } + { + const UInt32 a_old = (a + (0xef - RISCV_DELTA_7F)) & 0xfff; +#if 0 // unaligned + a = GetUi32(p); + v = (UInt32)(a >> 23) & ((UInt32)0xff << 1) + | (UInt32)(a >> 7) & ((UInt32)0xff << 9) +#elif 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) + v = GetUi16a(p + 2); + v = Z7_BSWAP32(v) >> 15 +#else + v = (UInt32)p[3] << 1 + | (UInt32)p[2] << 9 +#endif + | (UInt32)((a & 0xf000) << 5); + BR_CONVERT_VAL_DEC(v) + a = a_old + | (v << 11 & 1u << 31) + | (v << 20 & 0x3ff << 21) + | (v << 9 & 1 << 20) + | (v & 0xff << 12); + RISCV_SET_UI32(p, a) + } + p += 4; + continue; + } // JAL + + { + // AUIPC + v = a; +#if 1 && defined(RISCV_USE_UNALIGNED_LOAD) + a = GetUi32(p); +#else + a |= (UInt32)GetUi16a(p + 2) << 16; +#endif + if ((v & 0xe80) == 0) // x0/x2 + { + const UInt32 r = a >> 27; + if (RISCV_CHECK_2(v, r)) + { + UInt32 b; +#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) + b = RISCV_GET_UI32(p + 4); + b = Z7_BSWAP32(b); +#else + b = GetBe32(p + 4); +#endif + v = a >> 12; + BR_CONVERT_VAL_DEC(b) + a = (r << 7) + 0x17; + a += (b + 0x800) & 0xfffff000; + v |= b << 20; + RISCV_SET_UI32(p, a) + RISCV_SET_UI32(p + 4, v) + p += 8; + } + else + p += RISCV_STEP_2; + } else - v -= ip + (UInt32)(p - data); - - v &= 0x01FFFFFF; - v -= (UInt32)1 << 24; - v ^= 0xFF000000; - v >>= 2; - v |= 0x40000000; - SetBe32(p - 4, v); + { + const UInt32 b = RISCV_GET_UI32(p + 4); + if (!RISCV_CHECK_1(v, b)) + p += RISCV_STEP_1; + else + { + v = (a & 0xfffff000) | (b >> 20); + a = (b << 12) | (0x17 + RISCV_REG_VAL); + RISCV_SET_UI32(p, a) + RISCV_SET_UI32(p + 4, v) + p += 8; + } + } } - } + } // for } diff --git a/src/sdk/C/Bra.h b/src/sdk/C/Bra.h index 855e37a..b47112c 100644 --- a/src/sdk/C/Bra.h +++ b/src/sdk/C/Bra.h @@ -1,64 +1,105 @@ /* Bra.h -- Branch converters for executables -2013-01-18 : Igor Pavlov : Public domain */ +2024-01-20 : Igor Pavlov : Public domain */ -#ifndef __BRA_H -#define __BRA_H +#ifndef ZIP7_INC_BRA_H +#define ZIP7_INC_BRA_H #include "7zTypes.h" EXTERN_C_BEGIN +/* #define PPC BAD_PPC_11 // for debug */ + +#define Z7_BRANCH_CONV_DEC_2(name) z7_ ## name ## _Dec +#define Z7_BRANCH_CONV_ENC_2(name) z7_ ## name ## _Enc +#define Z7_BRANCH_CONV_DEC(name) Z7_BRANCH_CONV_DEC_2(BranchConv_ ## name) +#define Z7_BRANCH_CONV_ENC(name) Z7_BRANCH_CONV_ENC_2(BranchConv_ ## name) +#define Z7_BRANCH_CONV_ST_DEC(name) z7_BranchConvSt_ ## name ## _Dec +#define Z7_BRANCH_CONV_ST_ENC(name) z7_BranchConvSt_ ## name ## _Enc + +#define Z7_BRANCH_CONV_DECL(name) Byte * name(Byte *data, SizeT size, UInt32 pc) +#define Z7_BRANCH_CONV_ST_DECL(name) Byte * name(Byte *data, SizeT size, UInt32 
pc, UInt32 *state) + +typedef Z7_BRANCH_CONV_DECL( (*z7_Func_BranchConv)); +typedef Z7_BRANCH_CONV_ST_DECL((*z7_Func_BranchConvSt)); + +#define Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL 0 +Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_DEC(X86)); +Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_ENC(X86)); + +#define Z7_BRANCH_FUNCS_DECL(name) \ +Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_DEC_2(name)); \ +Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_ENC_2(name)); + +Z7_BRANCH_FUNCS_DECL (BranchConv_ARM64) +Z7_BRANCH_FUNCS_DECL (BranchConv_ARM) +Z7_BRANCH_FUNCS_DECL (BranchConv_ARMT) +Z7_BRANCH_FUNCS_DECL (BranchConv_PPC) +Z7_BRANCH_FUNCS_DECL (BranchConv_SPARC) +Z7_BRANCH_FUNCS_DECL (BranchConv_IA64) +Z7_BRANCH_FUNCS_DECL (BranchConv_RISCV) + /* -These functions convert relative addresses to absolute addresses -in CALL instructions to increase the compression ratio. - - In: - data - data buffer - size - size of data - ip - current virtual Instruction Pinter (IP) value - state - state variable for x86 converter - encoding - 0 (for decoding), 1 (for encoding) - - Out: - state - state variable for x86 converter +These functions convert data that contain CPU instructions. +Each such function converts relative addresses to absolute addresses in some +branch instructions: CALL (in all converters) and JUMP (X86 converter only). +Such conversion allows to increase compression ratio, if we compress that data. + +There are 2 types of converters: + Byte * Conv_RISC (Byte *data, SizeT size, UInt32 pc); + Byte * ConvSt_X86(Byte *data, SizeT size, UInt32 pc, UInt32 *state); +Each Converter supports 2 versions: one for encoding +and one for decoding (_Enc/_Dec postfixes in function name). - Returns: - The number of processed bytes. If you call these functions with multiple calls, - you must start next call with first byte after block of processed bytes. +In params: + data : data buffer + size : size of data + pc : current virtual Program Counter (Instruction Pointer) value +In/Out param: + state : pointer to state variable (for X86 converter only) + +Return: + The pointer to position in (data) buffer after last byte that was processed. + If the caller calls converter again, it must call it starting with that position. + But the caller is allowed to move data in buffer. So pointer to + current processed position also will be changed for next call. + Also the caller must increase internal (pc) value for next call. +Each converter has some characteristics: Endian, Alignment, LookAhead. Type Endian Alignment LookAhead - x86 little 1 4 + X86 little 1 4 ARMT little 2 2 + RISCV little 2 6 ARM little 4 0 + ARM64 little 4 0 PPC big 4 0 SPARC big 4 0 IA64 little 16 0 - size must be >= Alignment + LookAhead, if it's not last block. - If (size < Alignment + LookAhead), converter returns 0. - - Example: + (data) must be aligned for (Alignment). + processed size can be calculated as: + SizeT processed = Conv(data, size, pc) - data; + if (processed == 0) + it means that converter needs more data for processing. + If (size < Alignment + LookAhead) + then (processed == 0) is allowed. 
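A minimal caller-side sketch of the pointer-returning interface documented here, assuming Bra.h is on the include path and the whole filtered block is in memory at once. z7_BranchConv_ARM_Enc and z7_BranchConv_ARM_Dec are the names produced by the Z7_BRANCH_FUNCS_DECL macros above; FilterArmCalls is only an illustrative wrapper name. Applying the _Dec variant with the same pc to the encoder's output restores the original bytes, which is how the decompressor undoes the filter; the in-header loop example that follows covers the streaming case where input arrives in pieces.

#include "Bra.h"

static SizeT FilterArmCalls(Byte *data, SizeT size, UInt32 pc, int encoding)
{
  /* the converters return a pointer just past the last processed byte */
  Byte *end = encoding
      ? z7_BranchConv_ARM_Enc(data, size, pc)
      : z7_BranchConv_ARM_Dec(data, size, pc);
  return (SizeT)(end - data);   /* number of processed bytes, as described above */
}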
- UInt32 ip = 0; - for () - { - ; size must be >= Alignment + LookAhead, if it's not last block - SizeT processed = Convert(data, size, ip, 1); - data += processed; - size -= processed; - ip += processed; - } +Example code for conversion in loop: + UInt32 pc = 0; + size = 0; + for (;;) + { + size += Load_more_input_data(data + size); + SizeT processed = Conv(data, size, pc) - data; + if (processed == 0 && no_more_input_data_after_size) + break; // we stop convert loop + data += processed; + size -= processed; + pc += processed; + } */ -#define x86_Convert_Init(state) { state = 0; } -SizeT x86_Convert(Byte *data, SizeT size, UInt32 ip, UInt32 *state, int encoding); -SizeT ARM_Convert(Byte *data, SizeT size, UInt32 ip, int encoding); -SizeT ARMT_Convert(Byte *data, SizeT size, UInt32 ip, int encoding); -SizeT PPC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding); -SizeT SPARC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding); -SizeT IA64_Convert(Byte *data, SizeT size, UInt32 ip, int encoding); - EXTERN_C_END #endif diff --git a/src/sdk/C/Bra86.c b/src/sdk/C/Bra86.c index 93ed4d7..d81f392 100644 --- a/src/sdk/C/Bra86.c +++ b/src/sdk/C/Bra86.c @@ -1,82 +1,187 @@ -/* Bra86.c -- Converter for x86 code (BCJ) -2017-04-03 : Igor Pavlov : Public domain */ +/* Bra86.c -- Branch converter for X86 code (BCJ) +2023-04-02 : Igor Pavlov : Public domain */ #include "Precomp.h" #include "Bra.h" +#include "CpuArch.h" -#define Test86MSByte(b) ((((b) + 1) & 0xFE) == 0) -SizeT x86_Convert(Byte *data, SizeT size, UInt32 ip, UInt32 *state, int encoding) +#if defined(MY_CPU_SIZEOF_POINTER) \ + && ( MY_CPU_SIZEOF_POINTER == 4 \ + || MY_CPU_SIZEOF_POINTER == 8) + #define BR_CONV_USE_OPT_PC_PTR +#endif + +#ifdef BR_CONV_USE_OPT_PC_PTR +#define BR_PC_INIT pc -= (UInt32)(SizeT)p; // (MY_uintptr_t) +#define BR_PC_GET (pc + (UInt32)(SizeT)p) +#else +#define BR_PC_INIT pc += (UInt32)size; +#define BR_PC_GET (pc - (UInt32)(SizeT)(lim - p)) +// #define BR_PC_INIT +// #define BR_PC_GET (pc + (UInt32)(SizeT)(p - data)) +#endif + +#define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c; +// #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c; + +#define Z7_BRANCH_CONV_ST(name) z7_BranchConvSt_ ## name + +#define BR86_NEED_CONV_FOR_MS_BYTE(b) ((((b) + 1) & 0xfe) == 0) + +#ifdef MY_CPU_LE_UNALIGN + #define BR86_PREPARE_BCJ_SCAN const UInt32 v = GetUi32(p) ^ 0xe8e8e8e8; + #define BR86_IS_BCJ_BYTE(n) ((v & ((UInt32)0xfe << (n) * 8)) == 0) +#else + #define BR86_PREPARE_BCJ_SCAN + // bad for MSVC X86 (partial write to byte reg): + #define BR86_IS_BCJ_BYTE(n) ((p[n - 4] & 0xfe) == 0xe8) + // bad for old MSVC (partial write to byte reg): + // #define BR86_IS_BCJ_BYTE(n) (((*p ^ 0xe8) & 0xfe) == 0) +#endif + +static +Z7_FORCE_INLINE +Z7_ATTRIB_NO_VECTOR +Byte *Z7_BRANCH_CONV_ST(X86)(Byte *p, SizeT size, UInt32 pc, UInt32 *state, int encoding) { - SizeT pos = 0; - UInt32 mask = *state & 7; if (size < 5) - return 0; - size -= 4; - ip += 5; + return p; + { + // Byte *p = data; + const Byte *lim = p + size - 4; + unsigned mask = (unsigned)*state; // & 7; +#ifdef BR_CONV_USE_OPT_PC_PTR + /* if BR_CONV_USE_OPT_PC_PTR is defined: we need to adjust (pc) for (+4), + because call/jump offset is relative to the next instruction. + if BR_CONV_USE_OPT_PC_PTR is not defined : we don't need to adjust (pc) for (+4), + because BR_PC_GET uses (pc - (lim - p)), and lim was adjusted for (-4) before. 
+ */ + pc += 4; +#endif + BR_PC_INIT + goto start; - for (;;) + for (;; mask |= 4) { - Byte *p = data + pos; - const Byte *limit = data + size; - for (; p < limit; p++) - if ((*p & 0xFE) == 0xE8) - break; - + // cont: mask |= 4; + start: + if (p >= lim) + goto fin; { - SizeT d = (SizeT)(p - data - pos); - pos = (SizeT)(p - data); - if (p >= limit) - { - *state = (d > 2 ? 0 : mask >> (unsigned)d); - return pos; - } - if (d > 2) - mask = 0; - else - { - mask >>= (unsigned)d; - if (mask != 0 && (mask > 4 || mask == 3 || Test86MSByte(p[(size_t)(mask >> 1) + 1]))) - { - mask = (mask >> 1) | 4; - pos++; - continue; - } - } + BR86_PREPARE_BCJ_SCAN + p += 4; + if (BR86_IS_BCJ_BYTE(0)) { goto m0; } mask >>= 1; + if (BR86_IS_BCJ_BYTE(1)) { goto m1; } mask >>= 1; + if (BR86_IS_BCJ_BYTE(2)) { goto m2; } mask = 0; + if (BR86_IS_BCJ_BYTE(3)) { goto a3; } } + goto main_loop; - if (Test86MSByte(p[4])) + m0: p--; + m1: p--; + m2: p--; + if (mask == 0) + goto a3; + if (p > lim) + goto fin_p; + + // if (((0x17u >> mask) & 1) == 0) + if (mask > 4 || mask == 3) + { + mask >>= 1; + continue; // goto cont; + } + mask >>= 1; + if (BR86_NEED_CONV_FOR_MS_BYTE(p[mask])) + continue; // goto cont; + // if (!BR86_NEED_CONV_FOR_MS_BYTE(p[3])) continue; // goto cont; { - UInt32 v = ((UInt32)p[4] << 24) | ((UInt32)p[3] << 16) | ((UInt32)p[2] << 8) | ((UInt32)p[1]); - UInt32 cur = ip + (UInt32)pos; - pos += 5; - if (encoding) - v += cur; - else - v -= cur; - if (mask != 0) + UInt32 v = GetUi32(p); + UInt32 c; + v += (1 << 24); if (v & 0xfe000000) continue; // goto cont; + c = BR_PC_GET; + BR_CONVERT_VAL(v, c) { - unsigned sh = (mask & 6) << 2; - if (Test86MSByte((Byte)(v >> sh))) + mask <<= 3; + if (BR86_NEED_CONV_FOR_MS_BYTE(v >> mask)) { - v ^= (((UInt32)0x100 << sh) - 1); - if (encoding) - v += cur; - else - v -= cur; + v ^= (((UInt32)0x100 << mask) - 1); + #ifdef MY_CPU_X86 + // for X86 : we can recalculate (c) to reduce register pressure + c = BR_PC_GET; + #endif + BR_CONVERT_VAL(v, c) } mask = 0; } - p[1] = (Byte)v; - p[2] = (Byte)(v >> 8); - p[3] = (Byte)(v >> 16); - p[4] = (Byte)(0 - ((v >> 24) & 1)); + // v = (v & ((1 << 24) - 1)) - (v & (1 << 24)); + v &= (1 << 25) - 1; v -= (1 << 24); + SetUi32(p, v) + p += 4; + goto main_loop; } - else + + main_loop: + if (p >= lim) + goto fin; + for (;;) { - mask = (mask >> 1) | 4; - pos++; + BR86_PREPARE_BCJ_SCAN + p += 4; + if (BR86_IS_BCJ_BYTE(0)) { goto a0; } + if (BR86_IS_BCJ_BYTE(1)) { goto a1; } + if (BR86_IS_BCJ_BYTE(2)) { goto a2; } + if (BR86_IS_BCJ_BYTE(3)) { goto a3; } + if (p >= lim) + goto fin; + } + + a0: p--; + a1: p--; + a2: p--; + a3: + if (p > lim) + goto fin_p; + // if (!BR86_NEED_CONV_FOR_MS_BYTE(p[3])) continue; // goto cont; + { + UInt32 v = GetUi32(p); + UInt32 c; + v += (1 << 24); if (v & 0xfe000000) continue; // goto cont; + c = BR_PC_GET; + BR_CONVERT_VAL(v, c) + // v = (v & ((1 << 24) - 1)) - (v & (1 << 24)); + v &= (1 << 25) - 1; v -= (1 << 24); + SetUi32(p, v) + p += 4; + goto main_loop; } } + +fin_p: + p--; +fin: + // the following processing for tail is optional and can be commented + /* + lim += 4; + for (; p < lim; p++, mask >>= 1) + if ((*p & 0xfe) == 0xe8) + break; + */ + *state = (UInt32)mask; + return p; + } } + + +#define Z7_BRANCH_CONV_ST_FUNC_IMP(name, m, encoding) \ +Z7_NO_INLINE \ +Z7_ATTRIB_NO_VECTOR \ +Byte *m(name)(Byte *data, SizeT size, UInt32 pc, UInt32 *state) \ + { return Z7_BRANCH_CONV_ST(name)(data, size, pc, state, encoding); } + +Z7_BRANCH_CONV_ST_FUNC_IMP(X86, Z7_BRANCH_CONV_ST_DEC, 0) +#ifndef Z7_EXTRACT_ONLY 
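The x86 converter above is heavily optimized; here is a deliberately simplified, self-contained sketch of the same BCJ idea in portable C: scan for the 0xE8 near-CALL opcode and convert the 32-bit little-endian displacement that follows it between relative and absolute form. It omits the state mask, the most-significant-byte heuristics and the four-bytes-at-a-time scan that the real converter uses, so it illustrates the principle rather than replacing the SDK code; toy_bcj_x86 is not an SDK function.

#include <stddef.h>
#include <stdint.h>

static void toy_bcj_x86(uint8_t *data, size_t size, uint32_t pc, int encoding)
{
  size_t i = 0;
  while (i + 5 <= size)
  {
    if (data[i] != 0xE8)            /* 0xE8 = near CALL with 32-bit displacement */
    {
      i++;
      continue;
    }
    {
      uint32_t v = (uint32_t)data[i + 1]
                 | ((uint32_t)data[i + 2] << 8)
                 | ((uint32_t)data[i + 3] << 16)
                 | ((uint32_t)data[i + 4] << 24);
      /* the displacement is relative to the next instruction */
      uint32_t next = pc + (uint32_t)i + 5;
      if (encoding) v += next;      /* relative -> absolute before compression */
      else          v -= next;      /* absolute -> relative after decompression */
      data[i + 1] = (uint8_t)v;
      data[i + 2] = (uint8_t)(v >> 8);
      data[i + 3] = (uint8_t)(v >> 16);
      data[i + 4] = (uint8_t)(v >> 24);
      i += 5;
    }
  }
}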
+Z7_BRANCH_CONV_ST_FUNC_IMP(X86, Z7_BRANCH_CONV_ST_ENC, 1) +#endif diff --git a/src/sdk/C/BraIA64.c b/src/sdk/C/BraIA64.c index d1dbc62..9dfe3e2 100644 --- a/src/sdk/C/BraIA64.c +++ b/src/sdk/C/BraIA64.c @@ -1,53 +1,14 @@ /* BraIA64.c -- Converter for IA-64 code -2017-01-26 : Igor Pavlov : Public domain */ +2023-02-20 : Igor Pavlov : Public domain */ #include "Precomp.h" -#include "CpuArch.h" -#include "Bra.h" +// the code was moved to Bra.c -SizeT IA64_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) -{ - SizeT i; - if (size < 16) - return 0; - size -= 16; - i = 0; - do - { - unsigned m = ((UInt32)0x334B0000 >> (data[i] & 0x1E)) & 3; - if (m) - { - m++; - do - { - Byte *p = data + (i + (size_t)m * 5 - 8); - if (((p[3] >> m) & 15) == 5 - && (((p[-1] | ((UInt32)p[0] << 8)) >> m) & 0x70) == 0) - { - unsigned raw = GetUi32(p); - unsigned v = raw >> m; - v = (v & 0xFFFFF) | ((v & (1 << 23)) >> 3); - - v <<= 4; - if (encoding) - v += ip + (UInt32)i; - else - v -= ip + (UInt32)i; - v >>= 4; - - v &= 0x1FFFFF; - v += 0x700000; - v &= 0x8FFFFF; - raw &= ~((UInt32)0x8FFFFF << m); - raw |= (v << m); - SetUi32(p, raw); - } - } - while (++m <= 4); - } - i += 16; - } - while (i <= size); - return i; -} +#ifdef _MSC_VER +#pragma warning(disable : 4206) // nonstandard extension used : translation unit is empty +#endif + +#if defined(__clang__) +#pragma GCC diagnostic ignored "-Wempty-translation-unit" +#endif diff --git a/src/sdk/C/Compiler.h b/src/sdk/C/Compiler.h index 0cc409d..b266b27 100644 --- a/src/sdk/C/Compiler.h +++ b/src/sdk/C/Compiler.h @@ -1,8 +1,105 @@ -/* Compiler.h -2017-04-03 : Igor Pavlov : Public domain */ +/* Compiler.h : Compiler specific defines and pragmas +: Igor Pavlov : Public domain */ + +#ifndef ZIP7_INC_COMPILER_H +#define ZIP7_INC_COMPILER_H + +#if defined(__clang__) +# define Z7_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) +#endif +#if defined(__clang__) && defined(__apple_build_version__) +# define Z7_APPLE_CLANG_VERSION Z7_CLANG_VERSION +#elif defined(__clang__) +# define Z7_LLVM_CLANG_VERSION Z7_CLANG_VERSION +#elif defined(__GNUC__) +# define Z7_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#endif + +#ifdef _MSC_VER +#if !defined(__clang__) && !defined(__GNUC__) +#define Z7_MSC_VER_ORIGINAL _MSC_VER +#endif +#endif + +#if defined(__MINGW32__) || defined(__MINGW64__) +#define Z7_MINGW +#endif + +#if defined(__LCC__) && (defined(__MCST__) || defined(__e2k__)) +#define Z7_MCST_LCC +#define Z7_MCST_LCC_VERSION (__LCC__ * 100 + __LCC_MINOR__) +#endif + +/* +#if defined(__AVX2__) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ + || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \ + || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \ + || defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \ + || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400) + #define Z7_COMPILER_AVX2_SUPPORTED + #endif +#endif +*/ + +// #pragma GCC diagnostic ignored "-Wunknown-pragmas" + +#ifdef __clang__ +// padding size of '' with 4 bytes to alignment boundary +#pragma GCC diagnostic ignored "-Wpadded" + +#if defined(Z7_LLVM_CLANG_VERSION) && (__clang_major__ == 13) \ + && defined(__FreeBSD__) +// freebsd: +#pragma GCC diagnostic ignored "-Wexcess-padding" +#endif + +#if __clang_major__ >= 16 +#pragma GCC diagnostic ignored "-Wunsafe-buffer-usage" +#endif + +#if __clang_major__ == 13 +#if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) +// 
cheri +#pragma GCC diagnostic ignored "-Wcapability-to-integer-cast" +#endif +#endif + +#if __clang_major__ == 13 + // for + #pragma GCC diagnostic ignored "-Wreserved-identifier" +#endif + +#endif // __clang__ + +#if defined(_WIN32) && defined(__clang__) && __clang_major__ >= 16 +// #pragma GCC diagnostic ignored "-Wcast-function-type-strict" +#define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION \ + _Pragma("GCC diagnostic ignored \"-Wcast-function-type-strict\"") +#else +#define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION +#endif + +typedef void (*Z7_void_Function)(void); +#if defined(__clang__) || defined(__GNUC__) +#define Z7_CAST_FUNC_C (Z7_void_Function) +#elif defined(_MSC_VER) && _MSC_VER > 1920 +#define Z7_CAST_FUNC_C (void *) +// #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' +#else +#define Z7_CAST_FUNC_C +#endif +/* +#if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__) + // #pragma GCC diagnostic ignored "-Wcast-function-type" +#endif +*/ +#ifdef __GNUC__ +#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40000) && (Z7_GCC_VERSION < 70000) +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif +#endif -#ifndef __7Z_COMPILER_H -#define __7Z_COMPILER_H #ifdef _MSC_VER @@ -13,18 +110,134 @@ #pragma warning(disable : 4214) // nonstandard extension used : bit field types other than int #endif - #if _MSC_VER >= 1300 - #pragma warning(disable : 4996) // This function or variable may be unsafe - #else - #pragma warning(disable : 4511) // copy constructor could not be generated - #pragma warning(disable : 4512) // assignment operator could not be generated - #pragma warning(disable : 4514) // unreferenced inline function has been removed - #pragma warning(disable : 4702) // unreachable code - #pragma warning(disable : 4710) // not inlined - #pragma warning(disable : 4714) // function marked as __forceinline not inlined - #pragma warning(disable : 4786) // identifier was truncated to '255' characters in the debug information - #endif +#if defined(_MSC_VER) && _MSC_VER >= 1800 +#pragma warning(disable : 4464) // relative include path contains '..' 
+#endif + +// == 1200 : -O1 : for __forceinline +// >= 1900 : -O1 : for printf +#pragma warning(disable : 4710) // function not inlined + +#if _MSC_VER < 1900 +// winnt.h: 'Int64ShllMod32' +#pragma warning(disable : 4514) // unreferenced inline function has been removed +#endif + +#if _MSC_VER < 1300 +// #pragma warning(disable : 4702) // unreachable code +// Bra.c : -O1: +#pragma warning(disable : 4714) // function marked as __forceinline not inlined +#endif + +/* +#if _MSC_VER > 1400 && _MSC_VER <= 1900 +// strcat: This function or variable may be unsafe +// sysinfoapi.h: kit10: GetVersion was declared deprecated +#pragma warning(disable : 4996) +#endif +*/ + +#if _MSC_VER > 1200 +// -Wall warnings + +#pragma warning(disable : 4711) // function selected for automatic inline expansion +#pragma warning(disable : 4820) // '2' bytes padding added after data member + +#if _MSC_VER >= 1400 && _MSC_VER < 1920 +// 1400: string.h: _DBG_MEMCPY_INLINE_ +// 1600 - 191x : smmintrin.h __cplusplus' +// is not defined as a preprocessor macro, replacing with '0' for '#if/#elif' +#pragma warning(disable : 4668) + +// 1400 - 1600 : WinDef.h : 'FARPROC' : +// 1900 - 191x : immintrin.h: _readfsbase_u32 +// no function prototype given : converting '()' to '(void)' +#pragma warning(disable : 4255) +#endif + +#if _MSC_VER >= 1914 +// Compiler will insert Spectre mitigation for memory load if /Qspectre switch specified +#pragma warning(disable : 5045) +#endif + +#endif // _MSC_VER > 1200 +#endif // _MSC_VER + + +#if defined(__clang__) && (__clang_major__ >= 4) + #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \ + _Pragma("clang loop unroll(disable)") \ + _Pragma("clang loop vectorize(disable)") + #define Z7_ATTRIB_NO_VECTORIZE +#elif defined(__GNUC__) && (__GNUC__ >= 5) \ + && (!defined(Z7_MCST_LCC_VERSION) || (Z7_MCST_LCC_VERSION >= 12610)) + #define Z7_ATTRIB_NO_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) + // __attribute__((optimize("no-unroll-loops"))); + #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE +#elif defined(_MSC_VER) && (_MSC_VER >= 1920) + #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \ + _Pragma("loop( no_vector )") + #define Z7_ATTRIB_NO_VECTORIZE +#else + #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + #define Z7_ATTRIB_NO_VECTORIZE +#endif + +#if defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1920) + #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE _Pragma("optimize ( \"s\", on )") + #define Z7_PRAGMA_OPTIMIZE_DEFAULT _Pragma("optimize ( \"\", on )") +#else + #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE + #define Z7_PRAGMA_OPTIMIZE_DEFAULT +#endif + + + +#if defined(MY_CPU_X86_OR_AMD64) && ( \ + defined(__clang__) && (__clang_major__ >= 4) \ + || defined(__GNUC__) && (__GNUC__ >= 5)) + #define Z7_ATTRIB_NO_SSE __attribute__((__target__("no-sse"))) +#else + #define Z7_ATTRIB_NO_SSE +#endif + +#define Z7_ATTRIB_NO_VECTOR \ + Z7_ATTRIB_NO_VECTORIZE \ + Z7_ATTRIB_NO_SSE + + +#if defined(__clang__) && (__clang_major__ >= 8) \ + || defined(__GNUC__) && (__GNUC__ >= 1000) \ + /* || defined(_MSC_VER) && (_MSC_VER >= 1920) */ + // GCC is not good for __builtin_expect() + #define Z7_LIKELY(x) (__builtin_expect((x), 1)) + #define Z7_UNLIKELY(x) (__builtin_expect((x), 0)) + // #define Z7_unlikely [[unlikely]] + // #define Z7_likely [[likely]] +#else + #define Z7_LIKELY(x) (x) + #define Z7_UNLIKELY(x) (x) + // #define Z7_likely +#endif + + +#if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30600)) + +#if (Z7_CLANG_VERSION < 130000) +#define 
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wreserved-id-macro\"") +#else +#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wreserved-macro-identifier\"") +#endif +#define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER \ + _Pragma("GCC diagnostic pop") +#else +#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +#define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER #endif #define UNUSED_VAR(x) (void)x; diff --git a/src/sdk/C/CpuArch.c b/src/sdk/C/CpuArch.c index 02e482e..6e02551 100644 --- a/src/sdk/C/CpuArch.c +++ b/src/sdk/C/CpuArch.c @@ -1,144 +1,357 @@ /* CpuArch.c -- CPU specific code -2018-02-18: Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #include "Precomp.h" +// #include + #include "CpuArch.h" #ifdef MY_CPU_X86_OR_AMD64 -#if (defined(_MSC_VER) && !defined(MY_CPU_AMD64)) || defined(__GNUC__) -#define USE_ASM +#undef NEED_CHECK_FOR_CPUID +#if !defined(MY_CPU_AMD64) +#define NEED_CHECK_FOR_CPUID #endif -#if !defined(USE_ASM) && _MSC_VER >= 1500 -#include +/* + cpuid instruction supports (subFunction) parameter in ECX, + that is used only with some specific (function) parameter values. + most functions use only (subFunction==0). +*/ +/* + __cpuid(): MSVC and GCC/CLANG use same function/macro name + but parameters are different. + We use MSVC __cpuid() parameters style for our z7_x86_cpuid() function. +*/ + +#if defined(__GNUC__) /* && (__GNUC__ >= 10) */ \ + || defined(__clang__) /* && (__clang_major__ >= 10) */ + +/* there was some CLANG/GCC compilers that have issues with + rbx(ebx) handling in asm blocks in -fPIC mode (__PIC__ is defined). + compiler's contains the macro __cpuid() that is similar to our code. + The history of __cpuid() changes in CLANG/GCC: + GCC: + 2007: it preserved ebx for (__PIC__ && __i386__) + 2013: it preserved rbx and ebx for __PIC__ + 2014: it doesn't preserves rbx and ebx anymore + we suppose that (__GNUC__ >= 5) fixed that __PIC__ ebx/rbx problem. + CLANG: + 2014+: it preserves rbx, but only for 64-bit code. No __PIC__ check. + Why CLANG cares about 64-bit mode only, and doesn't care about ebx (in 32-bit)? + Do we need __PIC__ test for CLANG or we must care about rbx even if + __PIC__ is not defined? +*/ + +#define ASM_LN "\n" + +#if defined(MY_CPU_AMD64) && defined(__PIC__) \ + && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) + + /* "=&r" selects free register. It can select even rbx, if that register is free. + "=&D" for (RDI) also works, but the code can be larger with "=&D" + "2"(subFun) : 2 is (zero-based) index in the output constraint list "=c" (ECX). 
*/ + +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ + __asm__ __volatile__ ( \ + ASM_LN "mov %%rbx, %q1" \ + ASM_LN "cpuid" \ + ASM_LN "xchg %%rbx, %q1" \ + : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } + +#elif defined(MY_CPU_X86) && defined(__PIC__) \ + && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) + +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ + __asm__ __volatile__ ( \ + ASM_LN "mov %%ebx, %k1" \ + ASM_LN "cpuid" \ + ASM_LN "xchg %%ebx, %k1" \ + : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } + +#else + +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ + __asm__ __volatile__ ( \ + ASM_LN "cpuid" \ + : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } + #endif -#if defined(USE_ASM) && !defined(MY_CPU_AMD64) -static UInt32 CheckFlag(UInt32 flag) +#define x86_cpuid_MACRO(p, func) x86_cpuid_MACRO_2(p, func, 0) + +void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) { - #ifdef _MSC_VER - __asm pushfd; - __asm pop EAX; - __asm mov EDX, EAX; - __asm xor EAX, flag; - __asm push EAX; - __asm popfd; - __asm pushfd; - __asm pop EAX; - __asm xor EAX, EDX; - __asm push EDX; - __asm popfd; - __asm and flag, EAX; - #else - __asm__ __volatile__ ( - "pushf\n\t" - "pop %%EAX\n\t" - "movl %%EAX,%%EDX\n\t" - "xorl %0,%%EAX\n\t" - "push %%EAX\n\t" - "popf\n\t" - "pushf\n\t" - "pop %%EAX\n\t" - "xorl %%EDX,%%EAX\n\t" - "push %%EDX\n\t" - "popf\n\t" - "andl %%EAX, %0\n\t": - "=c" (flag) : "c" (flag) : - "%eax", "%edx"); - #endif - return flag; + x86_cpuid_MACRO(p, func) } -#define CHECK_CPUID_IS_SUPPORTED if (CheckFlag(1 << 18) == 0 || CheckFlag(1 << 21) == 0) return False; -#else -#define CHECK_CPUID_IS_SUPPORTED -#endif -void MyCPUID(UInt32 function, UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d) +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) { - #ifdef USE_ASM + x86_cpuid_MACRO_2(p, func, subFunc) +} - #ifdef _MSC_VER - UInt32 a2, b2, c2, d2; - __asm xor EBX, EBX; - __asm xor ECX, ECX; - __asm xor EDX, EDX; - __asm mov EAX, function; - __asm cpuid; - __asm mov a2, EAX; - __asm mov b2, EBX; - __asm mov c2, ECX; - __asm mov d2, EDX; +Z7_NO_INLINE +UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) +{ + #if defined(NEED_CHECK_FOR_CPUID) + #define EFALGS_CPUID_BIT 21 + UInt32 a; + __asm__ __volatile__ ( + ASM_LN "pushf" + ASM_LN "pushf" + ASM_LN "pop %0" + // ASM_LN "movl %0, %1" + // ASM_LN "xorl $0x200000, %0" + ASM_LN "btc %1, %0" + ASM_LN "push %0" + ASM_LN "popf" + ASM_LN "pushf" + ASM_LN "pop %0" + ASM_LN "xorl (%%esp), %0" - *a = a2; - *b = b2; - *c = c2; - *d = d2; + ASM_LN "popf" + ASM_LN + : "=&r" (a) // "=a" + : "i" (EFALGS_CPUID_BIT) + ); + if ((a & (1 << EFALGS_CPUID_BIT)) == 0) + return 0; + #endif + { + UInt32 p[4]; + x86_cpuid_MACRO(p, 0) + return p[0]; + } +} - #else +#undef ASM_LN - __asm__ __volatile__ ( - #if defined(MY_CPU_AMD64) && defined(__PIC__) - "mov %%rbx, %%rdi;" - "cpuid;" - "xchg %%rbx, %%rdi;" - : "=a" (*a) , - "=D" (*b) , - #elif defined(MY_CPU_X86) && defined(__PIC__) - "mov %%ebx, %%edi;" - "cpuid;" - "xchgl %%ebx, %%edi;" - : "=a" (*a) , - "=D" (*b) , - #else - "cpuid" - : "=a" (*a) , - "=b" (*b) , - #endif - "=c" (*c) , - "=d" (*d) - : "0" (function)) ; +#elif !defined(_MSC_VER) - #endif - - #else +/* +// for gcc/clang and other: we can try to use __cpuid macro: +#include +void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) +{ + __cpuid(func, p[0], p[1], p[2], p[3]); +} 
+UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) +{ + return (UInt32)__get_cpuid_max(0, NULL); +} +*/ +// for unsupported cpuid: +void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) +{ + UNUSED_VAR(func) + p[0] = p[1] = p[2] = p[3] = 0; +} +UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) +{ + return 0; +} + +#else // _MSC_VER - int CPUInfo[4]; - __cpuid(CPUInfo, function); - *a = CPUInfo[0]; - *b = CPUInfo[1]; - *c = CPUInfo[2]; - *d = CPUInfo[3]; +#if !defined(MY_CPU_AMD64) +UInt32 __declspec(naked) Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) +{ + #if defined(NEED_CHECK_FOR_CPUID) + #define EFALGS_CPUID_BIT 21 + __asm pushfd + __asm pushfd + /* + __asm pop eax + // __asm mov edx, eax + __asm btc eax, EFALGS_CPUID_BIT + __asm push eax + */ + __asm btc dword ptr [esp], EFALGS_CPUID_BIT + __asm popfd + __asm pushfd + __asm pop eax + // __asm xor eax, edx + __asm xor eax, [esp] + // __asm push edx + __asm popfd + __asm and eax, (1 shl EFALGS_CPUID_BIT) + __asm jz end_func + #endif + __asm push ebx + __asm xor eax, eax // func + __asm xor ecx, ecx // subFunction (optional) for (func == 0) + __asm cpuid + __asm pop ebx + #if defined(NEED_CHECK_FOR_CPUID) + end_func: #endif + __asm ret 0 } -BoolInt x86cpuid_CheckAndRead(Cx86cpuid *p) +void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) +{ + UNUSED_VAR(p) + UNUSED_VAR(func) + __asm push ebx + __asm push edi + __asm mov edi, ecx // p + __asm mov eax, edx // func + __asm xor ecx, ecx // subfunction (optional) for (func == 0) + __asm cpuid + __asm mov [edi ], eax + __asm mov [edi + 4], ebx + __asm mov [edi + 8], ecx + __asm mov [edi + 12], edx + __asm pop edi + __asm pop ebx + __asm ret 0 +} + +static +void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + UNUSED_VAR(p) + UNUSED_VAR(func) + UNUSED_VAR(subFunc) + __asm push ebx + __asm push edi + __asm mov edi, ecx // p + __asm mov eax, edx // func + __asm mov ecx, [esp + 12] // subFunc + __asm cpuid + __asm mov [edi ], eax + __asm mov [edi + 4], ebx + __asm mov [edi + 8], ecx + __asm mov [edi + 12], edx + __asm pop edi + __asm pop ebx + __asm ret 4 +} + +#else // MY_CPU_AMD64 + + #if _MSC_VER >= 1600 + #include + #define MY_cpuidex __cpuidex + +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + __cpuidex((int *)p, func, subFunc); +} + + #else +/* + __cpuid (func == (0 or 7)) requires subfunction number in ECX. + MSDN: The __cpuid intrinsic clears the ECX register before calling the cpuid instruction. + __cpuid() in new MSVC clears ECX. + __cpuid() in old MSVC (14.00) x64 doesn't clear ECX + We still can use __cpuid for low (func) values that don't require ECX, + but __cpuid() in old MSVC will be incorrect for some func values: (func == 7). + So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction, + where ECX value is first parameter for FASTCALL / NO_INLINE func. + So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and + old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value. + +DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!! 
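A small usage sketch: because z7_x86_cpuid() follows the MSVC register layout (p[0..3] = EAX, EBX, ECX, EDX), the leaf-0 vendor string is assembled from p[1], p[3], p[2], and leaf-1 feature flags are read from p[2]/p[3] (the SSE2 test further down, for instance, checks EDX bit 26). This assumes the z7_x86_cpuid* prototypes are exposed through CpuArch.h, as in the SDK headers; print_cpu_vendor_and_sse2 is only an illustrative name.

#include <stdio.h>
#include <string.h>
#include "CpuArch.h"

static void print_cpu_vendor_and_sse2(void)
{
  UInt32 p[4];
  char vendor[13];
  if (z7_x86_cpuid_GetMaxFunc() == 0)
    return;                        /* CPUID instruction not available */
  z7_x86_cpuid(p, 0);
  memcpy(vendor + 0, &p[1], 4);    /* EBX : e.g. "Genu" */
  memcpy(vendor + 4, &p[3], 4);    /* EDX : e.g. "ineI" */
  memcpy(vendor + 8, &p[2], 4);    /* ECX : e.g. "ntel" */
  vendor[12] = 0;
  z7_x86_cpuid(p, 1);
  printf("vendor=%s sse2=%u\n", vendor, (unsigned)((p[3] >> 26) & 1));
}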
+*/ +static +Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int32 *CPUInfo) +{ + UNUSED_VAR(subFunction) + __cpuid(CPUInfo, func); +} + #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info) + #pragma message("======== MY_cpuidex_HACK WAS USED ========") +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + MY_cpuidex_HACK(subFunc, func, (Int32 *)p); +} + #endif // _MSC_VER >= 1600 + +#if !defined(MY_CPU_AMD64) +/* inlining for __cpuid() in MSVC x86 (32-bit) produces big ineffective code, + so we disable inlining here */ +Z7_NO_INLINE +#endif +void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) +{ + MY_cpuidex((Int32 *)p, (Int32)func, 0); +} + +Z7_NO_INLINE +UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) +{ + Int32 a[4]; + MY_cpuidex(a, 0, 0); + return a[0]; +} + +#endif // MY_CPU_AMD64 +#endif // _MSC_VER + +#if defined(NEED_CHECK_FOR_CPUID) +#define CHECK_CPUID_IS_SUPPORTED { if (z7_x86_cpuid_GetMaxFunc() == 0) return 0; } +#else +#define CHECK_CPUID_IS_SUPPORTED +#endif +#undef NEED_CHECK_FOR_CPUID + + +static +BoolInt x86cpuid_Func_1(UInt32 *p) { CHECK_CPUID_IS_SUPPORTED - MyCPUID(0, &p->maxFunc, &p->vendor[0], &p->vendor[2], &p->vendor[1]); - MyCPUID(1, &p->ver, &p->b, &p->c, &p->d); + z7_x86_cpuid(p, 1); return True; } -static const UInt32 kVendors[][3] = +/* +static const UInt32 kVendors[][1] = { - { 0x756E6547, 0x49656E69, 0x6C65746E}, - { 0x68747541, 0x69746E65, 0x444D4163}, - { 0x746E6543, 0x48727561, 0x736C7561} + { 0x756E6547 }, // , 0x49656E69, 0x6C65746E }, + { 0x68747541 }, // , 0x69746E65, 0x444D4163 }, + { 0x746E6543 } // , 0x48727561, 0x736C7561 } }; +*/ + +/* +typedef struct +{ + UInt32 maxFunc; + UInt32 vendor[3]; + UInt32 ver; + UInt32 b; + UInt32 c; + UInt32 d; +} Cx86cpuid; + +enum +{ + CPU_FIRM_INTEL, + CPU_FIRM_AMD, + CPU_FIRM_VIA +}; +int x86cpuid_GetFirm(const Cx86cpuid *p); +#define x86cpuid_ver_GetFamily(ver) (((ver >> 16) & 0xff0) | ((ver >> 8) & 0xf)) +#define x86cpuid_ver_GetModel(ver) (((ver >> 12) & 0xf0) | ((ver >> 4) & 0xf)) +#define x86cpuid_ver_GetStepping(ver) (ver & 0xf) int x86cpuid_GetFirm(const Cx86cpuid *p) { unsigned i; - for (i = 0; i < sizeof(kVendors) / sizeof(kVendors[i]); i++) + for (i = 0; i < sizeof(kVendors) / sizeof(kVendors[0]); i++) { const UInt32 *v = kVendors[i]; - if (v[0] == p->vendor[0] && - v[1] == p->vendor[1] && - v[2] == p->vendor[2]) + if (v[0] == p->vendor[0] + // && v[1] == p->vendor[1] + // && v[2] == p->vendor[2] + ) return (int)i; } return -1; @@ -147,72 +360,611 @@ int x86cpuid_GetFirm(const Cx86cpuid *p) BoolInt CPU_Is_InOrder() { Cx86cpuid p; - int firm; UInt32 family, model; if (!x86cpuid_CheckAndRead(&p)) return True; - family = x86cpuid_GetFamily(p.ver); - model = x86cpuid_GetModel(p.ver); - - firm = x86cpuid_GetFirm(&p); + family = x86cpuid_ver_GetFamily(p.ver); + model = x86cpuid_ver_GetModel(p.ver); - switch (firm) + switch (x86cpuid_GetFirm(&p)) { case CPU_FIRM_INTEL: return (family < 6 || (family == 6 && ( - /* In-Order Atom CPU */ - model == 0x1C /* 45 nm, N4xx, D4xx, N5xx, D5xx, 230, 330 */ - || model == 0x26 /* 45 nm, Z6xx */ - || model == 0x27 /* 32 nm, Z2460 */ - || model == 0x35 /* 32 nm, Z2760 */ - || model == 0x36 /* 32 nm, N2xxx, D2xxx */ + // In-Order Atom CPU + model == 0x1C // 45 nm, N4xx, D4xx, N5xx, D5xx, 230, 330 + || model == 0x26 // 45 nm, Z6xx + || model == 0x27 // 32 nm, Z2460 + || model == 0x35 // 32 nm, Z2760 + || model == 0x36 // 32 nm, N2xxx, D2xxx ))); case CPU_FIRM_AMD: return (family < 5 || 
(family == 5 && (model < 6 || model == 0xA))); case CPU_FIRM_VIA: return (family < 6 || (family == 6 && model < 0xF)); } - return True; + return False; // v23 : unknown processors are not In-Order } +*/ + +#ifdef _WIN32 +#include "7zWindows.h" +#endif #if !defined(MY_CPU_AMD64) && defined(_WIN32) -#include -static BoolInt CPU_Sys_Is_SSE_Supported() + +/* for legacy SSE ia32: there is no user-space cpu instruction to check + that OS supports SSE register storing/restoring on context switches. + So we need some OS-specific function to check that it's safe to use SSE registers. +*/ + +Z7_FORCE_INLINE +static BoolInt CPU_Sys_Is_SSE_Supported(void) { - OSVERSIONINFO vi; - vi.dwOSVersionInfoSize = sizeof(vi); - if (!GetVersionEx(&vi)) - return False; - return (vi.dwMajorVersion >= 5); +#ifdef _MSC_VER + #pragma warning(push) + #pragma warning(disable : 4996) // `GetVersion': was declared deprecated +#endif + /* low byte is major version of Windows + We suppose that any Windows version since + Windows2000 (major == 5) supports SSE registers */ + return (Byte)GetVersion() >= 5; +#if defined(_MSC_VER) + #pragma warning(pop) +#endif } #define CHECK_SYS_SSE_SUPPORT if (!CPU_Sys_Is_SSE_Supported()) return False; #else #define CHECK_SYS_SSE_SUPPORT #endif -BoolInt CPU_Is_Aes_Supported() + +#if !defined(MY_CPU_AMD64) + +BoolInt CPU_IsSupported_CMOV(void) +{ + UInt32 a[4]; + if (!x86cpuid_Func_1(&a[0])) + return 0; + return (BoolInt)(a[3] >> 15) & 1; +} + +BoolInt CPU_IsSupported_SSE(void) +{ + UInt32 a[4]; + CHECK_SYS_SSE_SUPPORT + if (!x86cpuid_Func_1(&a[0])) + return 0; + return (BoolInt)(a[3] >> 25) & 1; +} + +BoolInt CPU_IsSupported_SSE2(void) +{ + UInt32 a[4]; + CHECK_SYS_SSE_SUPPORT + if (!x86cpuid_Func_1(&a[0])) + return 0; + return (BoolInt)(a[3] >> 26) & 1; +} + +#endif + + +static UInt32 x86cpuid_Func_1_ECX(void) { - Cx86cpuid p; + UInt32 a[4]; CHECK_SYS_SSE_SUPPORT - if (!x86cpuid_CheckAndRead(&p)) + if (!x86cpuid_Func_1(&a[0])) + return 0; + return a[2]; +} + +BoolInt CPU_IsSupported_AES(void) +{ + return (BoolInt)(x86cpuid_Func_1_ECX() >> 25) & 1; +} + +BoolInt CPU_IsSupported_SSSE3(void) +{ + return (BoolInt)(x86cpuid_Func_1_ECX() >> 9) & 1; +} + +BoolInt CPU_IsSupported_SSE41(void) +{ + return (BoolInt)(x86cpuid_Func_1_ECX() >> 19) & 1; +} + +BoolInt CPU_IsSupported_SHA(void) +{ + CHECK_SYS_SSE_SUPPORT + + if (z7_x86_cpuid_GetMaxFunc() < 7) return False; - return (p.c >> 25) & 1; + { + UInt32 d[4]; + z7_x86_cpuid(d, 7); + return (BoolInt)(d[1] >> 29) & 1; + } } -BoolInt CPU_IsSupported_PageGB() + +BoolInt CPU_IsSupported_SHA512(void) { - Cx86cpuid cpuid; - if (!x86cpuid_CheckAndRead(&cpuid)) + if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here + + if (z7_x86_cpuid_GetMaxFunc() < 7) return False; { - UInt32 d[4] = { 0 }; - MyCPUID(0x80000000, &d[0], &d[1], &d[2], &d[3]); + UInt32 d[4]; + z7_x86_cpuid_subFunc(d, 7, 0); + if (d[0] < 1) // d[0] - is max supported subleaf value + return False; + z7_x86_cpuid_subFunc(d, 7, 1); + return (BoolInt)(d[0]) & 1; + } +} + +/* +MSVC: _xgetbv() intrinsic is available since VS2010SP1. + MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in + that we can use or check. + For any 32-bit x86 we can use asm code in MSVC, + but MSVC asm code is huge after compilation. + So _xgetbv() is better + +ICC: _xgetbv() intrinsic is available (in what version of ICC?) + ICC defines (__GNUC___) and it supports gnu assembler + also ICC supports MASM style code with -use-msasm switch. 
+ but ICC doesn't support __attribute__((__target__)) + +GCC/CLANG 9: + _xgetbv() is macro that works via __builtin_ia32_xgetbv() + and we need __attribute__((__target__("xsave")). + But with __target__("xsave") the function will be not + inlined to function that has no __target__("xsave") attribute. + If we want _xgetbv() call inlining, then we should use asm version + instead of calling _xgetbv(). + Note:intrinsic is broke before GCC 8.2: + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85684 +*/ + +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1100) \ + || defined(_MSC_VER) && (_MSC_VER >= 1600) && (_MSC_FULL_VER >= 160040219) \ + || defined(__GNUC__) && (__GNUC__ >= 9) \ + || defined(__clang__) && (__clang_major__ >= 9) +// we define ATTRIB_XGETBV, if we want to use predefined _xgetbv() from compiler +#if defined(__INTEL_COMPILER) +#define ATTRIB_XGETBV +#elif defined(__GNUC__) || defined(__clang__) +// we don't define ATTRIB_XGETBV here, because asm version is better for inlining. +// #define ATTRIB_XGETBV __attribute__((__target__("xsave"))) +#else +#define ATTRIB_XGETBV +#endif +#endif + +#if defined(ATTRIB_XGETBV) +#include +#endif + + +// XFEATURE_ENABLED_MASK/XCR0 +#define MY_XCR_XFEATURE_ENABLED_MASK 0 + +#if defined(ATTRIB_XGETBV) +ATTRIB_XGETBV +#endif +static UInt64 x86_xgetbv_0(UInt32 num) +{ +#if defined(ATTRIB_XGETBV) + { + return + #if (defined(_MSC_VER)) + _xgetbv(num); + #else + __builtin_ia32_xgetbv( + #if !defined(__clang__) + (int) + #endif + num); + #endif + } + +#elif defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_CC) + + UInt32 a, d; + #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) + __asm__ + ( + "xgetbv" + : "=a"(a), "=d"(d) : "c"(num) : "cc" + ); + #else // is old gcc + __asm__ + ( + ".byte 0x0f, 0x01, 0xd0" "\n\t" + : "=a"(a), "=d"(d) : "c"(num) : "cc" + ); + #endif + return ((UInt64)d << 32) | a; + // return a; + +#elif defined(_MSC_VER) && !defined(MY_CPU_AMD64) + + UInt32 a, d; + __asm { + push eax + push edx + push ecx + mov ecx, num; + // xor ecx, ecx // = MY_XCR_XFEATURE_ENABLED_MASK + _emit 0x0f + _emit 0x01 + _emit 0xd0 + mov a, eax + mov d, edx + pop ecx + pop edx + pop eax + } + return ((UInt64)d << 32) | a; + // return a; + +#else // it's unknown compiler + // #error "Need xgetbv function" + UNUSED_VAR(num) + // for MSVC-X64 we could call external function from external file. + /* Actually we had checked OSXSAVE/AVX in cpuid before. + So it's expected that OS supports at least AVX and below. */ + // if (num != MY_XCR_XFEATURE_ENABLED_MASK) return 0; // if not XCR0 + return + // (1 << 0) | // x87 + (1 << 1) // SSE + | (1 << 2); // AVX + +#endif +} + +#ifdef _WIN32 +/* + Windows versions do not know about new ISA extensions that + can be introduced. 
But we still can use new extensions, + even if Windows doesn't report about supporting them, + But we can use new extensions, only if Windows knows about new ISA extension + that changes the number or size of registers: SSE, AVX/XSAVE, AVX512 + So it's enough to check + MY_PF_AVX_INSTRUCTIONS_AVAILABLE + instead of + MY_PF_AVX2_INSTRUCTIONS_AVAILABLE +*/ +#define MY_PF_XSAVE_ENABLED 17 +// #define MY_PF_SSSE3_INSTRUCTIONS_AVAILABLE 36 +// #define MY_PF_SSE4_1_INSTRUCTIONS_AVAILABLE 37 +// #define MY_PF_SSE4_2_INSTRUCTIONS_AVAILABLE 38 +// #define MY_PF_AVX_INSTRUCTIONS_AVAILABLE 39 +// #define MY_PF_AVX2_INSTRUCTIONS_AVAILABLE 40 +// #define MY_PF_AVX512F_INSTRUCTIONS_AVAILABLE 41 +#endif + +BoolInt CPU_IsSupported_AVX(void) +{ + #ifdef _WIN32 + if (!IsProcessorFeaturePresent(MY_PF_XSAVE_ENABLED)) + return False; + /* PF_AVX_INSTRUCTIONS_AVAILABLE probably is supported starting from + some latest Win10 revisions. But we need AVX in older Windows also. + So we don't use the following check: */ + /* + if (!IsProcessorFeaturePresent(MY_PF_AVX_INSTRUCTIONS_AVAILABLE)) + return False; + */ + #endif + + /* + OS must use new special XSAVE/XRSTOR instructions to save + AVX registers when it required for context switching. + At OS statring: + OS sets CR4.OSXSAVE flag to signal the processor that OS supports the XSAVE extensions. + Also OS sets bitmask in XCR0 register that defines what + registers will be processed by XSAVE instruction: + XCR0.SSE[bit 0] - x87 registers and state + XCR0.SSE[bit 1] - SSE registers and state + XCR0.AVX[bit 2] - AVX registers and state + CR4.OSXSAVE is reflected to CPUID.1:ECX.OSXSAVE[bit 27]. + So we can read that bit in user-space. + XCR0 is available for reading in user-space by new XGETBV instruction. + */ + { + const UInt32 c = x86cpuid_Func_1_ECX(); + if (0 == (1 + & (c >> 28) // AVX instructions are supported by hardware + & (c >> 27))) // OSXSAVE bit: XSAVE and related instructions are enabled by OS. + return False; + } + + /* also we can check + CPUID.1:ECX.XSAVE [bit 26] : that shows that + XSAVE, XRESTOR, XSETBV, XGETBV instructions are supported by hardware. + But that check is redundant, because if OSXSAVE bit is set, then XSAVE is also set */ + + /* If OS have enabled XSAVE extension instructions (OSXSAVE == 1), + in most cases we expect that OS also will support storing/restoring + for AVX and SSE states at least. + But to be ensure for that we call user-space instruction + XGETBV(0) to get XCR0 value that contains bitmask that defines + what exact states(registers) OS have enabled for storing/restoring. 
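These CPU_IsSupported_* helpers are meant to be called once and used to pick a code path at run time, along these lines. A sketch only: Sha256_Blocks_AVX2 and Sha256_Blocks_Scalar stand for a caller's own implementations, and the CPU_IsSupported_* and MY_CPU_* declarations are assumed to come from CpuArch.h as in the SDK headers.

#include <stddef.h>
#include "CpuArch.h"

/* placeholder implementations provided elsewhere by the caller */
extern void Sha256_Blocks_AVX2(UInt32 *state, const Byte *data, size_t numBlocks);
extern void Sha256_Blocks_Scalar(UInt32 *state, const Byte *data, size_t numBlocks);

static void Sha256_Blocks(UInt32 *state, const Byte *data, size_t numBlocks)
{
#ifdef MY_CPU_X86_OR_AMD64
  /* the check is cheap, but callers normally cache the result once at startup */
  if (CPU_IsSupported_AVX2())
  {
    Sha256_Blocks_AVX2(state, data, numBlocks);
    return;
  }
#endif
  Sha256_Blocks_Scalar(state, data, numBlocks);
}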
+ */ + + { + const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK); + // printf("\n=== XGetBV=0x%x\n", bm); + return 1 + & (BoolInt)(bm >> 1) // SSE state is supported (set by OS) for storing/restoring + & (BoolInt)(bm >> 2); // AVX state is supported (set by OS) for storing/restoring + } + // since Win7SP1: we can use GetEnabledXStateFeatures(); +} + + +BoolInt CPU_IsSupported_AVX2(void) +{ + if (!CPU_IsSupported_AVX()) + return False; + if (z7_x86_cpuid_GetMaxFunc() < 7) + return False; + { + UInt32 d[4]; + z7_x86_cpuid(d, 7); + // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); + return 1 + & (BoolInt)(d[1] >> 5); // avx2 + } +} + +#if 0 +BoolInt CPU_IsSupported_AVX512F_AVX512VL(void) +{ + if (!CPU_IsSupported_AVX()) + return False; + if (z7_x86_cpuid_GetMaxFunc() < 7) + return False; + { + UInt32 d[4]; + BoolInt v; + z7_x86_cpuid(d, 7); + // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); + v = 1 + & (BoolInt)(d[1] >> 16) // avx512f + & (BoolInt)(d[1] >> 31); // avx512vl + if (!v) + return False; + } + { + const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK); + // printf("\n=== XGetBV=0x%x\n", bm); + return 1 + & (BoolInt)(bm >> 5) // OPMASK + & (BoolInt)(bm >> 6) // ZMM upper 256-bit + & (BoolInt)(bm >> 7); // ZMM16 ... ZMM31 + } +} +#endif + +BoolInt CPU_IsSupported_VAES_AVX2(void) +{ + if (!CPU_IsSupported_AVX()) + return False; + if (z7_x86_cpuid_GetMaxFunc() < 7) + return False; + { + UInt32 d[4]; + z7_x86_cpuid(d, 7); + // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); + return 1 + & (BoolInt)(d[1] >> 5) // avx2 + // & (d[1] >> 31) // avx512vl + & (BoolInt)(d[2] >> 9); // vaes // VEX-256/EVEX + } +} + +BoolInt CPU_IsSupported_PageGB(void) +{ + CHECK_CPUID_IS_SUPPORTED + { + UInt32 d[4]; + z7_x86_cpuid(d, 0x80000000); if (d[0] < 0x80000001) return False; + z7_x86_cpuid(d, 0x80000001); + return (BoolInt)(d[3] >> 26) & 1; } +} + + +#elif defined(MY_CPU_ARM_OR_ARM64) + +#ifdef _WIN32 + +#include "7zWindows.h" + +BoolInt CPU_IsSupported_CRC32(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; } +BoolInt CPU_IsSupported_CRYPTO(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; } +BoolInt CPU_IsSupported_NEON(void) { return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) ? 
1 : 0; } + +#else + +#if defined(__APPLE__) + +/* +#include +#include +static void Print_sysctlbyname(const char *name) +{ + size_t bufSize = 256; + char buf[256]; + int res = sysctlbyname(name, &buf, &bufSize, NULL, 0); { - UInt32 d[4] = { 0 }; - MyCPUID(0x80000001, &d[0], &d[1], &d[2], &d[3]); - return (d[3] >> 26) & 1; + int i; + printf("\nres = %d : %s : '%s' : bufSize = %d, numeric", res, name, buf, (unsigned)bufSize); + for (i = 0; i < 20; i++) + printf(" %2x", (unsigned)(Byte)buf[i]); + } } +*/ +/* + Print_sysctlbyname("hw.pagesize"); + Print_sysctlbyname("machdep.cpu.brand_string"); +*/ + +static BoolInt z7_sysctlbyname_Get_BoolInt(const char *name) +{ + UInt32 val = 0; + if (z7_sysctlbyname_Get_UInt32(name, &val) == 0 && val == 1) + return 1; + return 0; +} + +BoolInt CPU_IsSupported_CRC32(void) +{ + return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_crc32"); +} + +BoolInt CPU_IsSupported_NEON(void) +{ + return z7_sysctlbyname_Get_BoolInt("hw.optional.neon"); +} + +BoolInt CPU_IsSupported_SHA512(void) +{ + return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512"); +} + +/* +BoolInt CPU_IsSupported_SHA3(void) +{ + return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3"); +} +*/ + +#ifdef MY_CPU_ARM64 +#define APPLE_CRYPTO_SUPPORT_VAL 1 +#else +#define APPLE_CRYPTO_SUPPORT_VAL 0 +#endif + +BoolInt CPU_IsSupported_SHA1(void) { return APPLE_CRYPTO_SUPPORT_VAL; } +BoolInt CPU_IsSupported_SHA2(void) { return APPLE_CRYPTO_SUPPORT_VAL; } +BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; } + + +#else // __APPLE__ + +#if defined(__GLIBC__) && (__GLIBC__ * 100 + __GLIBC_MINOR__ >= 216) + #define Z7_GETAUXV_AVAILABLE +#else +// #pragma message("=== is not NEW GLIBC === ") + #if defined __has_include + #if __has_include () +// #pragma message("=== sys/auxv.h is avail=== ") + #define Z7_GETAUXV_AVAILABLE + #endif + #endif +#endif + +#ifdef Z7_GETAUXV_AVAILABLE +// #pragma message("=== Z7_GETAUXV_AVAILABLE === ") +#include +#define USE_HWCAP +#endif + +#ifdef USE_HWCAP + +#if defined(__FreeBSD__) +static unsigned long MY_getauxval(int aux) +{ + unsigned long val; + if (elf_aux_info(aux, &val, sizeof(val))) + return 0; + return val; +} +#else +#define MY_getauxval getauxval + #if defined __has_include + #if __has_include () +#include + #endif + #endif +#endif + + #define MY_HWCAP_CHECK_FUNC_2(name1, name2) \ + BoolInt CPU_IsSupported_ ## name1(void) { return (MY_getauxval(AT_HWCAP) & (HWCAP_ ## name2)); } + +#ifdef MY_CPU_ARM64 + #define MY_HWCAP_CHECK_FUNC(name) \ + MY_HWCAP_CHECK_FUNC_2(name, name) +#if 1 || defined(__ARM_NEON) + BoolInt CPU_IsSupported_NEON(void) { return True; } +#else + MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD) +#endif +// MY_HWCAP_CHECK_FUNC (ASIMD) +#elif defined(MY_CPU_ARM) + #define MY_HWCAP_CHECK_FUNC(name) \ + BoolInt CPU_IsSupported_ ## name(void) { return (MY_getauxval(AT_HWCAP2) & (HWCAP2_ ## name)); } + MY_HWCAP_CHECK_FUNC_2(NEON, NEON) +#endif + +#else // USE_HWCAP + + #define MY_HWCAP_CHECK_FUNC(name) \ + BoolInt CPU_IsSupported_ ## name(void) { return 0; } +#if defined(__ARM_NEON) + BoolInt CPU_IsSupported_NEON(void) { return True; } +#else + MY_HWCAP_CHECK_FUNC(NEON) +#endif + +#endif // USE_HWCAP + +MY_HWCAP_CHECK_FUNC (CRC32) +MY_HWCAP_CHECK_FUNC (SHA1) +MY_HWCAP_CHECK_FUNC (SHA2) +MY_HWCAP_CHECK_FUNC (AES) +#ifdef MY_CPU_ARM64 +// supports HWCAP_SHA512 and HWCAP_SHA3 since 2017. 
+// we define them here, if they are not defined +#ifndef HWCAP_SHA3 +// #define HWCAP_SHA3 (1 << 17) +#endif +#ifndef HWCAP_SHA512 +// #pragma message("=== HWCAP_SHA512 define === ") +#define HWCAP_SHA512 (1 << 21) +#endif +MY_HWCAP_CHECK_FUNC (SHA512) +// MY_HWCAP_CHECK_FUNC (SHA3) +#endif + +#endif // __APPLE__ +#endif // _WIN32 + +#endif // MY_CPU_ARM_OR_ARM64 + + + +#ifdef __APPLE__ + +#include + +int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize) +{ + return sysctlbyname(name, buf, bufSize, NULL, 0); +} + +int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val) +{ + size_t bufSize = sizeof(*val); + const int res = z7_sysctlbyname_Get(name, val, &bufSize); + if (res == 0 && bufSize != sizeof(*val)) + return EFAULT; + return res; +} #endif diff --git a/src/sdk/C/CpuArch.h b/src/sdk/C/CpuArch.h index f1edae3..1690a5b 100644 --- a/src/sdk/C/CpuArch.h +++ b/src/sdk/C/CpuArch.h @@ -1,8 +1,8 @@ /* CpuArch.h -- CPU specific code -2018-02-18 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ -#ifndef __CPU_ARCH_H -#define __CPU_ARCH_H +#ifndef ZIP7_INC_CPU_ARCH_H +#define ZIP7_INC_CPU_ARCH_H #include "7zTypes.h" @@ -14,8 +14,13 @@ MY_CPU_BE means that CPU is BIG ENDIAN. If MY_CPU_LE and MY_CPU_BE are not defined, we don't know about ENDIANNESS of platform. MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned memory accesses. + +MY_CPU_64BIT means that processor can work with 64-bit registers. + MY_CPU_64BIT can be used to select fast code branch + MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8) */ +#if !defined(_M_ARM64EC) #if defined(_M_X64) \ || defined(_M_AMD64) \ || defined(__x86_64__) \ @@ -24,27 +29,52 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #define MY_CPU_AMD64 #ifdef __ILP32__ #define MY_CPU_NAME "x32" + #define MY_CPU_SIZEOF_POINTER 4 #else #define MY_CPU_NAME "x64" + #define MY_CPU_SIZEOF_POINTER 8 #endif #define MY_CPU_64BIT #endif +#endif #if defined(_M_IX86) \ || defined(__i386__) #define MY_CPU_X86 #define MY_CPU_NAME "x86" - #define MY_CPU_32BIT + /* #define MY_CPU_32BIT */ + #define MY_CPU_SIZEOF_POINTER 4 +#endif + +#if defined(__SSE2__) \ + || defined(MY_CPU_AMD64) \ + || defined(_M_IX86_FP) && (_M_IX86_FP >= 2) +#define MY_CPU_SSE2 #endif #if defined(_M_ARM64) \ + || defined(_M_ARM64EC) \ || defined(__AARCH64EL__) \ || defined(__AARCH64EB__) \ || defined(__aarch64__) #define MY_CPU_ARM64 - #define MY_CPU_NAME "arm64" +#if defined(__ILP32__) \ + || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) + #define MY_CPU_NAME "arm64-32" + #define MY_CPU_SIZEOF_POINTER 4 +#elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) + #define MY_CPU_NAME "arm64-128" + #define MY_CPU_SIZEOF_POINTER 16 +#else +#if defined(_M_ARM64EC) + #define MY_CPU_NAME "arm64ec" +#else + #define MY_CPU_NAME "arm64" +#endif + #define MY_CPU_SIZEOF_POINTER 8 +#endif #define MY_CPU_64BIT #endif @@ -59,8 +89,16 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem || defined(__THUMBEL__) \ || defined(__THUMBEB__) #define MY_CPU_ARM - #define MY_CPU_NAME "arm" - #define MY_CPU_32BIT + + #if defined(__thumb__) || defined(__THUMBEL__) || defined(_M_ARMT) + #define MY_CPU_ARMT + #define MY_CPU_NAME "armt" + #else + #define MY_CPU_ARM32 + #define MY_CPU_NAME "arm" + #endif + /* #define MY_CPU_32BIT */ + #define MY_CPU_SIZEOF_POINTER 4 #endif @@ -84,26 +122,104 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #if 
defined(__ppc64__) \ - || defined(__powerpc64__) + || defined(__powerpc64__) \ + || defined(__ppc__) \ + || defined(__powerpc__) \ + || defined(__PPC__) \ + || defined(_POWER) + +#define MY_CPU_PPC_OR_PPC64 + +#if defined(__ppc64__) \ + || defined(__powerpc64__) \ + || defined(_LP64) \ + || defined(__64BIT__) #ifdef __ILP32__ #define MY_CPU_NAME "ppc64-32" + #define MY_CPU_SIZEOF_POINTER 4 #else #define MY_CPU_NAME "ppc64" + #define MY_CPU_SIZEOF_POINTER 8 #endif #define MY_CPU_64BIT -#elif defined(__ppc__) \ - || defined(__powerpc__) +#else #define MY_CPU_NAME "ppc" - #define MY_CPU_32BIT + #define MY_CPU_SIZEOF_POINTER 4 + /* #define MY_CPU_32BIT */ +#endif #endif -#if defined(__sparc64__) - #define MY_CPU_NAME "sparc64" +#if defined(__sparc__) \ + || defined(__sparc) + #define MY_CPU_SPARC + #if defined(__LP64__) \ + || defined(_LP64) \ + || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8) + #define MY_CPU_NAME "sparcv9" + #define MY_CPU_SIZEOF_POINTER 8 + #define MY_CPU_64BIT + #elif defined(__sparc_v9__) \ + || defined(__sparcv9) + #define MY_CPU_64BIT + #if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) + #define MY_CPU_NAME "sparcv9-32" + #else + #define MY_CPU_NAME "sparcv9m" + #endif + #elif defined(__sparc_v8__) \ + || defined(__sparcv8) + #define MY_CPU_NAME "sparcv8" + #define MY_CPU_SIZEOF_POINTER 4 + #else + #define MY_CPU_NAME "sparc" + #endif +#endif + + +#if defined(__riscv) \ + || defined(__riscv__) + #define MY_CPU_RISCV + #if __riscv_xlen == 32 + #define MY_CPU_NAME "riscv32" + #elif __riscv_xlen == 64 + #define MY_CPU_NAME "riscv64" + #else + #define MY_CPU_NAME "riscv" + #endif +#endif + + +#if defined(__loongarch__) + #define MY_CPU_LOONGARCH + #if defined(__loongarch64) || defined(__loongarch_grlen) && (__loongarch_grlen == 64) + #define MY_CPU_64BIT + #endif + #if defined(__loongarch64) + #define MY_CPU_NAME "loongarch64" + #define MY_CPU_LOONGARCH64 + #else + #define MY_CPU_NAME "loongarch" + #endif +#endif + + +// #undef MY_CPU_NAME +// #undef MY_CPU_SIZEOF_POINTER +// #define __e2k__ +// #define __SIZEOF_POINTER__ 4 +#if defined(__e2k__) + #define MY_CPU_E2K + #if defined(__ILP32__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) + #define MY_CPU_NAME "e2k-32" + #define MY_CPU_SIZEOF_POINTER 4 + #else + #define MY_CPU_NAME "e2k" + #if defined(__LP64__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8) + #define MY_CPU_SIZEOF_POINTER 8 + #endif + #endif #define MY_CPU_64BIT -#elif defined(__sparc__) - #define MY_CPU_NAME "sparc" - /* #define MY_CPU_32BIT */ #endif @@ -111,6 +227,10 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #define MY_CPU_X86_OR_AMD64 #endif +#if defined(MY_CPU_ARM) || defined(MY_CPU_ARM64) +#define MY_CPU_ARM_OR_ARM64 +#endif + #ifdef _WIN32 @@ -133,6 +253,7 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem || defined(MY_CPU_ARM_LE) \ || defined(MY_CPU_ARM64_LE) \ || defined(MY_CPU_IA64_LE) \ + || defined(_LITTLE_ENDIAN) \ || defined(__LITTLE_ENDIAN__) \ || defined(__ARMEL__) \ || defined(__THUMBEL__) \ @@ -165,13 +286,51 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #error Stop_Compiling_Bad_Endian #endif +#if !defined(MY_CPU_LE) && !defined(MY_CPU_BE) + #error Stop_Compiling_CPU_ENDIAN_must_be_detected_at_compile_time +#endif #if defined(MY_CPU_32BIT) && defined(MY_CPU_64BIT) #error Stop_Compiling_Bad_32_64_BIT #endif +#ifdef __SIZEOF_POINTER__ + #ifdef MY_CPU_SIZEOF_POINTER + #if 
MY_CPU_SIZEOF_POINTER != __SIZEOF_POINTER__ + #error Stop_Compiling_Bad_MY_CPU_PTR_SIZE + #endif + #else + #define MY_CPU_SIZEOF_POINTER __SIZEOF_POINTER__ + #endif +#endif + +#if defined(MY_CPU_SIZEOF_POINTER) && (MY_CPU_SIZEOF_POINTER == 4) +#if defined (_LP64) + #error Stop_Compiling_Bad_MY_CPU_PTR_SIZE +#endif +#endif + +#ifdef _MSC_VER + #if _MSC_VER >= 1300 + #define MY_CPU_pragma_pack_push_1 __pragma(pack(push, 1)) + #define MY_CPU_pragma_pop __pragma(pack(pop)) + #else + #define MY_CPU_pragma_pack_push_1 + #define MY_CPU_pragma_pop + #endif +#else + #ifdef __xlC__ + #define MY_CPU_pragma_pack_push_1 _Pragma("pack(1)") + #define MY_CPU_pragma_pop _Pragma("pack()") + #else + #define MY_CPU_pragma_pack_push_1 _Pragma("pack(push, 1)") + #define MY_CPU_pragma_pop _Pragma("pack(pop)") + #endif +#endif + #ifndef MY_CPU_NAME + // #define MY_CPU_IS_UNKNOWN #ifdef MY_CPU_LE #define MY_CPU_NAME "LE" #elif defined(MY_CPU_BE) @@ -187,11 +346,121 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem +#ifdef __has_builtin + #define Z7_has_builtin(x) __has_builtin(x) +#else + #define Z7_has_builtin(x) 0 +#endif + + +#define Z7_BSWAP32_CONST(v) \ + ( (((UInt32)(v) << 24) ) \ + | (((UInt32)(v) << 8) & (UInt32)0xff0000) \ + | (((UInt32)(v) >> 8) & (UInt32)0xff00 ) \ + | (((UInt32)(v) >> 24) )) + + +#if defined(_MSC_VER) && (_MSC_VER >= 1300) + +#include + +/* Note: these macros will use bswap instruction (486), that is unsupported in 386 cpu */ + +#pragma intrinsic(_byteswap_ushort) +#pragma intrinsic(_byteswap_ulong) +#pragma intrinsic(_byteswap_uint64) + +#define Z7_BSWAP16(v) _byteswap_ushort(v) +#define Z7_BSWAP32(v) _byteswap_ulong (v) +#define Z7_BSWAP64(v) _byteswap_uint64(v) +#define Z7_CPU_FAST_BSWAP_SUPPORTED + +/* GCC can generate slow code that calls function for __builtin_bswap32() for: + - GCC for RISCV, if Zbb/XTHeadBb extension is not used. + - GCC for SPARC. + The code from CLANG for SPARC also is not fastest. + So we don't define Z7_CPU_FAST_BSWAP_SUPPORTED in some cases. +*/ +#elif (!defined(MY_CPU_RISCV) || defined (__riscv_zbb) || defined(__riscv_xtheadbb)) \ + && !defined(MY_CPU_SPARC) \ + && ( \ + (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \ + || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) \ + ) + +#define Z7_BSWAP16(v) __builtin_bswap16(v) +#define Z7_BSWAP32(v) __builtin_bswap32(v) +#define Z7_BSWAP64(v) __builtin_bswap64(v) +#define Z7_CPU_FAST_BSWAP_SUPPORTED + +#else + +#define Z7_BSWAP16(v) ((UInt16) \ + ( ((UInt32)(v) << 8) \ + | ((UInt32)(v) >> 8) \ + )) + +#define Z7_BSWAP32(v) Z7_BSWAP32_CONST(v) + +#define Z7_BSWAP64(v) \ + ( ( ( (UInt64)(v) ) << 8 * 7 ) \ + | ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 1) ) << 8 * 5 ) \ + | ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 2) ) << 8 * 3 ) \ + | ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 3) ) << 8 * 1 ) \ + | ( ( (UInt64)(v) >> 8 * 1 ) & ((UInt32)0xff << 8 * 3) ) \ + | ( ( (UInt64)(v) >> 8 * 3 ) & ((UInt32)0xff << 8 * 2) ) \ + | ( ( (UInt64)(v) >> 8 * 5 ) & ((UInt32)0xff << 8 * 1) ) \ + | ( ( (UInt64)(v) >> 8 * 7 ) ) \ + ) + +#endif + + + #ifdef MY_CPU_LE #if defined(MY_CPU_X86_OR_AMD64) \ || defined(MY_CPU_ARM64) \ - || defined(__ARM_FEATURE_UNALIGNED) + || defined(MY_CPU_RISCV) && defined(__riscv_misaligned_fast) \ + || defined(MY_CPU_E2K) && defined(__iset__) && (__iset__ >= 6) #define MY_CPU_LE_UNALIGN + #define MY_CPU_LE_UNALIGN_64 + #elif defined(__ARM_FEATURE_UNALIGNED) +/* === ALIGNMENT on 32-bit arm and LDRD/STRD/LDM/STM instructions. 
+ Description of problems: +problem-1 : 32-bit ARM architecture: + multi-access (pair of 32-bit accesses) instructions (LDRD/STRD/LDM/STM) + require 32-bit (WORD) alignment (by 32-bit ARM architecture). + So there is "Alignment fault exception", if data is not aligned for 32-bit. + +problem-2 : 32-bit kernels and arm64 kernels: + 32-bit linux kernels provide fixup for these "paired" instruction "Alignment fault exception". + So unaligned paired-access instructions work via exception handler in kernel in 32-bit linux. + + But some arm64 kernels do not handle these faults in 32-bit programs. + So we have unhandled exception for such instructions. + Probably some new arm64 kernels have fixed it, and unaligned + paired-access instructions work in new kernels? + +problem-3 : compiler for 32-bit arm: + Compilers use LDRD/STRD/LDM/STM for UInt64 accesses + and for another cases where two 32-bit accesses are fused + to one multi-access instruction. + So UInt64 variables must be aligned for 32-bit, and each + 32-bit access must be aligned for 32-bit, if we want to + avoid "Alignment fault" exception (handled or unhandled). + +problem-4 : performace: + Even if unaligned access is handled by kernel, it will be slow. + So if we allow unaligned access, we can get fast unaligned + single-access, and slow unaligned paired-access. + + We don't allow unaligned access on 32-bit arm, because compiler + genarates paired-access instructions that require 32-bit alignment, + and some arm64 kernels have no handler for these instructions. + Also unaligned paired-access instructions will be slow, if kernel handles them. +*/ + // it must be disabled: + // #define MY_CPU_LE_UNALIGN #endif #endif @@ -200,11 +469,13 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #define GetUi16(p) (*(const UInt16 *)(const void *)(p)) #define GetUi32(p) (*(const UInt32 *)(const void *)(p)) +#ifdef MY_CPU_LE_UNALIGN_64 #define GetUi64(p) (*(const UInt64 *)(const void *)(p)) +#define SetUi64(p, v) { *(UInt64 *)(void *)(p) = (v); } +#endif -#define SetUi16(p, v) { *(UInt16 *)(p) = (v); } -#define SetUi32(p, v) { *(UInt32 *)(p) = (v); } -#define SetUi64(p, v) { *(UInt64 *)(p) = (v); } +#define SetUi16(p, v) { *(UInt16 *)(void *)(p) = (v); } +#define SetUi32(p, v) { *(UInt32 *)(void *)(p) = (v); } #else @@ -218,8 +489,6 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem ((UInt32)((const Byte *)(p))[2] << 16) | \ ((UInt32)((const Byte *)(p))[3] << 24)) -#define GetUi64(p) (GetUi32(p) | ((UInt64)GetUi32(((const Byte *)(p)) + 4) << 32)) - #define SetUi16(p, v) { Byte *_ppp_ = (Byte *)(p); UInt32 _vvv_ = (v); \ _ppp_[0] = (Byte)_vvv_; \ _ppp_[1] = (Byte)(_vvv_ >> 8); } @@ -230,43 +499,36 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem _ppp_[2] = (Byte)(_vvv_ >> 16); \ _ppp_[3] = (Byte)(_vvv_ >> 24); } -#define SetUi64(p, v) { Byte *_ppp2_ = (Byte *)(p); UInt64 _vvv2_ = (v); \ - SetUi32(_ppp2_ , (UInt32)_vvv2_); \ - SetUi32(_ppp2_ + 4, (UInt32)(_vvv2_ >> 32)); } - #endif -#ifdef __has_builtin - #define MY__has_builtin(x) __has_builtin(x) -#else - #define MY__has_builtin(x) 0 -#endif - -#if defined(MY_CPU_LE_UNALIGN) && /* defined(_WIN64) && */ (_MSC_VER >= 1300) -/* Note: we use bswap instruction, that is unsupported in 386 cpu */ - -#include +#ifndef GetUi64 +#define GetUi64(p) (GetUi32(p) | ((UInt64)GetUi32(((const Byte *)(p)) + 4) << 32)) +#endif -#pragma intrinsic(_byteswap_ushort) -#pragma intrinsic(_byteswap_ulong) -#pragma 
intrinsic(_byteswap_uint64) +#ifndef SetUi64 +#define SetUi64(p, v) { Byte *_ppp2_ = (Byte *)(p); UInt64 _vvv2_ = (v); \ + SetUi32(_ppp2_ , (UInt32)_vvv2_) \ + SetUi32(_ppp2_ + 4, (UInt32)(_vvv2_ >> 32)) } +#endif -/* #define GetBe16(p) _byteswap_ushort(*(const UInt16 *)(const Byte *)(p)) */ -#define GetBe32(p) _byteswap_ulong(*(const UInt32 *)(const Byte *)(p)) -#define GetBe64(p) _byteswap_uint64(*(const UInt64 *)(const Byte *)(p)) -#define SetBe32(p, v) (*(UInt32 *)(void *)(p)) = _byteswap_ulong(v) +#if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) -#elif defined(MY_CPU_LE_UNALIGN) && ( \ - (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \ - || (defined(__clang__) && MY__has_builtin(__builtin_bswap16)) ) +#if 0 +// Z7_BSWAP16 can be slow for x86-msvc +#define GetBe16_to32(p) (Z7_BSWAP16 (*(const UInt16 *)(const void *)(p))) +#else +#define GetBe16_to32(p) (Z7_BSWAP32 (*(const UInt16 *)(const void *)(p)) >> 16) +#endif -/* #define GetBe16(p) __builtin_bswap16(*(const UInt16 *)(const Byte *)(p)) */ -#define GetBe32(p) __builtin_bswap32(*(const UInt32 *)(const Byte *)(p)) -#define GetBe64(p) __builtin_bswap64(*(const UInt64 *)(const Byte *)(p)) +#define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p)) +#define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); } -#define SetBe32(p, v) (*(UInt32 *)(void *)(p)) = __builtin_bswap32(v) +#if defined(MY_CPU_LE_UNALIGN_64) +#define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p)) +#define SetBe64(p, v) { (*(UInt64 *)(void *)(p)) = Z7_BSWAP64(v); } +#endif #else @@ -276,8 +538,6 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem ((UInt32)((const Byte *)(p))[2] << 8) | \ ((const Byte *)(p))[3] ) -#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4)) - #define SetBe32(p, v) { Byte *_ppp_ = (Byte *)(p); UInt32 _vvv_ = (v); \ _ppp_[0] = (Byte)(_vvv_ >> 24); \ _ppp_[1] = (Byte)(_vvv_ >> 16); \ @@ -286,49 +546,139 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #endif +#ifndef GetBe64 +#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4)) +#endif -#ifndef GetBe16 +#ifndef SetBe64 +#define SetBe64(p, v) { Byte *_ppp_ = (Byte *)(p); UInt64 _vvv_ = (v); \ + _ppp_[0] = (Byte)(_vvv_ >> 56); \ + _ppp_[1] = (Byte)(_vvv_ >> 48); \ + _ppp_[2] = (Byte)(_vvv_ >> 40); \ + _ppp_[3] = (Byte)(_vvv_ >> 32); \ + _ppp_[4] = (Byte)(_vvv_ >> 24); \ + _ppp_[5] = (Byte)(_vvv_ >> 16); \ + _ppp_[6] = (Byte)(_vvv_ >> 8); \ + _ppp_[7] = (Byte)_vvv_; } +#endif +#ifndef GetBe16 +#ifdef GetBe16_to32 +#define GetBe16(p) ( (UInt16) GetBe16_to32(p)) +#else #define GetBe16(p) ( (UInt16) ( \ ((UInt16)((const Byte *)(p))[0] << 8) | \ ((const Byte *)(p))[1] )) +#endif +#endif + + +#if defined(MY_CPU_BE) +#define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v) +#define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) +#define Z7_CONV_NATIVE_TO_BE_32(v) (v) +// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b1) | ((b0) << 8)) +#elif defined(MY_CPU_LE) +#define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) +#define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v) +#define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v) +// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b0) | ((b1) << 8)) +#else +#error Stop_Compiling_Unknown_Endian_CONV +#endif + + +#if defined(MY_CPU_BE) + +#define GetBe64a(p) (*(const UInt64 *)(const void *)(p)) +#define GetBe32a(p) (*(const UInt32 *)(const void *)(p)) +#define GetBe16a(p) 
(*(const UInt16 *)(const void *)(p)) +#define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); } +#define SetBe16a(p, v) { *(UInt16 *)(void *)(p) = (v); } + +#define GetUi64a(p) GetUi64(p) +#define GetUi32a(p) GetUi32(p) +#define GetUi16a(p) GetUi16(p) +#define SetUi32a(p, v) SetUi32(p, v) +#define SetUi16a(p, v) SetUi16(p, v) + +#elif defined(MY_CPU_LE) + +#define GetUi64a(p) (*(const UInt64 *)(const void *)(p)) +#define GetUi32a(p) (*(const UInt32 *)(const void *)(p)) +#define GetUi16a(p) (*(const UInt16 *)(const void *)(p)) +#define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); } +#define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); } +#define GetBe64a(p) GetBe64(p) +#define GetBe32a(p) GetBe32(p) +#define GetBe16a(p) GetBe16(p) +#define SetBe32a(p, v) SetBe32(p, v) +#define SetBe16a(p, v) SetBe16(p, v) + +#else +#error Stop_Compiling_Unknown_Endian_CPU_a #endif +#ifndef GetBe16_to32 +#define GetBe16_to32(p) GetBe16(p) +#endif + + +#if defined(MY_CPU_X86_OR_AMD64) \ + || defined(MY_CPU_ARM_OR_ARM64) \ + || defined(MY_CPU_PPC_OR_PPC64) + #define Z7_CPU_FAST_ROTATE_SUPPORTED +#endif + #ifdef MY_CPU_X86_OR_AMD64 -typedef struct -{ - UInt32 maxFunc; - UInt32 vendor[3]; - UInt32 ver; - UInt32 b; - UInt32 c; - UInt32 d; -} Cx86cpuid; - -enum -{ - CPU_FIRM_INTEL, - CPU_FIRM_AMD, - CPU_FIRM_VIA -}; - -void MyCPUID(UInt32 function, UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d); - -BoolInt x86cpuid_CheckAndRead(Cx86cpuid *p); -int x86cpuid_GetFirm(const Cx86cpuid *p); - -#define x86cpuid_GetFamily(ver) (((ver >> 16) & 0xFF0) | ((ver >> 8) & 0xF)) -#define x86cpuid_GetModel(ver) (((ver >> 12) & 0xF0) | ((ver >> 4) & 0xF)) -#define x86cpuid_GetStepping(ver) (ver & 0xF) - -BoolInt CPU_Is_InOrder(void); -BoolInt CPU_Is_Aes_Supported(void); +void Z7_FASTCALL z7_x86_cpuid(UInt32 a[4], UInt32 function); +UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void); +#if defined(MY_CPU_AMD64) +#define Z7_IF_X86_CPUID_SUPPORTED +#else +#define Z7_IF_X86_CPUID_SUPPORTED if (z7_x86_cpuid_GetMaxFunc()) +#endif + +BoolInt CPU_IsSupported_AES(void); +BoolInt CPU_IsSupported_AVX(void); +BoolInt CPU_IsSupported_AVX2(void); +BoolInt CPU_IsSupported_AVX512F_AVX512VL(void); +BoolInt CPU_IsSupported_VAES_AVX2(void); +BoolInt CPU_IsSupported_CMOV(void); +BoolInt CPU_IsSupported_SSE(void); +BoolInt CPU_IsSupported_SSE2(void); +BoolInt CPU_IsSupported_SSSE3(void); +BoolInt CPU_IsSupported_SSE41(void); +BoolInt CPU_IsSupported_SHA(void); +BoolInt CPU_IsSupported_SHA512(void); BoolInt CPU_IsSupported_PageGB(void); +#elif defined(MY_CPU_ARM_OR_ARM64) + +BoolInt CPU_IsSupported_CRC32(void); +BoolInt CPU_IsSupported_NEON(void); + +#if defined(_WIN32) +BoolInt CPU_IsSupported_CRYPTO(void); +#define CPU_IsSupported_SHA1 CPU_IsSupported_CRYPTO +#define CPU_IsSupported_SHA2 CPU_IsSupported_CRYPTO +#define CPU_IsSupported_AES CPU_IsSupported_CRYPTO +#else +BoolInt CPU_IsSupported_SHA1(void); +BoolInt CPU_IsSupported_SHA2(void); +BoolInt CPU_IsSupported_AES(void); +#endif +BoolInt CPU_IsSupported_SHA512(void); + +#endif + +#if defined(__APPLE__) +int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize); +int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val); #endif EXTERN_C_END diff --git a/src/sdk/C/Delta.c b/src/sdk/C/Delta.c index e3edd21..c4a4499 100644 --- a/src/sdk/C/Delta.c +++ b/src/sdk/C/Delta.c @@ -1,5 +1,5 @@ /* Delta.c -- Delta converter -2009-05-26 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -12,53 +12,158 @@ void Delta_Init(Byte *state) state[i] = 0; 
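/* Editorial sketch (not part of the patch): typical in-place round-trip use of the
   Delta_Encode / Delta_Decode routines below; "dist" is the byte distance of the
   delta filter (e.g. 2 for 16-bit samples) and buf/bufSize are caller-supplied:

     Byte state[DELTA_STATE_SIZE];
     Delta_Init(state);
     Delta_Encode(state, dist, buf, bufSize);   // buf now holds byte deltas
     Delta_Init(state);
     Delta_Decode(state, dist, buf, bufSize);   // restores the original bytes
*/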
} -static void MyMemCpy(Byte *dest, const Byte *src, unsigned size) -{ - unsigned i; - for (i = 0; i < size; i++) - dest[i] = src[i]; -} void Delta_Encode(Byte *state, unsigned delta, Byte *data, SizeT size) { - Byte buf[DELTA_STATE_SIZE]; - unsigned j = 0; - MyMemCpy(buf, state, delta); + Byte temp[DELTA_STATE_SIZE]; + + if (size == 0) + return; + + { + unsigned i = 0; + do + temp[i] = state[i]; + while (++i != delta); + } + + if (size <= delta) + { + unsigned i = 0, k; + do + { + Byte b = *data; + *data++ = (Byte)(b - temp[i]); + temp[i] = b; + } + while (++i != size); + + k = 0; + + do + { + if (i == delta) + i = 0; + state[k] = temp[i++]; + } + while (++k != delta); + + return; + } + { - SizeT i; - for (i = 0; i < size;) + Byte *p = data + size - delta; + { + unsigned i = 0; + do + state[i] = *p++; + while (++i != delta); + } { - for (j = 0; j < delta && i < size; i++, j++) + const Byte *lim = data + delta; + ptrdiff_t dif = -(ptrdiff_t)delta; + + if (((ptrdiff_t)size + dif) & 1) { - Byte b = data[i]; - data[i] = (Byte)(b - buf[j]); - buf[j] = b; + --p; *p = (Byte)(*p - p[dif]); } + + while (p != lim) + { + --p; *p = (Byte)(*p - p[dif]); + --p; *p = (Byte)(*p - p[dif]); + } + + dif = -dif; + + do + { + --p; *p = (Byte)(*p - temp[--dif]); + } + while (dif != 0); } } - if (j == delta) - j = 0; - MyMemCpy(state, buf + j, delta - j); - MyMemCpy(state + delta - j, buf, j); } + void Delta_Decode(Byte *state, unsigned delta, Byte *data, SizeT size) { - Byte buf[DELTA_STATE_SIZE]; - unsigned j = 0; - MyMemCpy(buf, state, delta); + unsigned i; + const Byte *lim; + + if (size == 0) + return; + + i = 0; + lim = data + size; + + if (size <= delta) + { + do + *data = (Byte)(*data + state[i++]); + while (++data != lim); + + for (; delta != i; state++, delta--) + *state = state[i]; + data -= i; + } + else { - SizeT i; - for (i = 0; i < size;) + /* + #define B(n) b ## n + #define I(n) Byte B(n) = state[n]; + #define U(n) { B(n) = (Byte)((B(n)) + *data++); data[-1] = (B(n)); } + #define F(n) if (data != lim) { U(n) } + + if (delta == 1) + { + I(0) + if ((lim - data) & 1) { U(0) } + while (data != lim) { U(0) U(0) } + data -= 1; + } + else if (delta == 2) { - for (j = 0; j < delta && i < size; i++, j++) + I(0) I(1) + lim -= 1; while (data < lim) { U(0) U(1) } + lim += 1; F(0) + data -= 2; + } + else if (delta == 3) + { + I(0) I(1) I(2) + lim -= 2; while (data < lim) { U(0) U(1) U(2) } + lim += 2; F(0) F(1) + data -= 3; + } + else if (delta == 4) + { + I(0) I(1) I(2) I(3) + lim -= 3; while (data < lim) { U(0) U(1) U(2) U(3) } + lim += 3; F(0) F(1) F(2) + data -= 4; + } + else + */ + { + do + { + *data = (Byte)(*data + state[i++]); + data++; + } + while (i != delta); + { - buf[j] = data[i] = (Byte)(buf[j] + data[i]); + ptrdiff_t dif = -(ptrdiff_t)delta; + do + *data = (Byte)(*data + data[dif]); + while (++data != lim); + data += dif; } } } - if (j == delta) - j = 0; - MyMemCpy(state, buf + j, delta - j); - MyMemCpy(state + delta - j, buf, j); + + do + *state++ = *data; + while (++data != lim); } diff --git a/src/sdk/C/Delta.h b/src/sdk/C/Delta.h index 2fa54ad..7060954 100644 --- a/src/sdk/C/Delta.h +++ b/src/sdk/C/Delta.h @@ -1,8 +1,8 @@ /* Delta.h -- Delta converter -2013-01-18 : Igor Pavlov : Public domain */ +2023-03-03 : Igor Pavlov : Public domain */ -#ifndef __DELTA_H -#define __DELTA_H +#ifndef ZIP7_INC_DELTA_H +#define ZIP7_INC_DELTA_H #include "7zTypes.h" diff --git a/src/sdk/C/DllSecur.c b/src/sdk/C/DllSecur.c index 5ea108a..bbbfc0a 100644 --- a/src/sdk/C/DllSecur.c +++ b/src/sdk/C/DllSecur.c @@ 
-1,108 +1,99 @@ /* DllSecur.c -- DLL loading security -2018-02-21 : Igor Pavlov : Public domain */ +2023-12-03 : Igor Pavlov : Public domain */ #include "Precomp.h" #ifdef _WIN32 -#include +#include "7zWindows.h" #include "DllSecur.h" #ifndef UNDER_CE +Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION + typedef BOOL (WINAPI *Func_SetDefaultDllDirectories)(DWORD DirectoryFlags); #define MY_LOAD_LIBRARY_SEARCH_USER_DIRS 0x400 #define MY_LOAD_LIBRARY_SEARCH_SYSTEM32 0x800 +#define DELIM "\0" + static const char * const g_Dlls = + "userenv" + DELIM "setupapi" + DELIM "apphelp" + DELIM "propsys" + DELIM "dwmapi" + DELIM "cryptbase" + DELIM "oleacc" + DELIM "clbcatq" + DELIM "version" #ifndef _CONSOLE - "UXTHEME\0" + DELIM "uxtheme" #endif - "USERENV\0" - "SETUPAPI\0" - "APPHELP\0" - "PROPSYS\0" - "DWMAPI\0" - "CRYPTBASE\0" - "OLEACC\0" - "CLBCATQ\0" - "VERSION\0" - ; + DELIM; + +#endif +#ifdef __clang__ + #pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif +#if defined (_MSC_VER) && _MSC_VER >= 1900 +// sysinfoapi.h: kit10: GetVersion was declared deprecated +#pragma warning(disable : 4996) #endif -void My_SetDefaultDllDirectories() +#define IF_NON_VISTA_SET_DLL_DIRS_AND_RETURN \ + if ((UInt16)GetVersion() != 6) { \ + const \ + Func_SetDefaultDllDirectories setDllDirs = \ + (Func_SetDefaultDllDirectories) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), \ + "SetDefaultDllDirectories"); \ + if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; } + +void My_SetDefaultDllDirectories(void) { #ifndef UNDER_CE - - OSVERSIONINFO vi; - vi.dwOSVersionInfoSize = sizeof(vi); - GetVersionEx(&vi); - if (!GetVersionEx(&vi) || vi.dwMajorVersion != 6 || vi.dwMinorVersion != 0) - { - Func_SetDefaultDllDirectories setDllDirs = (Func_SetDefaultDllDirectories) - GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories"); - if (setDllDirs) - if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) - return; - } - + IF_NON_VISTA_SET_DLL_DIRS_AND_RETURN #endif } -void LoadSecurityDlls() +void LoadSecurityDlls(void) { #ifndef UNDER_CE - - wchar_t buf[MAX_PATH + 100]; - - { - // at Vista (ver 6.0) : CoCreateInstance(CLSID_ShellLink, ...) doesn't work after SetDefaultDllDirectories() : Check it ??? - OSVERSIONINFO vi; - vi.dwOSVersionInfoSize = sizeof(vi); - if (!GetVersionEx(&vi) || vi.dwMajorVersion != 6 || vi.dwMinorVersion != 0) - { - Func_SetDefaultDllDirectories setDllDirs = (Func_SetDefaultDllDirectories) - GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories"); - if (setDllDirs) - if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) - return; - } - } - - { - unsigned len = GetSystemDirectoryW(buf, MAX_PATH + 2); - if (len == 0 || len > MAX_PATH) - return; - } + // at Vista (ver 6.0) : CoCreateInstance(CLSID_ShellLink, ...) doesn't work after SetDefaultDllDirectories() : Check it ??? 
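/* Editorial sketch (not part of the patch): these helpers are meant to be called
   once, early in process startup, before anything can trigger a delayed DLL load.
   A hypothetical caller, using only the functions declared in DllSecur.h:

     #include "DllSecur.h"

     int wmain(void)
     {
     #ifdef _WIN32
       LoadSecurityDlls();   // pre-load known DLLs from system32 and restrict the search path
     #endif
       // ... application code ...
       return 0;
     }

   Where kernel32 exports SetDefaultDllDirectories(), the IF_NON_VISTA_SET_DLL_DIRS_AND_RETURN
   macro above requests that restriction directly with
   LOAD_LIBRARY_SEARCH_SYSTEM32 | LOAD_LIBRARY_SEARCH_USER_DIRS (0x800 | 0x400);
   the manual pre-loading loop below is the fallback. */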
+ IF_NON_VISTA_SET_DLL_DIRS_AND_RETURN { + wchar_t buf[MAX_PATH + 100]; const char *dll; - unsigned pos = (unsigned)lstrlenW(buf); - + unsigned pos = GetSystemDirectoryW(buf, MAX_PATH + 2); + if (pos == 0 || pos > MAX_PATH) + return; if (buf[pos - 1] != '\\') buf[pos++] = '\\'; - - for (dll = g_Dlls; dll[0] != 0;) + for (dll = g_Dlls; *dll != 0;) { - unsigned k = 0; + wchar_t *dest = &buf[pos]; for (;;) { - char c = *dll++; - buf[pos + k] = (Byte)c; - k++; + const char c = *dll++; if (c == 0) break; + *dest++ = (Byte)c; } - - lstrcatW(buf, L".dll"); + dest[0] = '.'; + dest[1] = 'd'; + dest[2] = 'l'; + dest[3] = 'l'; + dest[4] = 0; + // lstrcatW(buf, L".dll"); LoadLibraryExW(buf, NULL, LOAD_WITH_ALTERED_SEARCH_PATH); } } - #endif } -#endif +#endif // _WIN32 diff --git a/src/sdk/C/DllSecur.h b/src/sdk/C/DllSecur.h index e2a049a..9fa4153 100644 --- a/src/sdk/C/DllSecur.h +++ b/src/sdk/C/DllSecur.h @@ -1,8 +1,8 @@ /* DllSecur.h -- DLL loading for security -2018-02-19 : Igor Pavlov : Public domain */ +2023-03-03 : Igor Pavlov : Public domain */ -#ifndef __DLL_SECUR_H -#define __DLL_SECUR_H +#ifndef ZIP7_INC_DLL_SECUR_H +#define ZIP7_INC_DLL_SECUR_H #include "7zTypes.h" @@ -10,8 +10,8 @@ EXTERN_C_BEGIN #ifdef _WIN32 -void My_SetDefaultDllDirectories(); -void LoadSecurityDlls(); +void My_SetDefaultDllDirectories(void); +void LoadSecurityDlls(void); #endif diff --git a/src/sdk/C/LzFind.c b/src/sdk/C/LzFind.c index df55e86..330bc17 100644 --- a/src/sdk/C/LzFind.c +++ b/src/sdk/C/LzFind.c @@ -1,74 +1,140 @@ /* LzFind.c -- Match finder for LZ algorithms -2018-07-08 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include +// #include +#include "CpuArch.h" #include "LzFind.h" #include "LzHash.h" +#define kBlockMoveAlign (1 << 7) // alignment for memmove() +#define kBlockSizeAlign (1 << 16) // alignment for block allocation +#define kBlockSizeReserveMin (1 << 24) // it's 1/256 from 4 GB dictinary + #define kEmptyHashValue 0 -#define kMaxValForNormalize ((UInt32)0xFFFFFFFF) -#define kNormalizeStepMin (1 << 10) /* it must be power of 2 */ -#define kNormalizeMask (~(UInt32)(kNormalizeStepMin - 1)) -#define kMaxHistorySize ((UInt32)7 << 29) -#define kStartMaxLen 3 +#define kMaxValForNormalize ((UInt32)0) +// #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xfff) // for debug + +// #define kNormalizeAlign (1 << 7) // alignment for speculated accesses + +#define GET_AVAIL_BYTES(p) \ + Inline_MatchFinder_GetNumAvailableBytes(p) + + +// #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size) +#define kFix5HashSize kFix4HashSize + +/* + HASH2_CALC: + if (hv) match, then cur[0] and cur[1] also match +*/ +#define HASH2_CALC hv = GetUi16(cur); + +// (crc[0 ... 255] & 0xFF) provides one-to-one correspondence to [0 ... 
255] + +/* + HASH3_CALC: + if (cur[0]) and (h2) match, then cur[1] also match + if (cur[0]) and (hv) match, then cur[1] and cur[2] also match +*/ +#define HASH3_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; } + +#define HASH4_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + temp ^= ((UInt32)cur[2] << 8); \ + h3 = temp & (kHash3Size - 1); \ + hv = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hashMask; } + +#define HASH5_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + temp ^= ((UInt32)cur[2] << 8); \ + h3 = temp & (kHash3Size - 1); \ + temp ^= (p->crc[cur[3]] << kLzHash_CrcShift_1); \ + /* h4 = temp & p->hash4Mask; */ /* (kHash4Size - 1); */ \ + hv = (temp ^ (p->crc[cur[4]] << kLzHash_CrcShift_2)) & p->hashMask; } + +#define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF; + static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc) { - if (!p->directInput) + // if (!p->directInput) { - ISzAlloc_Free(alloc, p->bufferBase); - p->bufferBase = NULL; + ISzAlloc_Free(alloc, p->bufBase); + p->bufBase = NULL; } } -/* keepSizeBefore + keepSizeAfter + keepSizeReserv must be < 4G) */ -static int LzInWindow_Create(CMatchFinder *p, UInt32 keepSizeReserv, ISzAllocPtr alloc) +static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr alloc) { - UInt32 blockSize = p->keepSizeBefore + p->keepSizeAfter + keepSizeReserv; - if (p->directInput) - { - p->blockSize = blockSize; - return 1; - } - if (!p->bufferBase || p->blockSize != blockSize) + if (blockSize == 0) + return 0; + if (!p->bufBase || p->blockSize != blockSize) { + // size_t blockSizeT; LzInWindow_Free(p, alloc); p->blockSize = blockSize; - p->bufferBase = (Byte *)ISzAlloc_Alloc(alloc, (size_t)blockSize); + // blockSizeT = blockSize; + + // printf("\nblockSize = 0x%x\n", blockSize); + /* + #if defined _WIN64 + // we can allocate 4GiB, but still use UInt32 for (p->blockSize) + // we use UInt32 type for (p->blockSize), because + // we don't want to wrap over 4 GiB, + // when we use (p->streamPos - p->pos) that is UInt32. + if (blockSize >= (UInt32)0 - (UInt32)kBlockSizeAlign) + { + blockSizeT = ((size_t)1 << 32); + printf("\nchanged to blockSizeT = 4GiB\n"); + } + #endif + */ + + p->bufBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize); + // printf("\nbufferBase = %p\n", p->bufBase); + // return 0; // for debug } - return (p->bufferBase != NULL); + return (p->bufBase != NULL); } -Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } - -UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return p->streamPos - p->pos; } +static const Byte *MatchFinder_GetPointerToCurrentPos(void *p) +{ + return ((CMatchFinder *)p)->buffer; +} -void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue) +static UInt32 MatchFinder_GetNumAvailableBytes(void *p) { - p->posLimit -= subValue; - p->pos -= subValue; - p->streamPos -= subValue; + return GET_AVAIL_BYTES((CMatchFinder *)p); } + +Z7_NO_INLINE static void MatchFinder_ReadBlock(CMatchFinder *p) { if (p->streamEndWasReached || p->result != SZ_OK) return; - /* We use (p->streamPos - p->pos) value. (p->streamPos < p->pos) is allowed. */ + /* We use (p->streamPos - p->pos) value. + (p->streamPos < p->pos) is allowed. 
*/ if (p->directInput) { - UInt32 curSize = 0xFFFFFFFF - (p->streamPos - p->pos); + UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p); if (curSize > p->directInputRem) curSize = (UInt32)p->directInputRem; - p->directInputRem -= curSize; p->streamPos += curSize; + p->directInputRem -= curSize; if (p->directInputRem == 0) p->streamEndWasReached = 1; return; @@ -76,12 +142,31 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) for (;;) { - Byte *dest = p->buffer + (p->streamPos - p->pos); - size_t size = (p->bufferBase + p->blockSize - dest); + const Byte *dest = p->buffer + GET_AVAIL_BYTES(p); + size_t size = (size_t)(p->bufBase + p->blockSize - dest); if (size == 0) + { + /* we call ReadBlock() after NeedMove() and MoveBlock(). + NeedMove() and MoveBlock() provide more than (keepSizeAfter) + to the end of (blockSize). + So we don't execute this branch in normal code flow. + We can get here only if ReadBlock() is called before NeedMove() / MoveBlock(). + */ + // p->result = SZ_ERROR_FAIL; // we can show error here return; + } + + // #define kRead 3 + // if (size > kRead) size = kRead; // for debug - p->result = ISeqInStream_Read(p->stream, dest, &size); + /* + // we need cast (Byte *)dest. + #ifdef __clang__ + #pragma GCC diagnostic ignored "-Wcast-qual" + #endif + */ + p->result = ISeqInStream_Read(p->stream, + p->bufBase + (dest - p->bufBase), &size); if (p->result != SZ_OK) return; if (size == 0) @@ -90,47 +175,60 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) return; } p->streamPos += (UInt32)size; - if (p->streamPos - p->pos > p->keepSizeAfter) + if (GET_AVAIL_BYTES(p) > p->keepSizeAfter) return; + /* here and in the other (p->keepSizeAfter) checks we keep 1 byte more than was requested by the Create() function + (GET_AVAIL_BYTES(p) >= p->keepSizeAfter) - minimal required size */ } + + // on exit: (p->result != SZ_OK || p->streamEndWasReached || GET_AVAIL_BYTES(p) > p->keepSizeAfter) } + + +Z7_NO_INLINE void MatchFinder_MoveBlock(CMatchFinder *p) { - memmove(p->bufferBase, - p->buffer - p->keepSizeBefore, - (size_t)(p->streamPos - p->pos) + p->keepSizeBefore); - p->buffer = p->bufferBase + p->keepSizeBefore; + const size_t offset = (size_t)(p->buffer - p->bufBase) - p->keepSizeBefore; + const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore; + p->buffer = p->bufBase + keepBefore; + memmove(p->bufBase, + p->bufBase + (offset & ~((size_t)kBlockMoveAlign - 1)), + keepBefore + (size_t)GET_AVAIL_BYTES(p)); } +/* We call MoveBlock() before ReadBlock(). + So MoveBlock() can be a wasteful operation if the whole input data + can fit in the current block even without calling MoveBlock(). + In the important case where (dataSize <= historySize), + the condition (p->blockSize > dataSize + p->keepSizeAfter) is met, + so there is no MoveBlock() in that case.
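   Editorial illustration (not from the SDK): the calling pattern these two functions
   are designed for is the one used later in MatchFinder_CheckLimits():

     // refill the sliding window before consuming more input
     if (MatchFinder_NeedMove(p))      // not enough free room after p->buffer
       MatchFinder_MoveBlock(p);       // slide the kept history to the front of bufBase
     MatchFinder_ReadBlock(p);         // then append fresh input at the end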
+*/ + int MatchFinder_NeedMove(CMatchFinder *p) { if (p->directInput) return 0; - /* if (p->streamEndWasReached) return 0; */ - return ((size_t)(p->bufferBase + p->blockSize - p->buffer) <= p->keepSizeAfter); + if (p->streamEndWasReached || p->result != SZ_OK) + return 0; + return ((size_t)(p->bufBase + p->blockSize - p->buffer) <= p->keepSizeAfter); } void MatchFinder_ReadIfRequired(CMatchFinder *p) { - if (p->streamEndWasReached) - return; - if (p->keepSizeAfter >= p->streamPos - p->pos) + if (p->keepSizeAfter >= GET_AVAIL_BYTES(p)) MatchFinder_ReadBlock(p); } -static void MatchFinder_CheckAndMoveAndRead(CMatchFinder *p) -{ - if (MatchFinder_NeedMove(p)) - MatchFinder_MoveBlock(p); - MatchFinder_ReadBlock(p); -} + static void MatchFinder_SetDefaultSettings(CMatchFinder *p) { p->cutValue = 32; p->btMode = 1; p->numHashBytes = 4; + p->numHashBytes_Min = 2; + p->numHashOutBits = 0; p->bigHash = 0; } @@ -139,8 +237,10 @@ static void MatchFinder_SetDefaultSettings(CMatchFinder *p) void MatchFinder_Construct(CMatchFinder *p) { unsigned i; - p->bufferBase = NULL; + p->buffer = NULL; + p->bufBase = NULL; p->directInput = 0; + p->stream = NULL; p->hash = NULL; p->expectedDataSize = (UInt64)(Int64)-1; MatchFinder_SetDefaultSettings(p); @@ -155,6 +255,8 @@ void MatchFinder_Construct(CMatchFinder *p) } } +#undef kCrcPoly + static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->hash); @@ -169,87 +271,213 @@ void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc) static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc) { - size_t sizeInBytes = (size_t)num * sizeof(CLzRef); + const size_t sizeInBytes = (size_t)num * sizeof(CLzRef); if (sizeInBytes / sizeof(CLzRef) != num) return NULL; return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes); } -int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, - UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, - ISzAllocPtr alloc) +#if (kBlockSizeReserveMin < kBlockSizeAlign * 2) + #error Stop_Compiling_Bad_Reserve +#endif + + + +static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize) { - UInt32 sizeReserv; - + UInt32 blockSize = (p->keepSizeBefore + p->keepSizeAfter); + /* if (historySize > kMaxHistorySize) - { - MatchFinder_Free(p, alloc); return 0; - } + */ + // printf("\nhistorySize == 0x%x\n", historySize); - sizeReserv = historySize >> 1; - if (historySize >= ((UInt32)3 << 30)) sizeReserv = historySize >> 3; - else if (historySize >= ((UInt32)2 << 30)) sizeReserv = historySize >> 2; + if (p->keepSizeBefore < historySize || blockSize < p->keepSizeBefore) // if 32-bit overflow + return 0; - sizeReserv += (keepAddBufferBefore + matchMaxLen + keepAddBufferAfter) / 2 + (1 << 19); + { + const UInt32 kBlockSizeMax = (UInt32)0 - (UInt32)kBlockSizeAlign; + const UInt32 rem = kBlockSizeMax - blockSize; + const UInt32 reserve = (blockSize >> (blockSize < ((UInt32)1 << 30) ? 
1 : 2)) + + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign; // do not overflow 32-bit here + if (blockSize >= kBlockSizeMax + || rem < kBlockSizeReserveMin) // we reject settings that will be slow + return 0; + if (reserve >= rem) + blockSize = kBlockSizeMax; + else + { + blockSize += reserve; + blockSize &= ~(UInt32)(kBlockSizeAlign - 1); + } + } + // printf("\n LzFind_blockSize = %x\n", blockSize); + // printf("\n LzFind_blockSize = %d\n", blockSize >> 20); + return blockSize; +} + + +// input is historySize +static UInt32 MatchFinder_GetHashMask2(CMatchFinder *p, UInt32 hs) +{ + if (p->numHashBytes == 2) + return (1 << 16) - 1; + if (hs != 0) + hs--; + hs |= (hs >> 1); + hs |= (hs >> 2); + hs |= (hs >> 4); + hs |= (hs >> 8); + // we propagated 16 bits in (hs). Low 16 bits must be set later + if (hs >= (1 << 24)) + { + if (p->numHashBytes == 3) + hs = (1 << 24) - 1; + /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ + } + // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) + hs |= (1 << 16) - 1; /* don't change it! */ + // bt5: we adjust the size with recommended minimum size + if (p->numHashBytes >= 5) + hs |= (256 << kLzHash_CrcShift_2) - 1; + return hs; +} + +// input is historySize +static UInt32 MatchFinder_GetHashMask(CMatchFinder *p, UInt32 hs) +{ + if (p->numHashBytes == 2) + return (1 << 16) - 1; + if (hs != 0) + hs--; + hs |= (hs >> 1); + hs |= (hs >> 2); + hs |= (hs >> 4); + hs |= (hs >> 8); + // we propagated 16 bits in (hs). Low 16 bits must be set later + hs >>= 1; + if (hs >= (1 << 24)) + { + if (p->numHashBytes == 3) + hs = (1 << 24) - 1; + else + hs >>= 1; + /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ + } + // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) + hs |= (1 << 16) - 1; /* don't change it! */ + // bt5: we adjust the size with recommended minimum size + if (p->numHashBytes >= 5) + hs |= (256 << kLzHash_CrcShift_2) - 1; + return hs; +} + +int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, + UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, + ISzAllocPtr alloc) +{ + /* we need one additional byte in (p->keepSizeBefore), + since we use MoveBlock() after (p->pos++) and before dictionary using */ + // keepAddBufferBefore = (UInt32)0xFFFFFFFF - (1 << 22); // for debug p->keepSizeBefore = historySize + keepAddBufferBefore + 1; - p->keepSizeAfter = matchMaxLen + keepAddBufferAfter; - - /* we need one additional byte, since we use MoveBlock after pos++ and before dictionary using */ - - if (LzInWindow_Create(p, sizeReserv, alloc)) + + keepAddBufferAfter += matchMaxLen; + /* we need (p->keepSizeAfter >= p->numHashBytes) */ + if (keepAddBufferAfter < p->numHashBytes) + keepAddBufferAfter = p->numHashBytes; + // keepAddBufferAfter -= 2; // for debug + p->keepSizeAfter = keepAddBufferAfter; + + if (p->directInput) + p->blockSize = 0; + if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc)) { - UInt32 newCyclicBufferSize = historySize + 1; - UInt32 hs; - p->matchMaxLen = matchMaxLen; + size_t hashSizeSum; { - p->fixedHashSize = 0; - if (p->numHashBytes == 2) - hs = (1 << 16) - 1; + UInt32 hs; + UInt32 hsCur; + + if (p->numHashOutBits != 0) + { + unsigned numBits = p->numHashOutBits; + const unsigned nbMax = + (p->numHashBytes == 2 ? 16 : + (p->numHashBytes == 3 ? 
24 : 32)); + if (numBits >= nbMax) + numBits = nbMax; + if (numBits >= 32) + hs = (UInt32)0 - 1; + else + hs = ((UInt32)1 << numBits) - 1; + // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) + hs |= (1 << 16) - 1; /* don't change it! */ + if (p->numHashBytes >= 5) + hs |= (256 << kLzHash_CrcShift_2) - 1; + { + const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize); + if (hs >= hs2) + hs = hs2; + } + hsCur = hs; + if (p->expectedDataSize < historySize) + { + const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize); + if (hsCur >= hs2) + hsCur = hs2; + } + } else { - hs = historySize; - if (hs > p->expectedDataSize) - hs = (UInt32)p->expectedDataSize; - if (hs != 0) - hs--; - hs |= (hs >> 1); - hs |= (hs >> 2); - hs |= (hs >> 4); - hs |= (hs >> 8); - hs >>= 1; - hs |= 0xFFFF; /* don't change it! It's required for Deflate */ - if (hs > (1 << 24)) + hs = MatchFinder_GetHashMask(p, historySize); + hsCur = hs; + if (p->expectedDataSize < historySize) { - if (p->numHashBytes == 3) - hs = (1 << 24) - 1; - else - hs >>= 1; - /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ + hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize); + if (hsCur >= hs) // is it possible? + hsCur = hs; } } - p->hashMask = hs; - hs++; - if (p->numHashBytes > 2) p->fixedHashSize += kHash2Size; - if (p->numHashBytes > 3) p->fixedHashSize += kHash3Size; - if (p->numHashBytes > 4) p->fixedHashSize += kHash4Size; - hs += p->fixedHashSize; + + p->hashMask = hsCur; + + hashSizeSum = hs; + hashSizeSum++; + if (hashSizeSum < hs) + return 0; + { + UInt32 fixedHashSize = 0; + if (p->numHashBytes > 2 && p->numHashBytes_Min <= 2) fixedHashSize += kHash2Size; + if (p->numHashBytes > 3 && p->numHashBytes_Min <= 3) fixedHashSize += kHash3Size; + // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size; + hashSizeSum += fixedHashSize; + p->fixedHashSize = fixedHashSize; + } } + p->matchMaxLen = matchMaxLen; + { size_t newSize; size_t numSons; + const UInt32 newCyclicBufferSize = historySize + 1; // do not change it p->historySize = historySize; - p->hashSizeSum = hs; - p->cyclicBufferSize = newCyclicBufferSize; + p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1) numSons = newCyclicBufferSize; if (p->btMode) numSons <<= 1; - newSize = hs + numSons; + newSize = hashSizeSum + numSons; + + if (numSons < newCyclicBufferSize || newSize < numSons) + return 0; + + // aligned size is not required here, but it can be better for some loops + #define NUM_REFS_ALIGN_MASK 0xF + newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK; - if (p->hash && p->numRefs == newSize) + // 22.02: we don't reallocate buffer, if old size is enough + if (p->hash && p->numRefs >= newSize) return 1; MatchFinder_FreeThisClassMemory(p, alloc); @@ -258,7 +486,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, if (p->hash) { - p->son = p->hash + p->hashSizeSum; + p->son = p->hash + hashSizeSum; return 1; } } @@ -268,33 +496,43 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, return 0; } + static void MatchFinder_SetLimits(CMatchFinder *p) { - UInt32 limit = kMaxValForNormalize - p->pos; - UInt32 limit2 = p->cyclicBufferSize - p->cyclicBufferPos; - - if (limit2 < limit) - limit = limit2; - limit2 = p->streamPos - p->pos; + UInt32 k; + UInt32 n = kMaxValForNormalize - p->pos; + if (n == 0) + n = (UInt32)(Int32)-1; // we allow (pos == 0) at start even with (kMaxValForNormalize == 0) - if (limit2 <= p->keepSizeAfter) 
+ k = p->cyclicBufferSize - p->cyclicBufferPos; + if (k < n) + n = k; + + k = GET_AVAIL_BYTES(p); { - if (limit2 > 0) - limit2 = 1; + const UInt32 ksa = p->keepSizeAfter; + UInt32 mm = p->matchMaxLen; + if (k > ksa) + k -= ksa; // we must limit exactly to keepSizeAfter for ReadBlock + else if (k >= mm) + { + // the limitation for (p->lenLimit) update + k -= mm; // optimization : to reduce the number of checks + k++; + // k = 1; // non-optimized version : for debug + } + else + { + mm = k; + if (k != 0) + k = 1; + } + p->lenLimit = mm; } - else - limit2 -= p->keepSizeAfter; - - if (limit2 < limit) - limit = limit2; + if (k < n) + n = k; - { - UInt32 lenLimit = p->streamPos - p->pos; - if (lenLimit > p->matchMaxLen) - lenLimit = p->matchMaxLen; - p->lenLimit = lenLimit; - } - p->posLimit = p->pos + limit; + p->posLimit = p->pos + n; } @@ -302,7 +540,7 @@ void MatchFinder_Init_LowHash(CMatchFinder *p) { size_t i; CLzRef *items = p->hash; - size_t numItems = p->fixedHashSize; + const size_t numItems = p->fixedHashSize; for (i = 0; i < numItems; i++) items[i] = kEmptyHashValue; } @@ -312,72 +550,325 @@ void MatchFinder_Init_HighHash(CMatchFinder *p) { size_t i; CLzRef *items = p->hash + p->fixedHashSize; - size_t numItems = (size_t)p->hashMask + 1; + const size_t numItems = (size_t)p->hashMask + 1; for (i = 0; i < numItems; i++) items[i] = kEmptyHashValue; } -void MatchFinder_Init_3(CMatchFinder *p, int readData) +void MatchFinder_Init_4(CMatchFinder *p) { - p->cyclicBufferPos = 0; - p->buffer = p->bufferBase; - p->pos = - p->streamPos = p->cyclicBufferSize; + if (!p->directInput) + p->buffer = p->bufBase; + { + /* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker. + the code in CMatchFinderMt expects (pos = 1) */ + p->pos = + p->streamPos = + 1; // it's smallest optimal value. do not change it + // 0; // for debug + } p->result = SZ_OK; p->streamEndWasReached = 0; - - if (readData) - MatchFinder_ReadBlock(p); - - MatchFinder_SetLimits(p); } -void MatchFinder_Init(CMatchFinder *p) +// (CYC_TO_POS_OFFSET == 0) is expected by some optimized code +#define CYC_TO_POS_OFFSET 0 +// #define CYC_TO_POS_OFFSET 1 // for debug + +void MatchFinder_Init(void *_p) { + CMatchFinder *p = (CMatchFinder *)_p; MatchFinder_Init_HighHash(p); MatchFinder_Init_LowHash(p); - MatchFinder_Init_3(p, True); + MatchFinder_Init_4(p); + // if (readData) + MatchFinder_ReadBlock(p); + + /* if we init (cyclicBufferPos = pos), then we can use one variable + instead of both (cyclicBufferPos) and (pos) : only before (cyclicBufferPos) wrapping */ + p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); // init with relation to (pos) + // p->cyclicBufferPos = 0; // smallest value + // p->son[0] = p->son[1] = 0; // unused: we can init skipped record for speculated accesses. 
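/* Editorial sketch (not part of the patch): the usual lifecycle of a match finder,
   using only functions declared in this file; inStream, dictSize, keepBefore,
   matchMaxLen, keepAfter and alloc are caller-supplied placeholders:

     CMatchFinder mf;
     MatchFinder_Construct(&mf);
     mf.stream = inStream;                               // ISeqInStreamPtr
     if (MatchFinder_Create(&mf, dictSize, keepBefore, matchMaxLen, keepAfter, alloc))
     {
       MatchFinder_Init(&mf);
       // ... encoder loop: query matches / skip positions ...
     }
     MatchFinder_Free(&mf, alloc);
*/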
+ MatchFinder_SetLimits(p); } - -static UInt32 MatchFinder_GetSubValue(CMatchFinder *p) + + +#ifdef MY_CPU_X86_OR_AMD64 + #if defined(__clang__) && (__clang_major__ >= 4) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) + // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) + + #define USE_LZFIND_SATUR_SUB_128 + #define USE_LZFIND_SATUR_SUB_256 + #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("sse4.1"))) + #define LZFIND_ATTRIB_AVX2 __attribute__((__target__("avx2"))) + #elif defined(_MSC_VER) + #if (_MSC_VER >= 1600) + #define USE_LZFIND_SATUR_SUB_128 + #endif + #if (_MSC_VER >= 1900) + #define USE_LZFIND_SATUR_SUB_256 + #endif + #endif + +#elif defined(MY_CPU_ARM64) \ + /* || (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) */ + + #if defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) + #define USE_LZFIND_SATUR_SUB_128 + #ifdef MY_CPU_ARM64 + // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__(""))) + #else + #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=neon"))) + #endif + + #elif defined(_MSC_VER) + #if (_MSC_VER >= 1910) + #define USE_LZFIND_SATUR_SUB_128 + #endif + #endif + + #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) + #include + #else + #include + #endif + +#endif + + +#ifdef USE_LZFIND_SATUR_SUB_128 + +// #define Z7_SHOW_HW_STATUS + +#ifdef Z7_SHOW_HW_STATUS +#include +#define PRF(x) x +PRF(;) +#else +#define PRF(x) +#endif + + +#ifdef MY_CPU_ARM_OR_ARM64 + +#ifdef MY_CPU_ARM64 +// #define FORCE_LZFIND_SATUR_SUB_128 +#endif +typedef uint32x4_t LzFind_v128; +#define SASUB_128_V(v, s) \ + vsubq_u32(vmaxq_u32(v, s), s) + +#else // MY_CPU_ARM_OR_ARM64 + +#include // sse4.1 + +typedef __m128i LzFind_v128; +// SSE 4.1 +#define SASUB_128_V(v, s) \ + _mm_sub_epi32(_mm_max_epu32(v, s), s) + +#endif // MY_CPU_ARM_OR_ARM64 + + +#define SASUB_128(i) \ + *( LzFind_v128 *)( void *)(items + (i) * 4) = SASUB_128_V( \ + *(const LzFind_v128 *)(const void *)(items + (i) * 4), sub2); + + +Z7_NO_INLINE +static +#ifdef LZFIND_ATTRIB_SSE41 +LZFIND_ATTRIB_SSE41 +#endif +void +Z7_FASTCALL +LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim) { - return (p->pos - p->historySize - 1) & kNormalizeMask; + const LzFind_v128 sub2 = + #ifdef MY_CPU_ARM_OR_ARM64 + vdupq_n_u32(subValue); + #else + _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); + #endif + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SASUB_128(0) SASUB_128(1) items += 2 * 4; + SASUB_128(0) SASUB_128(1) items += 2 * 4; + } + while (items != lim); } -void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) + + +#ifdef USE_LZFIND_SATUR_SUB_256 + +#include // avx +/* +clang :immintrin.h uses +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX2__) +#include +#endif +so we need for clang-cl */ + +#if defined(__clang__) +#include +#include +#endif + +// AVX2: +#define SASUB_256(i) \ + *( __m256i *)( void *)(items + (i) * 8) = \ + _mm256_sub_epi32(_mm256_max_epu32( \ + *(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2); + +Z7_NO_INLINE +static +#ifdef LZFIND_ATTRIB_AVX2 +LZFIND_ATTRIB_AVX2 +#endif +void +Z7_FASTCALL +LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim) { - size_t i; - for (i = 0; i < numItems; i++) + const __m256i sub2 = _mm256_set_epi32( + (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue, + (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); + 
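  /* Editorial note (not part of the patch): each 32-bit lane of this loop performs the
     same saturating update as the scalar SASUB_32 macro defined below, roughly

       items[i] = (items[i] < subValue) ? 0 : (items[i] - subValue);

     i.e. max(v, subValue) - subValue, which clamps stale references to
     kEmptyHashValue (0) instead of letting them wrap during normalization. */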
Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do { - UInt32 value = items[i]; - if (value <= subValue) - value = kEmptyHashValue; - else - value -= subValue; - items[i] = value; + SASUB_256(0) SASUB_256(1) items += 2 * 8; + SASUB_256(0) SASUB_256(1) items += 2 * 8; } + while (items != lim); } +#endif // USE_LZFIND_SATUR_SUB_256 + +#ifndef FORCE_LZFIND_SATUR_SUB_128 +typedef void (Z7_FASTCALL *LZFIND_SATUR_SUB_CODE_FUNC)( + UInt32 subValue, CLzRef *items, const CLzRef *lim); +static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub; +#endif // FORCE_LZFIND_SATUR_SUB_128 + +#endif // USE_LZFIND_SATUR_SUB_128 + + +// kEmptyHashValue must be zero +// #define SASUB_32(i) { UInt32 v = items[i]; UInt32 m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; } +#define SASUB_32(i) { UInt32 v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; } + +#ifdef FORCE_LZFIND_SATUR_SUB_128 -static void MatchFinder_Normalize(CMatchFinder *p) +#define DEFAULT_SaturSub LzFind_SaturSub_128 + +#else + +#define DEFAULT_SaturSub LzFind_SaturSub_32 + +Z7_NO_INLINE +static +void +Z7_FASTCALL +LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim) { - UInt32 subValue = MatchFinder_GetSubValue(p); - MatchFinder_Normalize3(subValue, p->hash, p->numRefs); - MatchFinder_ReduceOffsets(p, subValue); + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SASUB_32(0) SASUB_32(1) items += 2; + SASUB_32(0) SASUB_32(1) items += 2; + SASUB_32(0) SASUB_32(1) items += 2; + SASUB_32(0) SASUB_32(1) items += 2; + } + while (items != lim); } +#endif + -MY_NO_INLINE +Z7_NO_INLINE +void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) +{ + #define LZFIND_NORM_ALIGN_BLOCK_SIZE (1 << 7) + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (LZFIND_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--) + { + SASUB_32(0) + items++; + } + { + const size_t k_Align_Mask = (LZFIND_NORM_ALIGN_BLOCK_SIZE / 4 - 1); + CLzRef *lim = items + (numItems & ~(size_t)k_Align_Mask); + numItems &= k_Align_Mask; + if (items != lim) + { + #if defined(USE_LZFIND_SATUR_SUB_128) && !defined(FORCE_LZFIND_SATUR_SUB_128) + if (g_LzFind_SaturSub) + g_LzFind_SaturSub(subValue, items, lim); + else + #endif + DEFAULT_SaturSub(subValue, items, lim); + } + items = lim; + } + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0; numItems--) + { + SASUB_32(0) + items++; + } +} + + + +// call MatchFinder_CheckLimits() only after (p->pos++) update + +Z7_NO_INLINE static void MatchFinder_CheckLimits(CMatchFinder *p) { + if (// !p->streamEndWasReached && p->result == SZ_OK && + p->keepSizeAfter == GET_AVAIL_BYTES(p)) + { + // we try to read only in exact state (p->keepSizeAfter == GET_AVAIL_BYTES(p)) + if (MatchFinder_NeedMove(p)) + MatchFinder_MoveBlock(p); + MatchFinder_ReadBlock(p); + } + if (p->pos == kMaxValForNormalize) - MatchFinder_Normalize(p); - if (!p->streamEndWasReached && p->keepSizeAfter == p->streamPos - p->pos) - MatchFinder_CheckAndMoveAndRead(p); + if (GET_AVAIL_BYTES(p) >= p->numHashBytes) // optional optimization for last bytes of data. + /* + if we disable normalization for last bytes of data, and + if (data_size == 4 GiB), we don't call wastfull normalization, + but (pos) will be wrapped over Zero (0) in that case. 
+ And we cannot resume later to normal operation + */ + { + // MatchFinder_Normalize(p); + /* after normalization we need (p->pos >= p->historySize + 1); */ + /* we can reduce subValue to aligned value, if want to keep alignment + of (p->pos) and (p->buffer) for speculated accesses. */ + const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */; + // const UInt32 subValue = (1 << 15); // for debug + // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue); + MatchFinder_REDUCE_OFFSETS(p, subValue) + MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashMask + 1 + p->fixedHashSize); + { + size_t numSonRefs = p->cyclicBufferSize; + if (p->btMode) + numSonRefs <<= 1; + MatchFinder_Normalize3(subValue, p->son, numSonRefs); + } + } + if (p->cyclicBufferPos == p->cyclicBufferSize) p->cyclicBufferPos = 0; + MatchFinder_SetLimits(p); } @@ -385,10 +876,10 @@ static void MatchFinder_CheckLimits(CMatchFinder *p) /* (lenLimit > maxLen) */ -MY_FORCE_INLINE -static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, - UInt32 *distances, unsigned maxLen) +Z7_FORCE_INLINE +static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, + UInt32 *d, unsigned maxLen) { /* son[_cyclicBufferPos] = curMatch; @@ -396,10 +887,10 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos { UInt32 delta = pos - curMatch; if (cutValue-- == 0 || delta >= _cyclicBufferSize) - return distances; + return d; { const Byte *pb = cur - delta; - curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; + curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)]; if (pb[maxLen] == cur[maxLen] && *pb == *cur) { UInt32 len = 0; @@ -409,10 +900,10 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos if (maxLen < len) { maxLen = len; - *distances++ = len; - *distances++ = delta - 1; + *d++ = len; + *d++ = delta - 1; if (len == lenLimit) - return distances; + return d; } } } @@ -421,35 +912,41 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos const Byte *lim = cur + lenLimit; son[_cyclicBufferPos] = curMatch; + do { - UInt32 delta = pos - curMatch; + UInt32 delta; + + if (curMatch == 0) + break; + // if (curMatch2 >= curMatch) return NULL; + delta = pos - curMatch; if (delta >= _cyclicBufferSize) break; { ptrdiff_t diff; - curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; - diff = (ptrdiff_t)0 - delta; - if (cur[maxLen] == cur[maxLen + diff]) + curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? 
_cyclicBufferSize : 0)]; + diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff]) { const Byte *c = cur; while (*c == c[diff]) { if (++c == lim) { - distances[0] = (UInt32)(lim - cur); - distances[1] = delta - 1; - return distances + 2; + d[0] = (UInt32)(lim - cur); + d[1] = delta - 1; + return d + 2; } } { - unsigned len = (unsigned)(c - cur); + const unsigned len = (unsigned)(c - cur); if (maxLen < len) { maxLen = len; - distances[0] = (UInt32)len; - distances[1] = delta - 1; - distances += 2; + d[0] = (UInt32)len; + d[1] = delta - 1; + d += 2; } } } @@ -457,31 +954,36 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos } while (--cutValue); - return distances; + return d; } -MY_FORCE_INLINE +Z7_FORCE_INLINE UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, - UInt32 *distances, UInt32 maxLen) + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, + UInt32 *d, UInt32 maxLen) { CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); unsigned len0 = 0, len1 = 0; - for (;;) + + UInt32 cmCheck; + + // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } + + cmCheck = (UInt32)(pos - _cyclicBufferSize); + if ((UInt32)pos < _cyclicBufferSize) + cmCheck = 0; + + if (cmCheck < curMatch) + do { - UInt32 delta = pos - curMatch; - if (cutValue-- == 0 || delta >= _cyclicBufferSize) - { - *ptr0 = *ptr1 = kEmptyHashValue; - return distances; - } + const UInt32 delta = pos - curMatch; { - CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); + CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1); const Byte *pb = cur - delta; unsigned len = (len0 < len1 ? 
len0 : len1); - UInt32 pair0 = pair[0]; + const UInt32 pair0 = pair[0]; if (pb[len] == cur[len]) { if (++len != lenLimit && pb[len] == cur[len]) @@ -491,50 +993,62 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt if (maxLen < len) { maxLen = (UInt32)len; - *distances++ = (UInt32)len; - *distances++ = delta - 1; + *d++ = (UInt32)len; + *d++ = delta - 1; if (len == lenLimit) { *ptr1 = pair0; *ptr0 = pair[1]; - return distances; + return d; } } } if (pb[len] < cur[len]) { *ptr1 = curMatch; + // const UInt32 curMatch2 = pair[1]; + // if (curMatch2 >= curMatch) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } + // curMatch = curMatch2; + curMatch = pair[1]; ptr1 = pair + 1; - curMatch = *ptr1; len1 = len; } else { *ptr0 = curMatch; + curMatch = pair[0]; ptr0 = pair; - curMatch = *ptr0; len0 = len; } } } + while(--cutValue && cmCheck < curMatch); + + *ptr0 = *ptr1 = kEmptyHashValue; + return d; } + static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue) + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue) { CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); unsigned len0 = 0, len1 = 0; - for (;;) + + UInt32 cmCheck; + + cmCheck = (UInt32)(pos - _cyclicBufferSize); + if ((UInt32)pos < _cyclicBufferSize) + cmCheck = 0; + + if (// curMatch >= pos || // failure + cmCheck < curMatch) + do { - UInt32 delta = pos - curMatch; - if (cutValue-- == 0 || delta >= _cyclicBufferSize) + const UInt32 delta = pos - curMatch; { - *ptr0 = *ptr1 = kEmptyHashValue; - return; - } - { - CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); + CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1); const Byte *pb = cur - delta; unsigned len = (len0 < len1 ? len0 : len1); if (pb[len] == cur[len]) @@ -554,84 +1068,122 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const if (pb[len] < cur[len]) { *ptr1 = curMatch; + curMatch = pair[1]; ptr1 = pair + 1; - curMatch = *ptr1; len1 = len; } else { *ptr0 = curMatch; + curMatch = pair[0]; ptr0 = pair; - curMatch = *ptr0; len0 = len; } } } + while(--cutValue && cmCheck < curMatch); + + *ptr0 = *ptr1 = kEmptyHashValue; + return; } + #define MOVE_POS \ - ++p->cyclicBufferPos; \ + p->cyclicBufferPos++; \ p->buffer++; \ - if (++p->pos == p->posLimit) MatchFinder_CheckLimits(p); + { const UInt32 pos1 = p->pos + 1; \ + p->pos = pos1; \ + if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); } -#define MOVE_POS_RET MOVE_POS return (UInt32)offset; +#define MOVE_POS_RET MOVE_POS return distances; -static void MatchFinder_MovePos(CMatchFinder *p) { MOVE_POS; } +Z7_NO_INLINE +static void MatchFinder_MovePos(CMatchFinder *p) +{ + /* we go here at the end of stream data, when (avail < num_hash_bytes) + We don't update sons[cyclicBufferPos << btMode]. + So (sons) record will contain junk. And we cannot resume match searching + to normal operation, even if we will provide more input data in buffer. 
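// Hc_GetMatchesSpec, GetMatchesSpec1 and SkipMatchesSpec above all locate the
// previous node with the same wrap-around expression over the circular son[]
// array. The expression in isolation, with a toy ring size rather than a real
// dictionary; illustration only.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// Step delta slots back from cyclicBufferPos inside a ring of cyclicBufferSize
// entries. The intermediate underflow is harmless: the arithmetic is unsigned
// and the conditionally added cyclicBufferSize cancels it (callers guarantee
// delta < cyclicBufferSize).
static size_t cyclic_index(size_t cyclicBufferPos, uint32_t delta, uint32_t cyclicBufferSize)
{
    return cyclicBufferPos - delta + (cyclicBufferPos < delta ? cyclicBufferSize : 0);
}

int main(void)
{
    const uint32_t size = 8;                    // toy ring, not an SDK value
    printf("%zu\n", cyclic_index(5, 3, size));  // 5 - 3 = 2, no wrap needed
    printf("%zu\n", cyclic_index(2, 6, size));  // 2 - 6 + 8 = 4, wrapped
    return 0;
}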
+ p->sons[p->cyclicBufferPos << p->btMode] = 0; // kEmptyHashValue + if (p->btMode) + p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0; // kEmptyHashValue + */ + MOVE_POS +} #define GET_MATCHES_HEADER2(minLen, ret_op) \ - unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \ - lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ + UInt32 hv; const Byte *cur; UInt32 curMatch; \ + UInt32 lenLimit = p->lenLimit; \ + if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; } \ cur = p->buffer; -#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return 0) -#define SKIP_HEADER(minLen) GET_MATCHES_HEADER2(minLen, continue) +#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances) +#define SKIP_HEADER(minLen) \ + do { GET_MATCHES_HEADER2(minLen, continue) + +#define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, \ + p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue + +#define SKIP_FOOTER \ + SkipMatchesSpec(MF_PARAMS(p)); \ + MOVE_POS \ + } while (--num); + +#define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \ + distances = func(MF_PARAMS(p), distances, (UInt32)_maxLen_); \ + MOVE_POS_RET + +#define GET_MATCHES_FOOTER_BT(_maxLen_) \ + GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1) -#define MF_PARAMS(p) p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue +#define GET_MATCHES_FOOTER_HC(_maxLen_) \ + GET_MATCHES_FOOTER_BASE(_maxLen_, Hc_GetMatchesSpec) -#define GET_MATCHES_FOOTER(offset, maxLen) \ - offset = (unsigned)(GetMatchesSpec1((UInt32)lenLimit, curMatch, MF_PARAMS(p), \ - distances + offset, (UInt32)maxLen) - distances); MOVE_POS_RET; -#define SKIP_FOOTER \ - SkipMatchesSpec((UInt32)lenLimit, curMatch, MF_PARAMS(p)); MOVE_POS; #define UPDATE_maxLen { \ - ptrdiff_t diff = (ptrdiff_t)0 - d2; \ + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \ const Byte *c = cur + maxLen; \ const Byte *lim = cur + lenLimit; \ for (; c != lim; c++) if (*(c + diff) != *c) break; \ maxLen = (unsigned)(c - cur); } -static UInt32 Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +static UInt32* Bt2_MatchFinder_GetMatches(void *_p, UInt32 *distances) { - unsigned offset; + CMatchFinder *p = (CMatchFinder *)_p; GET_MATCHES_HEADER(2) - HASH2_CALC; + HASH2_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; - offset = 0; - GET_MATCHES_FOOTER(offset, 1) + GET_MATCHES_FOOTER_BT(1) } -UInt32 Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { - unsigned offset; GET_MATCHES_HEADER(3) - HASH_ZIP_CALC; + HASH_ZIP_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; - offset = 0; - GET_MATCHES_FOOTER(offset, 2) + GET_MATCHES_FOOTER_BT(2) } -static UInt32 Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +#define SET_mmm \ + mmm = p->cyclicBufferSize; \ + if (pos < mmm) \ + mmm = pos; + + +static UInt32* Bt3_MatchFinder_GetMatches(void *_p, UInt32 *distances) { + CMatchFinder *p = (CMatchFinder *)_p; + UInt32 mmm; UInt32 h2, d2, pos; - unsigned maxLen, offset; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(3) - HASH3_CALC; + HASH3_CALC hash = p->hash; pos = p->pos; @@ -643,167 +1195,180 @@ static UInt32 Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) hash[h2] = pos; (hash + kFix3HashSize)[hv] = pos; + SET_mmm + maxLen = 2; - offset = 0; - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + if (d2 < mmm && *(cur - d2) == *cur) { UPDATE_maxLen 
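// UPDATE_maxLen above extends a candidate match byte by byte against the data
// at distance d2, never past lenLimit. A freestanding version of that step on
// toy data, outside the match-finder state; illustration only.

#include <stddef.h>
#include <stdio.h>

static unsigned ExtendMatch(const unsigned char *cur, size_t d2,
                            unsigned maxLen, unsigned lenLimit)
{
    const unsigned char *c = cur + maxLen;
    const unsigned char *lim = cur + lenLimit;
    for (; c != lim; c++)
        if (*(c - d2) != *c)     // compare with the byte d2 positions earlier
            break;
    return (unsigned)(c - cur);
}

int main(void)
{
    // In "abcabcabx", the position of the second 'a' (offset 3) matches the
    // start of the buffer at distance 3 for 5 bytes before 'x' breaks the run.
    const unsigned char data[] = "abcabcabx";
    printf("match length = %u\n", ExtendMatch(data + 3, 3, 0, 6));  // 5
    return 0;
}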
distances[0] = (UInt32)maxLen; distances[1] = d2 - 1; - offset = 2; + distances += 2; if (maxLen == lenLimit) { - SkipMatchesSpec((UInt32)lenLimit, curMatch, MF_PARAMS(p)); - MOVE_POS_RET; + SkipMatchesSpec(MF_PARAMS(p)); + MOVE_POS_RET } } - GET_MATCHES_FOOTER(offset, maxLen) + GET_MATCHES_FOOTER_BT(maxLen) } -static UInt32 Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +static UInt32* Bt4_MatchFinder_GetMatches(void *_p, UInt32 *distances) { + CMatchFinder *p = (CMatchFinder *)_p; + UInt32 mmm; UInt32 h2, h3, d2, d3, pos; - unsigned maxLen, offset; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(4) - HASH4_CALC; + HASH4_CALC hash = p->hash; pos = p->pos; d2 = pos - hash [h2]; d3 = pos - (hash + kFix3HashSize)[h3]; - curMatch = (hash + kFix4HashSize)[hv]; hash [h2] = pos; (hash + kFix3HashSize)[h3] = pos; (hash + kFix4HashSize)[hv] = pos; - maxLen = 0; - offset = 0; - - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) - { - maxLen = 2; - distances[0] = 2; - distances[1] = d2 - 1; - offset = 2; - } + SET_mmm + + maxLen = 3; - if (d2 != d3 && d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + for (;;) { - maxLen = 3; - distances[(size_t)offset + 1] = d3 - 1; - offset += 2; - d2 = d3; - } + if (d2 < mmm && *(cur - d2) == *cur) + { + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + // distances[-2] = 3; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; - if (offset != 0) - { UPDATE_maxLen - distances[(size_t)offset - 2] = (UInt32)maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { - SkipMatchesSpec((UInt32)lenLimit, curMatch, MF_PARAMS(p)); - MOVE_POS_RET; + SkipMatchesSpec(MF_PARAMS(p)); + MOVE_POS_RET } + break; } - if (maxLen < 3) - maxLen = 3; - - GET_MATCHES_FOOTER(offset, maxLen) + GET_MATCHES_FOOTER_BT(maxLen) } -/* -static UInt32 Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +static UInt32* Bt5_MatchFinder_GetMatches(void *_p, UInt32 *distances) { - UInt32 h2, h3, h4, d2, d3, d4, maxLen, offset, pos; + CMatchFinder *p = (CMatchFinder *)_p; + UInt32 mmm; + UInt32 h2, h3, d2, d3, pos; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(5) - HASH5_CALC; + HASH5_CALC hash = p->hash; pos = p->pos; d2 = pos - hash [h2]; d3 = pos - (hash + kFix3HashSize)[h3]; - d4 = pos - (hash + kFix4HashSize)[h4]; + // d4 = pos - (hash + kFix4HashSize)[h4]; curMatch = (hash + kFix5HashSize)[hv]; hash [h2] = pos; (hash + kFix3HashSize)[h3] = pos; - (hash + kFix4HashSize)[h4] = pos; + // (hash + kFix4HashSize)[h4] = pos; (hash + kFix5HashSize)[hv] = pos; - maxLen = 0; - offset = 0; + SET_mmm + + maxLen = 4; - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + for (;;) { - distances[0] = maxLen = 2; - distances[1] = d2 - 1; - offset = 2; - if (*(cur - d2 + 2) == cur[2]) - distances[0] = maxLen = 3; - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + if (d2 < mmm && *(cur - d2) == *cur) { - distances[2] = maxLen = 3; - distances[3] = d3 - 1; - offset = 4; + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; + d2 = d3; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; d2 = d3; } - } - 
else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) - { - distances[0] = maxLen = 3; - distances[1] = d3 - 1; - offset = 2; - d2 = d3; - } - - if (d2 != d4 && d4 < p->cyclicBufferSize - && *(cur - d4) == *cur - && *(cur - d4 + 3) == *(cur + 3)) - { - maxLen = 4; - distances[(size_t)offset + 1] = d4 - 1; - offset += 2; - d2 = d4; - } - - if (offset != 0) - { + else + break; + + distances[-2] = 3; + if (*(cur - d2 + 3) != cur[3]) + break; UPDATE_maxLen - distances[(size_t)offset - 2] = maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { - SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p)); - MOVE_POS_RET; + SkipMatchesSpec(MF_PARAMS(p)); + MOVE_POS_RET } + break; } - - if (maxLen < 4) - maxLen = 4; - GET_MATCHES_FOOTER(offset, maxLen) + GET_MATCHES_FOOTER_BT(maxLen) } -*/ -static UInt32 Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +static UInt32* Hc4_MatchFinder_GetMatches(void *_p, UInt32 *distances) { + CMatchFinder *p = (CMatchFinder *)_p; + UInt32 mmm; UInt32 h2, h3, d2, d3, pos; - unsigned maxLen, offset; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(4) - HASH4_CALC; + HASH4_CALC hash = p->hash; pos = p->pos; @@ -816,312 +1381,366 @@ static UInt32 Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) (hash + kFix3HashSize)[h3] = pos; (hash + kFix4HashSize)[hv] = pos; - maxLen = 0; - offset = 0; + SET_mmm - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) - { - maxLen = 2; - distances[0] = 2; - distances[1] = d2 - 1; - offset = 2; - } - - if (d2 != d3 && d3 < p->cyclicBufferSize && *(cur - d3) == *cur) - { - maxLen = 3; - distances[(size_t)offset + 1] = d3 - 1; - offset += 2; - d2 = d3; - } - - if (offset != 0) + maxLen = 3; + + for (;;) { + if (d2 < mmm && *(cur - d2) == *cur) + { + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + // distances[-2] = 3; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; + UPDATE_maxLen - distances[(size_t)offset - 2] = (UInt32)maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS_RET; + MOVE_POS_RET } + break; } - if (maxLen < 3) - maxLen = 3; - - offset = (unsigned)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p), - distances + offset, maxLen) - (distances)); - MOVE_POS_RET + GET_MATCHES_FOOTER_HC(maxLen) } -/* -static UInt32 Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +static UInt32 * Hc5_MatchFinder_GetMatches(void *_p, UInt32 *distances) { - UInt32 h2, h3, h4, d2, d3, d4, maxLen, offset, pos + CMatchFinder *p = (CMatchFinder *)_p; + UInt32 mmm; + UInt32 h2, h3, d2, d3, pos; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(5) - HASH5_CALC; + HASH5_CALC hash = p->hash; pos = p->pos; - + d2 = pos - hash [h2]; d3 = pos - (hash + kFix3HashSize)[h3]; - d4 = pos - (hash + kFix4HashSize)[h4]; + // d4 = pos - (hash + kFix4HashSize)[h4]; curMatch = (hash + kFix5HashSize)[hv]; hash [h2] = pos; (hash + kFix3HashSize)[h3] = pos; - (hash + kFix4HashSize)[h4] = pos; + // (hash + kFix4HashSize)[h4] = pos; (hash + kFix5HashSize)[hv] = pos; - maxLen = 0; - offset = 0; + SET_mmm + + maxLen = 4; - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + for (;;) { - distances[0] = maxLen = 2; - distances[1] = d2 - 1; - offset = 2; - if (*(cur - d2 + 2) == cur[2]) - 
distances[0] = maxLen = 3; - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + if (d2 < mmm && *(cur - d2) == *cur) { - distances[2] = maxLen = 3; - distances[3] = d3 - 1; - offset = 4; + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; + d2 = d3; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; d2 = d3; } - } - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) - { - distances[0] = maxLen = 3; - distances[1] = d3 - 1; - offset = 2; - d2 = d3; - } - - if (d2 != d4 && d4 < p->cyclicBufferSize - && *(cur - d4) == *cur - && *(cur - d4 + 3) == *(cur + 3)) - { - maxLen = 4; - distances[(size_t)offset + 1] = d4 - 1; - offset += 2; - d2 = d4; - } - - if (offset != 0) - { + else + break; + + distances[-2] = 3; + if (*(cur - d2 + 3) != cur[3]) + break; UPDATE_maxLen - distances[(size_t)offset - 2] = maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS_RET; + MOVE_POS_RET } + break; } - if (maxLen < 4) - maxLen = 4; - - offset = (UInt32)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p), - distances + offset, maxLen) - (distances)); - MOVE_POS_RET + GET_MATCHES_FOOTER_HC(maxLen) } -*/ -UInt32 Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { - unsigned offset; GET_MATCHES_HEADER(3) - HASH_ZIP_CALC; + HASH_ZIP_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; - offset = (unsigned)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p), - distances, 2) - (distances)); - MOVE_POS_RET + GET_MATCHES_FOOTER_HC(2) } -static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num) + +static void Bt2_MatchFinder_Skip(void *_p, UInt32 num) { - do + CMatchFinder *p = (CMatchFinder *)_p; + SKIP_HEADER(2) { - SKIP_HEADER(2) - HASH2_CALC; + HASH2_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do + SKIP_HEADER(3) { - SKIP_HEADER(3) - HASH_ZIP_CALC; + HASH_ZIP_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } -static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +static void Bt3_MatchFinder_Skip(void *_p, UInt32 num) { - do + CMatchFinder *p = (CMatchFinder *)_p; + SKIP_HEADER(3) { UInt32 h2; UInt32 *hash; - SKIP_HEADER(3) - HASH3_CALC; + HASH3_CALC hash = p->hash; curMatch = (hash + kFix3HashSize)[hv]; hash[h2] = (hash + kFix3HashSize)[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } -static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +static void Bt4_MatchFinder_Skip(void *_p, UInt32 num) { - do + CMatchFinder *p = (CMatchFinder *)_p; + SKIP_HEADER(4) { UInt32 h2, h3; UInt32 *hash; - SKIP_HEADER(4) - HASH4_CALC; + HASH4_CALC hash = p->hash; curMatch = (hash + kFix4HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = (hash + kFix4HashSize)[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } -/* -static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +static void Bt5_MatchFinder_Skip(void *_p, UInt32 num) { - do + CMatchFinder *p = (CMatchFinder *)_p; + SKIP_HEADER(5) { - UInt32 h2, h3, h4; + UInt32 h2, h3; UInt32 *hash; - SKIP_HEADER(5) - HASH5_CALC; + HASH5_CALC hash = p->hash; curMatch = (hash + 
kFix5HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = - (hash + kFix4HashSize)[h4] = + // (hash + kFix4HashSize)[h4] = (hash + kFix5HashSize)[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } -*/ -static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) + +#define HC_SKIP_HEADER(minLen) \ + do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \ + const Byte *cur; \ + UInt32 *hash; \ + UInt32 *son; \ + UInt32 pos = p->pos; \ + UInt32 num2 = num; \ + /* (p->pos == p->posLimit) is not allowed here !!! */ \ + { const UInt32 rem = p->posLimit - pos; if (num2 >= rem) num2 = rem; } \ + num -= num2; \ + { const UInt32 cycPos = p->cyclicBufferPos; \ + son = p->son + cycPos; \ + p->cyclicBufferPos = cycPos + num2; } \ + cur = p->buffer; \ + hash = p->hash; \ + do { \ + UInt32 curMatch; \ + UInt32 hv; + + +#define HC_SKIP_FOOTER \ + cur++; pos++; *son++ = curMatch; \ + } while (--num2); \ + p->buffer = cur; \ + p->pos = pos; \ + if (pos == p->posLimit) MatchFinder_CheckLimits(p); \ + }} while(num); \ + + +static void Hc4_MatchFinder_Skip(void *_p, UInt32 num) { - do - { + CMatchFinder *p = (CMatchFinder *)_p; + HC_SKIP_HEADER(4) + UInt32 h2, h3; - UInt32 *hash; - SKIP_HEADER(4) - HASH4_CALC; - hash = p->hash; + HASH4_CALC curMatch = (hash + kFix4HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = - (hash + kFix4HashSize)[hv] = p->pos; - p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS - } - while (--num != 0); + (hash + kFix4HashSize)[hv] = pos; + + HC_SKIP_FOOTER } -/* -static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) + +static void Hc5_MatchFinder_Skip(void *_p, UInt32 num) { - do - { - UInt32 h2, h3, h4; - UInt32 *hash; - SKIP_HEADER(5) - HASH5_CALC; - hash = p->hash; - curMatch = hash + kFix5HashSize)[hv]; + CMatchFinder *p = (CMatchFinder *)_p; + HC_SKIP_HEADER(5) + + UInt32 h2, h3; + HASH5_CALC + curMatch = (hash + kFix5HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = - (hash + kFix4HashSize)[h4] = - (hash + kFix5HashSize)[hv] = p->pos; - p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS - } - while (--num != 0); + // (hash + kFix4HashSize)[h4] = + (hash + kFix5HashSize)[hv] = pos; + + HC_SKIP_FOOTER } -*/ + void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do - { - SKIP_HEADER(3) - HASH_ZIP_CALC; - curMatch = p->hash[hv]; - p->hash[hv] = p->pos; - p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS - } - while (--num != 0); + HC_SKIP_HEADER(3) + + HASH_ZIP_CALC + curMatch = hash[hv]; + hash[hv] = pos; + + HC_SKIP_FOOTER } -void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable) + +void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable) { - vTable->Init = (Mf_Init_Func)MatchFinder_Init; - vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes; - vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos; + vTable->Init = MatchFinder_Init; + vTable->GetNumAvailableBytes = MatchFinder_GetNumAvailableBytes; + vTable->GetPointerToCurrentPos = MatchFinder_GetPointerToCurrentPos; if (!p->btMode) { - /* if (p->numHashBytes <= 4) */ + if (p->numHashBytes <= 4) { - vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip; + vTable->GetMatches = Hc4_MatchFinder_GetMatches; + vTable->Skip = Hc4_MatchFinder_Skip; } - /* else { - vTable->GetMatches = (Mf_GetMatches_Func)Hc5_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Hc5_MatchFinder_Skip; + 
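// The Bt*/Hc* functions above now return the advanced output pointer rather
// than a pair count; each record is a (length, distance - 1) pair, appended in
// order of strictly increasing length. A toy caller walking such a result,
// with a hand-written stand-in instead of a real CMatchFinder; illustration only.

#include <stdint.h>
#include <stdio.h>

// Mirrors the GetMatches calling convention above: append pairs to d and
// return the new end pointer.
typedef uint32_t * (*GetMatchesFunc)(void *object, uint32_t *d);

// Fake finder: pretends the current position matches 3 bytes at distance 7
// and 6 bytes at distance 100.
static uint32_t *ToyGetMatches(void *object, uint32_t *d)
{
    (void)object;
    d[0] = 3;  d[1] = 7 - 1;
    d[2] = 6;  d[3] = 100 - 1;
    return d + 4;
}

int main(void)
{
    uint32_t buf[16];
    GetMatchesFunc getMatches = ToyGetMatches;
    const uint32_t *end = getMatches(NULL, buf);
    const uint32_t *p;
    for (p = buf; p + 2 <= end; p += 2)
        printf("len=%u dist=%u\n", (unsigned)p[0], (unsigned)(p[1] + 1));
    return 0;
}

// The real consumer of these pairs is the encoder in LzmaEnc.c, reached
// through the IMatchFinder2 table that MatchFinder_CreateVTable fills in here.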
vTable->GetMatches = Hc5_MatchFinder_GetMatches; + vTable->Skip = Hc5_MatchFinder_Skip; } - */ } else if (p->numHashBytes == 2) { - vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip; + vTable->GetMatches = Bt2_MatchFinder_GetMatches; + vTable->Skip = Bt2_MatchFinder_Skip; } else if (p->numHashBytes == 3) { - vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip; + vTable->GetMatches = Bt3_MatchFinder_GetMatches; + vTable->Skip = Bt3_MatchFinder_Skip; } - else /* if (p->numHashBytes == 4) */ + else if (p->numHashBytes == 4) { - vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip; + vTable->GetMatches = Bt4_MatchFinder_GetMatches; + vTable->Skip = Bt4_MatchFinder_Skip; } - /* else { - vTable->GetMatches = (Mf_GetMatches_Func)Bt5_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip; + vTable->GetMatches = Bt5_MatchFinder_GetMatches; + vTable->Skip = Bt5_MatchFinder_Skip; } - */ } + + + +void LzFindPrepare(void) +{ + #ifndef FORCE_LZFIND_SATUR_SUB_128 + #ifdef USE_LZFIND_SATUR_SUB_128 + LZFIND_SATUR_SUB_CODE_FUNC f = NULL; + #ifdef MY_CPU_ARM_OR_ARM64 + { + if (CPU_IsSupported_NEON()) + { + // #pragma message ("=== LzFind NEON") + PRF(printf("\n=== LzFind NEON\n")); + f = LzFind_SaturSub_128; + } + // f = 0; // for debug + } + #else // MY_CPU_ARM_OR_ARM64 + if (CPU_IsSupported_SSE41()) + { + // #pragma message ("=== LzFind SSE41") + PRF(printf("\n=== LzFind SSE41\n")); + f = LzFind_SaturSub_128; + + #ifdef USE_LZFIND_SATUR_SUB_256 + if (CPU_IsSupported_AVX2()) + { + // #pragma message ("=== LzFind AVX2") + PRF(printf("\n=== LzFind AVX2\n")); + f = LzFind_SaturSub_256; + } + #endif + } + #endif // MY_CPU_ARM_OR_ARM64 + g_LzFind_SaturSub = f; + #endif // USE_LZFIND_SATUR_SUB_128 + #endif // FORCE_LZFIND_SATUR_SUB_128 +} + + +#undef MOVE_POS +#undef MOVE_POS_RET +#undef PRF diff --git a/src/sdk/C/LzFind.h b/src/sdk/C/LzFind.h index 42c13be..67e8a6e 100644 --- a/src/sdk/C/LzFind.h +++ b/src/sdk/C/LzFind.h @@ -1,8 +1,8 @@ /* LzFind.h -- Match finder for LZ algorithms -2017-06-10 : Igor Pavlov : Public domain */ +2024-01-22 : Igor Pavlov : Public domain */ -#ifndef __LZ_FIND_H -#define __LZ_FIND_H +#ifndef ZIP7_INC_LZ_FIND_H +#define ZIP7_INC_LZ_FIND_H #include "7zTypes.h" @@ -10,12 +10,12 @@ EXTERN_C_BEGIN typedef UInt32 CLzRef; -typedef struct _CMatchFinder +typedef struct { - Byte *buffer; + const Byte *buffer; UInt32 pos; UInt32 posLimit; - UInt32 streamPos; + UInt32 streamPos; /* wrap over Zero is allowed (streamPos < pos). 
Use (UInt32)(streamPos - pos) */ UInt32 lenLimit; UInt32 cyclicBufferPos; @@ -32,8 +32,8 @@ typedef struct _CMatchFinder UInt32 hashMask; UInt32 cutValue; - Byte *bufferBase; - ISeqInStream *stream; + Byte *bufBase; + ISeqInStreamPtr stream; UInt32 blockSize; UInt32 keepSizeBefore; @@ -43,7 +43,9 @@ typedef struct _CMatchFinder size_t directInputRem; UInt32 historySize; UInt32 fixedHashSize; - UInt32 hashSizeSum; + Byte numHashBytes_Min; + Byte numHashOutBits; + Byte _pad2_[2]; SRes result; UInt32 crc[256]; size_t numRefs; @@ -51,35 +53,69 @@ typedef struct _CMatchFinder UInt64 expectedDataSize; } CMatchFinder; -#define Inline_MatchFinder_GetPointerToCurrentPos(p) ((p)->buffer) +#define Inline_MatchFinder_GetPointerToCurrentPos(p) ((const Byte *)(p)->buffer) -#define Inline_MatchFinder_GetNumAvailableBytes(p) ((p)->streamPos - (p)->pos) +#define Inline_MatchFinder_GetNumAvailableBytes(p) ((UInt32)((p)->streamPos - (p)->pos)) +/* #define Inline_MatchFinder_IsFinishedOK(p) \ ((p)->streamEndWasReached \ && (p)->streamPos == (p)->pos \ && (!(p)->directInput || (p)->directInputRem == 0)) +*/ int MatchFinder_NeedMove(CMatchFinder *p); -Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p); +/* Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p); */ void MatchFinder_MoveBlock(CMatchFinder *p); void MatchFinder_ReadIfRequired(CMatchFinder *p); void MatchFinder_Construct(CMatchFinder *p); -/* Conditions: - historySize <= 3 GB - keepAddBufferBefore + matchMaxLen + keepAddBufferAfter < 511MB +/* (directInput = 0) is default value. + It's required to provide correct (directInput) value + before calling MatchFinder_Create(). + You can set (directInput) by any of the following calls: + - MatchFinder_SET_DIRECT_INPUT_BUF() + - MatchFinder_SET_STREAM() + - MatchFinder_SET_STREAM_MODE() +*/ + +#define MatchFinder_SET_DIRECT_INPUT_BUF(p, _src_, _srcLen_) { \ + (p)->stream = NULL; \ + (p)->directInput = 1; \ + (p)->buffer = (_src_); \ + (p)->directInputRem = (_srcLen_); } + +/* +#define MatchFinder_SET_STREAM_MODE(p) { \ + (p)->directInput = 0; } */ + +#define MatchFinder_SET_STREAM(p, _stream_) { \ + (p)->stream = _stream_; \ + (p)->directInput = 0; } + + int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, ISzAllocPtr alloc); void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc); void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems); -void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue); + +/* +#define MatchFinder_INIT_POS(p, val) \ + (p)->pos = (val); \ + (p)->streamPos = (val); +*/ + +// void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue); +#define MatchFinder_REDUCE_OFFSETS(p, subValue) \ + (p)->pos -= (subValue); \ + (p)->streamPos -= (subValue); + UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *buffer, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue, UInt32 *distances, UInt32 maxLen); /* @@ -91,31 +127,34 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt typedef void (*Mf_Init_Func)(void *object); typedef UInt32 (*Mf_GetNumAvailableBytes_Func)(void *object); typedef const Byte * (*Mf_GetPointerToCurrentPos_Func)(void *object); -typedef UInt32 (*Mf_GetMatches_Func)(void *object, UInt32 *distances); +typedef UInt32 * (*Mf_GetMatches_Func)(void *object, UInt32 *distances); typedef void 
(*Mf_Skip_Func)(void *object, UInt32); -typedef struct _IMatchFinder +typedef struct { Mf_Init_Func Init; Mf_GetNumAvailableBytes_Func GetNumAvailableBytes; Mf_GetPointerToCurrentPos_Func GetPointerToCurrentPos; Mf_GetMatches_Func GetMatches; Mf_Skip_Func Skip; -} IMatchFinder; +} IMatchFinder2; -void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable); +void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable); void MatchFinder_Init_LowHash(CMatchFinder *p); void MatchFinder_Init_HighHash(CMatchFinder *p); -void MatchFinder_Init_3(CMatchFinder *p, int readData); -void MatchFinder_Init(CMatchFinder *p); +void MatchFinder_Init_4(CMatchFinder *p); +// void MatchFinder_Init(CMatchFinder *p); +void MatchFinder_Init(void *p); -UInt32 Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); -UInt32 Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); +UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); +UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num); void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num); +void LzFindPrepare(void); + EXTERN_C_END #endif diff --git a/src/sdk/C/LzFindMt.c b/src/sdk/C/LzFindMt.c index bb0f42c..25fcc46 100644 --- a/src/sdk/C/LzFindMt.c +++ b/src/sdk/C/LzFindMt.c @@ -1,97 +1,217 @@ /* LzFindMt.c -- multithreaded Match finder for LZ algorithms -2018-12-29 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" -#include "LzHash.h" +// #include + +#include "CpuArch.h" +#include "LzHash.h" #include "LzFindMt.h" +// #define LOG_ITERS + +// #define LOG_THREAD + +#ifdef LOG_THREAD +#include +#define PRF(x) x +#else +#define PRF(x) +#endif + +#ifdef LOG_ITERS +#include +extern UInt64 g_NumIters_Tree; +extern UInt64 g_NumIters_Loop; +extern UInt64 g_NumIters_Bytes; +#define LOG_ITER(x) x +#else +#define LOG_ITER(x) +#endif + +#define kMtHashBlockSize ((UInt32)1 << 17) +#define kMtHashNumBlocks (1 << 1) + +#define GET_HASH_BLOCK_OFFSET(i) (((i) & (kMtHashNumBlocks - 1)) * kMtHashBlockSize) + +#define kMtBtBlockSize ((UInt32)1 << 16) +#define kMtBtNumBlocks (1 << 4) + +#define GET_BT_BLOCK_OFFSET(i) (((i) & (kMtBtNumBlocks - 1)) * (size_t)kMtBtBlockSize) + +/* + HASH functions: + We use raw 8/16 bits from a[1] and a[2], + xored with crc(a[0]) and crc(a[3]). + We check a[0], a[3] only. We don't need to compare a[1] and a[2] in matches. 
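// kMtHashNumBlocks and kMtBtNumBlocks above are powers of two so that a block
// counter that only ever grows can be folded into the ring simply by masking
// with (numBlocks - 1), which is all GET_HASH_BLOCK_OFFSET and
// GET_BT_BLOCK_OFFSET do. A sketch reusing those constants; illustration only.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define kMtHashBlockSize ((uint32_t)1 << 17)
#define kMtHashNumBlocks (1 << 1)
#define GET_HASH_BLOCK_OFFSET(i) (((i) & (kMtHashNumBlocks - 1)) * kMtHashBlockSize)

#define kMtBtBlockSize ((uint32_t)1 << 16)
#define kMtBtNumBlocks (1 << 4)
#define GET_BT_BLOCK_OFFSET(i) (((i) & (kMtBtNumBlocks - 1)) * (size_t)kMtBtBlockSize)

int main(void)
{
    uint32_t i;
    for (i = 0; i < 5; i++)
        printf("block %u -> hash offset %u, bt offset %zu\n",
               (unsigned)i,
               (unsigned)GET_HASH_BLOCK_OFFSET(i),
               GET_BT_BLOCK_OFFSET(i));
    // Hash offsets alternate 0 / 131072; bt offsets cycle through 16 slots
    // spaced 65536 apart.
    return 0;
}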
+ our crc() function provides one-to-one correspondence for low 8-bit values: + (crc[0...0xFF] & 0xFF) <-> [0...0xFF] +*/ + +#define MF(mt) ((mt)->MatchFinder) +#define MF_CRC (p->crc) + +// #define MF(mt) (&(mt)->MatchFinder) +// #define MF_CRC (p->MatchFinder.crc) + +#define MT_HASH2_CALC \ + h2 = (MF_CRC[cur[0]] ^ cur[1]) & (kHash2Size - 1); + +#define MT_HASH3_CALC { \ + UInt32 temp = MF_CRC[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); } + +/* +#define MT_HASH3_CALC__NO_2 { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); } + +#define MT_HASH4_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + temp ^= ((UInt32)cur[2] << 8); \ + h3 = temp & (kHash3Size - 1); \ + h4 = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hash4Mask; } + // (kHash4Size - 1); +*/ + + +Z7_NO_INLINE static void MtSync_Construct(CMtSync *p) { + p->affinityGroup = -1; + p->affinityInGroup = 0; + p->affinity = 0; p->wasCreated = False; p->csWasInitialized = False; p->csWasEntered = False; - Thread_Construct(&p->thread); + Thread_CONSTRUCT(&p->thread) Event_Construct(&p->canStart); - Event_Construct(&p->wasStarted); Event_Construct(&p->wasStopped); Semaphore_Construct(&p->freeSemaphore); Semaphore_Construct(&p->filledSemaphore); } -static void MtSync_GetNextBlock(CMtSync *p) + +// #define DEBUG_BUFFER_LOCK // define it to debug lock state + +#ifdef DEBUG_BUFFER_LOCK +#include +#define BUFFER_MUST_BE_LOCKED(p) if (!(p)->csWasEntered) exit(1); +#define BUFFER_MUST_BE_UNLOCKED(p) if ( (p)->csWasEntered) exit(1); +#else +#define BUFFER_MUST_BE_LOCKED(p) +#define BUFFER_MUST_BE_UNLOCKED(p) +#endif + +#define LOCK_BUFFER(p) { \ + BUFFER_MUST_BE_UNLOCKED(p); \ + CriticalSection_Enter(&(p)->cs); \ + (p)->csWasEntered = True; } + +#define UNLOCK_BUFFER(p) { \ + BUFFER_MUST_BE_LOCKED(p); \ + CriticalSection_Leave(&(p)->cs); \ + (p)->csWasEntered = False; } + + +Z7_NO_INLINE +static UInt32 MtSync_GetNextBlock(CMtSync *p) { + UInt32 numBlocks = 0; if (p->needStart) { + BUFFER_MUST_BE_UNLOCKED(p) p->numProcessedBlocks = 1; p->needStart = False; p->stopWriting = False; p->exit = False; - Event_Reset(&p->wasStarted); Event_Reset(&p->wasStopped); - Event_Set(&p->canStart); - Event_Wait(&p->wasStarted); - - // if (mt) MatchFinder_Init_LowHash(mt->MatchFinder); } else { - CriticalSection_Leave(&p->cs); - p->csWasEntered = False; - p->numProcessedBlocks++; + UNLOCK_BUFFER(p) + // we free current block + numBlocks = p->numProcessedBlocks++; Semaphore_Release1(&p->freeSemaphore); } + + // buffer is UNLOCKED here Semaphore_Wait(&p->filledSemaphore); - CriticalSection_Enter(&p->cs); - p->csWasEntered = True; + LOCK_BUFFER(p) + return numBlocks; } -/* MtSync_StopWriting must be called if Writing was started */ +/* if Writing (Processing) thread was started, we must call MtSync_StopWriting() */ + +Z7_NO_INLINE static void MtSync_StopWriting(CMtSync *p) { - UInt32 myNumBlocks = p->numProcessedBlocks; if (!Thread_WasCreated(&p->thread) || p->needStart) return; - p->stopWriting = True; + + PRF(printf("\nMtSync_StopWriting %p\n", p)); + if (p->csWasEntered) { - CriticalSection_Leave(&p->cs); - p->csWasEntered = False; + /* we don't use buffer in this thread after StopWriting(). + So we UNLOCK buffer. 
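// MtSync coordinates the hash and tree threads with two counting semaphores
// over a small ring of blocks: the producer waits on freeSemaphore before
// filling a block and releases filledSemaphore afterwards, while the consumer
// in MtSync_GetNextBlock does the reverse. A stripped-down sketch of that
// handshake using plain POSIX threads and semaphores instead of the SDK's
// Threads.c wrappers, with the stop/exit and critical-section details left
// out; illustration only (link with -pthread).

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

#define NUM_BLOCKS 2
#define NUM_ITEMS  8

static int blocks[NUM_BLOCKS];
static sem_t freeSem;    // counts empty slots, starts at NUM_BLOCKS
static sem_t filledSem;  // counts filled slots, starts at 0

static void *producer(void *arg)
{
    int i;
    (void)arg;
    for (i = 0; i < NUM_ITEMS; i++)
    {
        sem_wait(&freeSem);           // like Semaphore_Wait(&p->freeSemaphore)
        blocks[i % NUM_BLOCKS] = i;   // fill the block
        sem_post(&filledSem);         // like Semaphore_Release1(&p->filledSemaphore)
    }
    return NULL;
}

int main(void)
{
    pthread_t t;
    int i;
    sem_init(&freeSem, 0, NUM_BLOCKS);
    sem_init(&filledSem, 0, 0);
    pthread_create(&t, NULL, producer, NULL);
    for (i = 0; i < NUM_ITEMS; i++)
    {
        sem_wait(&filledSem);                     // consumer side of MtSync_GetNextBlock
        printf("consumed %d\n", blocks[i % NUM_BLOCKS]);
        sem_post(&freeSem);                       // hand the block back to the producer
    }
    pthread_join(t, NULL);
    sem_destroy(&freeSem);
    sem_destroy(&filledSem);
    return 0;
}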
+ And we restore default UNLOCKED state for stopped thread */ + UNLOCK_BUFFER(p) } - Semaphore_Release1(&p->freeSemaphore); - + + /* We send (p->stopWriting) message and release freeSemaphore + to free current block. + So the thread will see (p->stopWriting) at some + iteration after Wait(freeSemaphore). + The thread doesn't need to fill all avail free blocks, + so we can get fast thread stop. + */ + + p->stopWriting = True; + Semaphore_Release1(&p->freeSemaphore); // check semaphore count !!! + + PRF(printf("\nMtSync_StopWriting %p : Event_Wait(&p->wasStopped)\n", p)); Event_Wait(&p->wasStopped); + PRF(printf("\nMtSync_StopWriting %p : Event_Wait() finsihed\n", p)); + + /* 21.03 : we don't restore samaphore counters here. + We will recreate and reinit samaphores in next start */ - while (myNumBlocks++ != p->numProcessedBlocks) - { - Semaphore_Wait(&p->filledSemaphore); - Semaphore_Release1(&p->freeSemaphore); - } p->needStart = True; } + +Z7_NO_INLINE static void MtSync_Destruct(CMtSync *p) { + PRF(printf("\nMtSync_Destruct %p\n", p)); + if (Thread_WasCreated(&p->thread)) { + /* we want thread to be in Stopped state before sending EXIT command. + note: stop(btSync) will stop (htSync) also */ MtSync_StopWriting(p); + /* thread in Stopped state here : (p->needStart == true) */ p->exit = True; - if (p->needStart) - Event_Set(&p->canStart); - Thread_Wait(&p->thread); - Thread_Close(&p->thread); + // if (p->needStart) // it's (true) + Event_Set(&p->canStart); // we send EXIT command to thread + Thread_Wait_Close(&p->thread); // we wait thread finishing } + if (p->csWasInitialized) { CriticalSection_Delete(&p->cs); p->csWasInitialized = False; } + p->csWasEntered = False; Event_Close(&p->canStart); - Event_Close(&p->wasStarted); Event_Close(&p->wasStopped); Semaphore_Close(&p->freeSemaphore); Semaphore_Close(&p->filledSemaphore); @@ -99,80 +219,257 @@ static void MtSync_Destruct(CMtSync *p) p->wasCreated = False; } -#define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; } -static SRes MtSync_Create2(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj, UInt32 numBlocks) +// #define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; } +// we want to get real system error codes here instead of SZ_ERROR_THREAD +#define RINOK_THREAD(x) RINOK_WRes(x) + + +// call it before each new file (when new starting is required): +Z7_NO_INLINE +static SRes MtSync_Init(CMtSync *p, UInt32 numBlocks) +{ + WRes wres; + // BUFFER_MUST_BE_UNLOCKED(p) + if (!p->needStart || p->csWasEntered) + return SZ_ERROR_FAIL; + wres = Semaphore_OptCreateInit(&p->freeSemaphore, numBlocks, numBlocks); + if (wres == 0) + wres = Semaphore_OptCreateInit(&p->filledSemaphore, 0, numBlocks); + return MY_SRes_HRESULT_FROM_WRes(wres); +} + + +static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj) { + WRes wres; + if (p->wasCreated) return SZ_OK; - RINOK_THREAD(CriticalSection_Init(&p->cs)); + RINOK_THREAD(CriticalSection_Init(&p->cs)) p->csWasInitialized = True; + p->csWasEntered = False; - RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->canStart)); - RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStarted)); - RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStopped)); - - RINOK_THREAD(Semaphore_Create(&p->freeSemaphore, numBlocks, numBlocks)); - RINOK_THREAD(Semaphore_Create(&p->filledSemaphore, 0, numBlocks)); + RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->canStart)) + RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStopped)) p->needStart = True; - - 
RINOK_THREAD(Thread_Create(&p->thread, startAddress, obj)); + p->exit = True; /* p->exit is unused before (canStart) Event. + But in case of some unexpected code failure we will get fast exit from thread */ + + // return ERROR_TOO_MANY_POSTS; // for debug + // return EINVAL; // for debug + +#ifdef _WIN32 + if (p->affinityGroup >= 0) + wres = Thread_Create_With_Group(&p->thread, startAddress, obj, + (unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup); + else +#endif + if (p->affinity != 0) + wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity); + else + wres = Thread_Create(&p->thread, startAddress, obj); + + RINOK_THREAD(wres) p->wasCreated = True; return SZ_OK; } -static SRes MtSync_Create(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj, UInt32 numBlocks) + +Z7_NO_INLINE +static SRes MtSync_Create(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj) { - SRes res = MtSync_Create2(p, startAddress, obj, numBlocks); - if (res != SZ_OK) - MtSync_Destruct(p); - return res; + const WRes wres = MtSync_Create_WRes(p, startAddress, obj); + if (wres == 0) + return 0; + MtSync_Destruct(p); + return MY_SRes_HRESULT_FROM_WRes(wres); } -void MtSync_Init(CMtSync *p) { p->needStart = True; } + +// ---------- HASH THREAD ---------- #define kMtMaxValForNormalize 0xFFFFFFFF +// #define kMtMaxValForNormalize ((1 << 21)) // for debug +// #define kNormalizeAlign (1 << 7) // alignment for speculated accesses -#define DEF_GetHeads2(name, v, action) \ - static void GetHeads ## name(const Byte *p, UInt32 pos, \ - UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc) \ - { action; for (; numHeads != 0; numHeads--) { \ - const UInt32 value = (v); p++; *heads++ = pos - hash[value]; hash[value] = pos++; } } +#ifdef MY_CPU_LE_UNALIGN + #define GetUi24hi_from32(p) ((UInt32)GetUi32(p) >> 8) +#else + #define GetUi24hi_from32(p) ((p)[1] ^ ((UInt32)(p)[2] << 8) ^ ((UInt32)(p)[3] << 16)) +#endif + +#define GetHeads_DECL(name) \ + static void GetHeads ## name(const Byte *p, UInt32 pos, \ + UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc) +#define GetHeads_LOOP(v) \ + for (; numHeads != 0; numHeads--) { \ + const UInt32 value = (v); \ + p++; \ + *heads++ = pos - hash[value]; \ + hash[value] = pos++; } + +#define DEF_GetHeads2(name, v, action) \ + GetHeads_DECL(name) { action \ + GetHeads_LOOP(v) } + #define DEF_GetHeads(name, v) DEF_GetHeads2(name, v, ;) -DEF_GetHeads2(2, (p[0] | ((UInt32)p[1] << 8)), UNUSED_VAR(hashMask); UNUSED_VAR(crc); ) -DEF_GetHeads(3, (crc[p[0]] ^ p[1] ^ ((UInt32)p[2] << 8)) & hashMask) -DEF_GetHeads(4, (crc[p[0]] ^ p[1] ^ ((UInt32)p[2] << 8) ^ (crc[p[3]] << 5)) & hashMask) -DEF_GetHeads(4b, (crc[p[0]] ^ p[1] ^ ((UInt32)p[2] << 8) ^ ((UInt32)p[3] << 16)) & hashMask) -/* DEF_GetHeads(5, (crc[p[0]] ^ p[1] ^ ((UInt32)p[2] << 8) ^ (crc[p[3]] << 5) ^ (crc[p[4]] << 3)) & hashMask) */ +DEF_GetHeads2(2, GetUi16(p), UNUSED_VAR(hashMask); UNUSED_VAR(crc); ) +DEF_GetHeads(3, (crc[p[0]] ^ GetUi16(p + 1)) & hashMask) +DEF_GetHeads2(3b, GetUi16(p) ^ ((UInt32)(p)[2] << 16), UNUSED_VAR(hashMask); UNUSED_VAR(crc); ) +// BT3 is not good for crc collisions for big hashMask values. 
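// Every GetHeads variant above runs the same inner loop: hash a few leading
// bytes, report the distance to the previous position that produced the same
// hash value (the "head"), then overwrite the table entry with the current
// position. A toy version using a plain 2-byte value in place of the CRC-mixed
// hashes, so it roughly corresponds to GetHeads2; illustration only.

#include <stdint.h>
#include <stdio.h>

#define HASH_BITS 16
#define HASH_SIZE (1u << HASH_BITS)

// GetHeads_LOOP in miniature: heads[i] = pos - hash[value]; hash[value] = pos.
// A head equal to the current pos means the slot was still empty, because the
// table starts zero-filled.
static void GetHeadsToy(const unsigned char *p, uint32_t pos,
                        uint32_t *hash, uint32_t *heads, uint32_t numHeads)
{
    for (; numHeads != 0; numHeads--)
    {
        const uint32_t value = (uint32_t)p[0] | ((uint32_t)p[1] << 8); // GetUi16-style
        p++;
        *heads++ = pos - hash[value];
        hash[value] = pos++;
    }
}

int main(void)
{
    static uint32_t hash[HASH_SIZE];        // zero-initialized
    uint32_t heads[8];
    const unsigned char data[] = "abababXY";
    uint32_t i;
    GetHeadsToy(data, 100, hash, heads, 6); // positions 100..105
    for (i = 0; i < 6; i++)
        printf("pos %u: head = %u\n", (unsigned)(100 + i), (unsigned)heads[i]);
    // The repeated "ab" / "ba" pairs report distance 2; first-seen pairs
    // report the position itself because their slot was empty.
    return 0;
}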
+ +/* +GetHeads_DECL(3b) +{ + UNUSED_VAR(hashMask); + UNUSED_VAR(crc); + { + const Byte *pLim = p + numHeads; + if (numHeads == 0) + return; + pLim--; + while (p < pLim) + { + UInt32 v1 = GetUi32(p); + UInt32 v0 = v1 & 0xFFFFFF; + UInt32 h0, h1; + p += 2; + v1 >>= 8; + h0 = hash[v0]; hash[v0] = pos; heads[0] = pos - h0; pos++; + h1 = hash[v1]; hash[v1] = pos; heads[1] = pos - h1; pos++; + heads += 2; + } + if (p == pLim) + { + UInt32 v0 = GetUi16(p) ^ ((UInt32)(p)[2] << 16); + *heads = pos - hash[v0]; + hash[v0] = pos; + } + } +} +*/ + +/* +GetHeads_DECL(4) +{ + unsigned sh = 0; + UNUSED_VAR(crc) + while ((hashMask & 0x80000000) == 0) + { + hashMask <<= 1; + sh++; + } + GetHeads_LOOP((GetUi32(p) * 0xa54a1) >> sh) +} +#define GetHeads4b GetHeads4 +*/ + +#define USE_GetHeads_LOCAL_CRC + +#ifdef USE_GetHeads_LOCAL_CRC + +GetHeads_DECL(4) +{ + UInt32 crc0[256]; + UInt32 crc1[256]; + { + unsigned i; + for (i = 0; i < 256; i++) + { + UInt32 v = crc[i]; + crc0[i] = v & hashMask; + crc1[i] = (v << kLzHash_CrcShift_1) & hashMask; + // crc1[i] = rotlFixed(v, 8) & hashMask; + } + } + GetHeads_LOOP(crc0[p[0]] ^ crc1[p[3]] ^ (UInt32)GetUi16(p+1)) +} + +GetHeads_DECL(4b) +{ + UInt32 crc0[256]; + { + unsigned i; + for (i = 0; i < 256; i++) + crc0[i] = crc[i] & hashMask; + } + GetHeads_LOOP(crc0[p[0]] ^ GetUi24hi_from32(p)) +} + +GetHeads_DECL(5) +{ + UInt32 crc0[256]; + UInt32 crc1[256]; + UInt32 crc2[256]; + { + unsigned i; + for (i = 0; i < 256; i++) + { + UInt32 v = crc[i]; + crc0[i] = v & hashMask; + crc1[i] = (v << kLzHash_CrcShift_1) & hashMask; + crc2[i] = (v << kLzHash_CrcShift_2) & hashMask; + } + } + GetHeads_LOOP(crc0[p[0]] ^ crc1[p[3]] ^ crc2[p[4]] ^ (UInt32)GetUi16(p+1)) +} + +GetHeads_DECL(5b) +{ + UInt32 crc0[256]; + UInt32 crc1[256]; + { + unsigned i; + for (i = 0; i < 256; i++) + { + UInt32 v = crc[i]; + crc0[i] = v & hashMask; + crc1[i] = (v << kLzHash_CrcShift_1) & hashMask; + } + } + GetHeads_LOOP(crc0[p[0]] ^ crc1[p[4]] ^ GetUi24hi_from32(p)) +} + +#else + +DEF_GetHeads(4, (crc[p[0]] ^ (crc[p[3]] << kLzHash_CrcShift_1) ^ (UInt32)GetUi16(p+1)) & hashMask) +DEF_GetHeads(4b, (crc[p[0]] ^ GetUi24hi_from32(p)) & hashMask) +DEF_GetHeads(5, (crc[p[0]] ^ (crc[p[3]] << kLzHash_CrcShift_1) ^ (crc[p[4]] << kLzHash_CrcShift_2) ^ (UInt32)GetUi16(p + 1)) & hashMask) +DEF_GetHeads(5b, (crc[p[0]] ^ (crc[p[4]] << kLzHash_CrcShift_1) ^ GetUi24hi_from32(p)) & hashMask) + +#endif + static void HashThreadFunc(CMatchFinderMt *mt) { CMtSync *p = &mt->hashSync; + PRF(printf("\nHashThreadFunc\n")); + for (;;) { - UInt32 numProcessedBlocks = 0; + UInt32 blockIndex = 0; + PRF(printf("\nHashThreadFunc : Event_Wait(&p->canStart)\n")); Event_Wait(&p->canStart); - Event_Set(&p->wasStarted); + PRF(printf("\nHashThreadFunc : Event_Wait(&p->canStart) : after \n")); + if (p->exit) + { + PRF(printf("\nHashThreadFunc : exit \n")); + return; + } - MatchFinder_Init_HighHash(mt->MatchFinder); + MatchFinder_Init_HighHash(MF(mt)); for (;;) { - if (p->exit) - return; - if (p->stopWriting) - { - p->numProcessedBlocks = numProcessedBlocks; - Event_Set(&p->wasStopped); - break; - } + PRF(printf("Hash thread block = %d pos = %d\n", (unsigned)blockIndex, mt->MatchFinder->pos)); { - CMatchFinder *mf = mt->MatchFinder; + CMatchFinder *mf = MF(mt); if (MatchFinder_NeedMove(mf)) { CriticalSection_Enter(&mt->btSync.cs); @@ -185,194 +482,178 @@ static void HashThreadFunc(CMatchFinderMt *mt) mt->pointerToCurPos -= offset; mt->buffer -= offset; } - CriticalSection_Leave(&mt->btSync.cs); CriticalSection_Leave(&mt->hashSync.cs); + 
CriticalSection_Leave(&mt->btSync.cs); continue; } Semaphore_Wait(&p->freeSemaphore); + if (p->exit) // exit is unexpected here. But we check it here for some failure case + return; + + // for faster stop : we check (p->stopWriting) after Wait(freeSemaphore) + if (p->stopWriting) + break; + MatchFinder_ReadIfRequired(mf); - if (mf->pos > (kMtMaxValForNormalize - kMtHashBlockSize)) - { - UInt32 subValue = (mf->pos - mf->historySize - 1); - MatchFinder_ReduceOffsets(mf, subValue); - MatchFinder_Normalize3(subValue, mf->hash + mf->fixedHashSize, (size_t)mf->hashMask + 1); - } { - UInt32 *heads = mt->hashBuf + ((numProcessedBlocks++) & kMtHashNumBlocksMask) * kMtHashBlockSize; - UInt32 num = mf->streamPos - mf->pos; + UInt32 *heads = mt->hashBuf + GET_HASH_BLOCK_OFFSET(blockIndex++); + UInt32 num = Inline_MatchFinder_GetNumAvailableBytes(mf); heads[0] = 2; heads[1] = num; + + /* heads[1] contains the number of avail bytes: + if (avail < mf->numHashBytes) : + { + it means that stream was finished + HASH_THREAD and BT_TREAD must move position for heads[1] (avail) bytes. + HASH_THREAD doesn't stop, + HASH_THREAD fills only the header (2 numbers) for all next blocks: + {2, NumHashBytes - 1}, {2,0}, {2,0}, ... , {2,0} + } + else + { + HASH_THREAD and BT_TREAD must move position for (heads[0] - 2) bytes; + } + */ + if (num >= mf->numHashBytes) { num = num - mf->numHashBytes + 1; if (num > kMtHashBlockSize - 2) num = kMtHashBlockSize - 2; - mt->GetHeadsFunc(mf->buffer, mf->pos, mf->hash + mf->fixedHashSize, mf->hashMask, heads + 2, num, mf->crc); + + if (mf->pos > (UInt32)kMtMaxValForNormalize - num) + { + const UInt32 subValue = (mf->pos - mf->historySize - 1); // & ~(UInt32)(kNormalizeAlign - 1); + MatchFinder_REDUCE_OFFSETS(mf, subValue) + MatchFinder_Normalize3(subValue, mf->hash + mf->fixedHashSize, (size_t)mf->hashMask + 1); + } + heads[0] = 2 + num; + mt->GetHeadsFunc(mf->buffer, mf->pos, mf->hash + mf->fixedHashSize, mf->hashMask, heads + 2, num, mf->crc); } - mf->pos += num; + + mf->pos += num; // wrap over zero is allowed at the end of stream mf->buffer += num; } } Semaphore_Release1(&p->filledSemaphore); - } - } -} + } // for() processing end -static void MatchFinderMt_GetNextBlock_Hash(CMatchFinderMt *p) -{ - MtSync_GetNextBlock(&p->hashSync); - p->hashBufPosLimit = p->hashBufPos = ((p->hashSync.numProcessedBlocks - 1) & kMtHashNumBlocksMask) * kMtHashBlockSize; - p->hashBufPosLimit += p->hashBuf[p->hashBufPos++]; - p->hashNumAvail = p->hashBuf[p->hashBufPos++]; + // p->numBlocks_Sent = blockIndex; + Event_Set(&p->wasStopped); + } // for() thread end } -#define kEmptyHashValue 0 + + + +// ---------- BT THREAD ---------- + +/* we use one variable instead of two (cyclicBufferPos == pos) before CyclicBuf wrap. + here we define fixed offset of (p->pos) from (p->cyclicBufferPos) */ +#define CYC_TO_POS_OFFSET 0 +// #define CYC_TO_POS_OFFSET 1 // for debug #define MFMT_GM_INLINE #ifdef MFMT_GM_INLINE /* - we use size_t for _cyclicBufferPos instead of UInt32 + we use size_t for (pos) instead of UInt32 to eliminate "movsx" BUG in old MSVC x64 compiler. 
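// The hash blocks exchanged between the two threads carry a two-word header:
// heads[0] is 2 plus the number of head entries in the block, heads[1] is the
// number of bytes still available (it drops below numHashBytes only at the end
// of the stream, which is how the tree side detects it). A toy producer and
// consumer for that layout, with a tiny block size and dummy head values;
// illustration only.

#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE 16   // stand-in for kMtHashBlockSize (1 << 17 in this file)

// Producer side, as in HashThreadFunc above: header first, then the entries.
static void FillBlock(uint32_t *heads, uint32_t avail, uint32_t numHashBytes)
{
    heads[0] = 2;
    heads[1] = avail;
    if (avail >= numHashBytes)
    {
        uint32_t num = avail - numHashBytes + 1;  // positions that can be hashed
        uint32_t i;
        if (num > BLOCK_SIZE - 2)
            num = BLOCK_SIZE - 2;
        heads[0] = 2 + num;
        for (i = 0; i < num; i++)
            heads[2 + i] = 1 + i;                 // dummy head values
    }
}

// Consumer side, as BtGetMatches does further down: heads[0] bounds the block,
// heads[1] is the available-bytes count.
static void ReadBlock(const uint32_t *heads)
{
    printf("entries=%u avail=%u\n", (unsigned)(heads[0] - 2), (unsigned)heads[1]);
}

int main(void)
{
    uint32_t block[BLOCK_SIZE];
    FillBlock(block, 10, 4);  // normal block: 10 bytes available, 4-byte hash
    ReadBlock(block);         // entries=7 avail=10
    FillBlock(block, 3, 4);   // end of stream: fewer bytes than numHashBytes
    ReadBlock(block);         // entries=0 avail=3
    return 0;
}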
*/ -MY_NO_INLINE -static UInt32 *GetMatchesSpecN(UInt32 lenLimit, UInt32 pos, const Byte *cur, CLzRef *son, - size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue, - UInt32 *distances, UInt32 _maxLen, const UInt32 *hash, const UInt32 *limit, UInt32 size, UInt32 *posRes) -{ - do - { - UInt32 *_distances = ++distances; - UInt32 delta = *hash++; - - CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; - CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); - unsigned len0 = 0, len1 = 0; - UInt32 cutValue = _cutValue; - unsigned maxLen = (unsigned)_maxLen; - /* - if (size > 1) - { - UInt32 delta = *hash; - if (delta < _cyclicBufferSize) - { - UInt32 cyc1 = _cyclicBufferPos + 1; - CLzRef *pair = son + ((size_t)(cyc1 - delta + ((delta > cyc1) ? _cyclicBufferSize : 0)) << 1); - Byte b = *(cur + 1 - delta); - _distances[0] = pair[0]; - _distances[1] = b; - } - } - */ - if (cutValue == 0 || delta >= _cyclicBufferSize) - { - *ptr0 = *ptr1 = kEmptyHashValue; - } - else - for(;;) - { - { - CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((_cyclicBufferPos < delta) ? _cyclicBufferSize : 0)) << 1); - const Byte *pb = cur - delta; - unsigned len = (len0 < len1 ? len0 : len1); - UInt32 pair0 = *pair; - if (pb[len] == cur[len]) - { - if (++len != lenLimit && pb[len] == cur[len]) - while (++len != lenLimit) - if (pb[len] != cur[len]) - break; - if (maxLen < len) - { - maxLen = len; - *distances++ = (UInt32)len; - *distances++ = delta - 1; - if (len == lenLimit) - { - UInt32 pair1 = pair[1]; - *ptr1 = pair0; - *ptr0 = pair1; - break; - } - } - } - { - UInt32 curMatch = pos - delta; - // delta = pos - *pair; - // delta = pos - pair[((UInt32)pb[len] - (UInt32)cur[len]) >> 31]; - if (pb[len] < cur[len]) - { - delta = pos - pair[1]; - *ptr1 = curMatch; - ptr1 = pair + 1; - len1 = len; - } - else - { - delta = pos - *pair; - *ptr0 = curMatch; - ptr0 = pair; - len0 = len; - } - } - } - if (--cutValue == 0 || delta >= _cyclicBufferSize) - { - *ptr0 = *ptr1 = kEmptyHashValue; - break; - } - } - pos++; - _cyclicBufferPos++; - cur++; - { - UInt32 num = (UInt32)(distances - _distances); - _distances[-1] = num; - } - } - while (distances < limit && --size != 0); - *posRes = pos; - return distances; -} +UInt32 * Z7_FASTCALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes); #endif - -static void BtGetMatches(CMatchFinderMt *p, UInt32 *distances) +static void BtGetMatches(CMatchFinderMt *p, UInt32 *d) { UInt32 numProcessed = 0; UInt32 curPos = 2; - UInt32 limit = kMtBtBlockSize - (p->matchMaxLen * 2); // * 2 - distances[1] = p->hashNumAvail; + /* GetMatchesSpec() functions don't create (len = 1) + in [len, dist] match pairs, if (p->numHashBytes >= 2) + Also we suppose here that (matchMaxLen >= 2). 
+ So the following code for (reserve) is not required + UInt32 reserve = (p->matchMaxLen * 2); + const UInt32 kNumHashBytes_Max = 5; // BT_HASH_BYTES_MAX + if (reserve < kNumHashBytes_Max - 1) + reserve = kNumHashBytes_Max - 1; + const UInt32 limit = kMtBtBlockSize - (reserve); + */ + + const UInt32 limit = kMtBtBlockSize - (p->matchMaxLen * 2); + + d[1] = p->hashNumAvail; + + if (p->failure_BT) + { + // printf("\n == 1 BtGetMatches() p->failure_BT\n"); + d[0] = 0; + // d[1] = 0; + return; + } while (curPos < limit) { if (p->hashBufPos == p->hashBufPosLimit) { - MatchFinderMt_GetNextBlock_Hash(p); - distances[1] = numProcessed + p->hashNumAvail; - if (p->hashNumAvail >= p->numHashBytes) + // MatchFinderMt_GetNextBlock_Hash(p); + UInt32 avail; + { + const UInt32 bi = MtSync_GetNextBlock(&p->hashSync); + const UInt32 k = GET_HASH_BLOCK_OFFSET(bi); + const UInt32 *h = p->hashBuf + k; + avail = h[1]; + p->hashBufPosLimit = k + h[0]; + p->hashNumAvail = avail; + p->hashBufPos = k + 2; + } + + { + /* we must prevent UInt32 overflow for avail total value, + if avail was increased with new hash block */ + UInt32 availSum = numProcessed + avail; + if (availSum < numProcessed) + availSum = (UInt32)(Int32)-1; + d[1] = availSum; + } + + if (avail >= p->numHashBytes) continue; - distances[0] = curPos + p->hashNumAvail; - distances += curPos; - for (; p->hashNumAvail != 0; p->hashNumAvail--) - *distances++ = 0; + + // if (p->hashBufPos != p->hashBufPosLimit) exit(1); + + /* (avail < p->numHashBytes) + It means that stream was finished. + And (avail) - is a number of remaining bytes, + we fill (d) for (avail) bytes for LZ_THREAD (receiver). + but we don't update (p->pos) and (p->cyclicBufferPos) here in BT_THREAD */ + + /* here we suppose that we have space enough: + (kMtBtBlockSize - curPos >= p->hashNumAvail) */ + p->hashNumAvail = 0; + d[0] = curPos + avail; + d += curPos; + for (; avail != 0; avail--) + *d++ = 0; return; } { UInt32 size = p->hashBufPosLimit - p->hashBufPos; - UInt32 lenLimit = p->matchMaxLen; UInt32 pos = p->pos; UInt32 cyclicBufferPos = p->cyclicBufferPos; + UInt32 lenLimit = p->matchMaxLen; if (lenLimit >= p->hashNumAvail) lenLimit = p->hashNumAvail; { @@ -384,10 +665,18 @@ static void BtGetMatches(CMatchFinderMt *p, UInt32 *distances) size = size2; } + if (pos > (UInt32)kMtMaxValForNormalize - size) + { + const UInt32 subValue = (pos - p->cyclicBufferSize); // & ~(UInt32)(kNormalizeAlign - 1); + pos -= subValue; + p->pos = pos; + MatchFinder_Normalize3(subValue, p->son, (size_t)p->cyclicBufferSize * 2); + } + #ifndef MFMT_GM_INLINE while (curPos < limit && size-- != 0) { - UInt32 *startDistances = distances + curPos; + UInt32 *startDistances = d + curPos; UInt32 num = (UInt32)(GetMatchesSpec1(lenLimit, pos - p->hashBuf[p->hashBufPos++], pos, p->buffer, p->son, cyclicBufferPos, p->cyclicBufferSize, p->cutValue, startDistances + 1, p->numHashBytes - 1) - startDistances); @@ -399,81 +688,112 @@ static void BtGetMatches(CMatchFinderMt *p, UInt32 *distances) } #else { - UInt32 posRes; - curPos = (UInt32)(GetMatchesSpecN(lenLimit, pos, p->buffer, p->son, cyclicBufferPos, p->cyclicBufferSize, p->cutValue, - distances + curPos, p->numHashBytes - 1, p->hashBuf + p->hashBufPos, - distances + limit, - size, &posRes) - distances); - p->hashBufPos += posRes - pos; - cyclicBufferPos += posRes - pos; - p->buffer += posRes - pos; - pos = posRes; + UInt32 posRes = pos; + const UInt32 *d_end; + { + d_end = GetMatchesSpecN_2( + p->buffer + lenLimit - 1, + pos, p->buffer, p->son, p->cutValue, d + 
curPos, + p->numHashBytes - 1, p->hashBuf + p->hashBufPos, + d + limit, p->hashBuf + p->hashBufPos + size, + cyclicBufferPos, p->cyclicBufferSize, + &posRes); + } + { + if (!d_end) + { + // printf("\n == 2 BtGetMatches() p->failure_BT\n"); + // internal data failure + p->failure_BT = True; + d[0] = 0; + // d[1] = 0; + return; + } + } + curPos = (UInt32)(d_end - d); + { + const UInt32 processed = posRes - pos; + pos = posRes; + p->hashBufPos += processed; + cyclicBufferPos += processed; + p->buffer += processed; + } } #endif - numProcessed += pos - p->pos; - p->hashNumAvail -= pos - p->pos; - p->pos = pos; + { + const UInt32 processed = pos - p->pos; + numProcessed += processed; + p->hashNumAvail -= processed; + p->pos = pos; + } if (cyclicBufferPos == p->cyclicBufferSize) cyclicBufferPos = 0; p->cyclicBufferPos = cyclicBufferPos; } } - distances[0] = curPos; + d[0] = curPos; } + static void BtFillBlock(CMatchFinderMt *p, UInt32 globalBlockIndex) { CMtSync *sync = &p->hashSync; + + BUFFER_MUST_BE_UNLOCKED(sync) + if (!sync->needStart) { - CriticalSection_Enter(&sync->cs); - sync->csWasEntered = True; + LOCK_BUFFER(sync) } - BtGetMatches(p, p->btBuf + (globalBlockIndex & kMtBtNumBlocksMask) * kMtBtBlockSize); - - if (p->pos > kMtMaxValForNormalize - kMtBtBlockSize) - { - UInt32 subValue = p->pos - p->cyclicBufferSize; - MatchFinder_Normalize3(subValue, p->son, (size_t)p->cyclicBufferSize * 2); - p->pos -= subValue; - } + BtGetMatches(p, p->btBuf + GET_BT_BLOCK_OFFSET(globalBlockIndex)); + + /* We suppose that we have called GetNextBlock() from start. + So buffer is LOCKED */ - if (!sync->needStart) - { - CriticalSection_Leave(&sync->cs); - sync->csWasEntered = False; - } + UNLOCK_BUFFER(sync) } -void BtThreadFunc(CMatchFinderMt *mt) + +Z7_NO_INLINE +static void BtThreadFunc(CMatchFinderMt *mt) { CMtSync *p = &mt->btSync; for (;;) { UInt32 blockIndex = 0; Event_Wait(&p->canStart); - Event_Set(&p->wasStarted); + for (;;) { + PRF(printf(" BT thread block = %d pos = %d\n", (unsigned)blockIndex, mt->pos)); + /* (p->exit == true) is possible after (p->canStart) at first loop iteration + and is unexpected after more Wait(freeSemaphore) iterations */ if (p->exit) return; + + Semaphore_Wait(&p->freeSemaphore); + + // for faster stop : we check (p->stopWriting) after Wait(freeSemaphore) if (p->stopWriting) - { - p->numProcessedBlocks = blockIndex; - MtSync_StopWriting(&mt->hashSync); - Event_Set(&p->wasStopped); break; - } - Semaphore_Wait(&p->freeSemaphore); + BtFillBlock(mt, blockIndex++); + Semaphore_Release1(&p->filledSemaphore); } + + // we stop HASH_THREAD here + MtSync_StopWriting(&mt->hashSync); + + // p->numBlocks_Sent = blockIndex; + Event_Set(&p->wasStopped); } } + void MatchFinderMt_Construct(CMatchFinderMt *p) { p->hashBuf = NULL; @@ -489,16 +809,39 @@ static void MatchFinderMt_FreeMem(CMatchFinderMt *p, ISzAllocPtr alloc) void MatchFinderMt_Destruct(CMatchFinderMt *p, ISzAllocPtr alloc) { - MtSync_Destruct(&p->hashSync); + /* + HASH_THREAD can use CriticalSection(s) btSync.cs and hashSync.cs. + So we must be sure that HASH_THREAD will not use CriticalSection(s) + after deleting CriticalSection here. + + we call ReleaseStream(p) + that calls StopWriting(btSync) + that calls StopWriting(hashSync), if it's required to stop HASH_THREAD. 
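/* Editorial sketch, not part of the patch: the call order for the MT match
   finder that this file and LzFindMt.h imply. Assumes LzFind.h/LzFindMt.h are
   included; input-stream setup on the inner CMatchFinder (normally done by
   LzmaEnc) is omitted and the parameter values are illustrative only. */
static void UseMatchFinderMt_Sketch(ISzAllocPtr alloc)
{
  CMatchFinder base;
  CMatchFinderMt mf;
  IMatchFinder2 vt;
  MatchFinder_Construct(&base);
  MatchFinderMt_Construct(&mf);
  mf.MatchFinder = &base;                    /* the MT finder wraps a regular CMatchFinder */
  if (MatchFinderMt_Create(&mf, (UInt32)1 << 22, 0, 273, 0, alloc) == SZ_OK)
  {
    MatchFinderMt_CreateVTable(&mf, &vt);
    if (MatchFinderMt_InitMt(&mf) == SZ_OK)  /* must come before vt.Init(), per LzFindMt.h */
    {
      vt.Init(&mf);
      /* ... encoder loop driving vt.GetMatches() / vt.Skip() ... */
    }
    MatchFinderMt_ReleaseStream(&mf);        /* stops the BT and hash worker threads */
  }
  MatchFinderMt_Destruct(&mf, alloc);        /* safe even though it calls ReleaseStream() again */
}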
+ after StopWriting() it's safe to destruct MtSync(s) in any order */ + + MatchFinderMt_ReleaseStream(p); + MtSync_Destruct(&p->btSync); + MtSync_Destruct(&p->hashSync); + + LOG_ITER( + printf("\nTree %9d * %7d iter = %9d = sum : bytes = %9d\n", + (UInt32)(g_NumIters_Tree / 1000), + (UInt32)(((UInt64)g_NumIters_Loop * 1000) / (g_NumIters_Tree + 1)), + (UInt32)(g_NumIters_Loop / 1000), + (UInt32)(g_NumIters_Bytes / 1000) + )); + MatchFinderMt_FreeMem(p, alloc); } + #define kHashBufferSize (kMtHashBlockSize * kMtHashNumBlocks) #define kBtBufferSize (kMtBtBlockSize * kMtBtNumBlocks) -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE HashThreadFunc2(void *p) { HashThreadFunc((CMatchFinderMt *)p); return 0; } -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE BtThreadFunc2(void *p) + +static THREAD_FUNC_DECL HashThreadFunc2(void *p) { HashThreadFunc((CMatchFinderMt *)p); return 0; } +static THREAD_FUNC_DECL BtThreadFunc2(void *p) { Byte allocaDummy[0x180]; unsigned i = 0; @@ -509,16 +852,17 @@ static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE BtThreadFunc2(void *p) return 0; } + SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, ISzAllocPtr alloc) { - CMatchFinder *mf = p->MatchFinder; + CMatchFinder *mf = MF(p); p->historySize = historySize; if (kMtBtBlockSize <= matchMaxLen * 4) return SZ_ERROR_PARAM; if (!p->hashBuf) { - p->hashBuf = (UInt32 *)ISzAlloc_Alloc(alloc, (kHashBufferSize + kBtBufferSize) * sizeof(UInt32)); + p->hashBuf = (UInt32 *)ISzAlloc_Alloc(alloc, ((size_t)kHashBufferSize + (size_t)kBtBufferSize) * sizeof(UInt32)); if (!p->hashBuf) return SZ_ERROR_MEM; p->btBuf = p->hashBuf + kHashBufferSize; @@ -528,262 +872,472 @@ SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddB if (!MatchFinder_Create(mf, historySize, keepAddBufferBefore, matchMaxLen, keepAddBufferAfter, alloc)) return SZ_ERROR_MEM; - RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p, kMtHashNumBlocks)); - RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p, kMtBtNumBlocks)); + RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p)) + RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p)) return SZ_OK; } -/* Call it after ReleaseStream / SetStream */ -static void MatchFinderMt_Init(CMatchFinderMt *p) + +SRes MatchFinderMt_InitMt(CMatchFinderMt *p) +{ + RINOK(MtSync_Init(&p->hashSync, kMtHashNumBlocks)) + return MtSync_Init(&p->btSync, kMtBtNumBlocks); +} + + +static void MatchFinderMt_Init(void *_p) { - CMatchFinder *mf = p->MatchFinder; + CMatchFinderMt *p = (CMatchFinderMt *)_p; + CMatchFinder *mf = MF(p); p->btBufPos = - p->btBufPosLimit = 0; + p->btBufPosLimit = NULL; p->hashBufPos = p->hashBufPosLimit = 0; + p->hashNumAvail = 0; // 21.03 + + p->failure_BT = False; /* Init without data reading. 
We don't want to read data in this thread */ - MatchFinder_Init_3(mf, False); + MatchFinder_Init_4(mf); + MatchFinder_Init_LowHash(mf); p->pointerToCurPos = Inline_MatchFinder_GetPointerToCurrentPos(mf); p->btNumAvailBytes = 0; - p->lzPos = p->historySize + 1; + p->failure_LZ_BT = False; + // p->failure_LZ_LZ = False; + + p->lzPos = + 1; // optimal smallest value + // 0; // for debug: ignores match to start + // kNormalizeAlign; // for debug p->hash = mf->hash; p->fixedHashSize = mf->fixedHashSize; + // p->hash4Mask = mf->hash4Mask; p->crc = mf->crc; + // memcpy(p->crc, mf->crc, sizeof(mf->crc)); p->son = mf->son; p->matchMaxLen = mf->matchMaxLen; p->numHashBytes = mf->numHashBytes; - p->pos = mf->pos; - p->buffer = mf->buffer; - p->cyclicBufferPos = mf->cyclicBufferPos; + + /* (mf->pos) and (mf->streamPos) were already initialized to 1 in MatchFinder_Init_4() */ + // mf->streamPos = mf->pos = 1; // optimal smallest value + // 0; // for debug: ignores match to start + // kNormalizeAlign; // for debug + + /* we must init (p->pos = mf->pos) for BT, because + BT code needs (p->pos == delta_value_for_empty_hash_record == mf->pos) */ + p->pos = mf->pos; // do not change it + + p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); p->cyclicBufferSize = mf->cyclicBufferSize; + p->buffer = mf->buffer; p->cutValue = mf->cutValue; + // p->son[0] = p->son[1] = 0; // unused: to init skipped record for speculated accesses. } + /* ReleaseStream is required to finish multithreading */ void MatchFinderMt_ReleaseStream(CMatchFinderMt *p) { + // Sleep(1); // for debug MtSync_StopWriting(&p->btSync); + // Sleep(200); // for debug /* p->MatchFinder->ReleaseStream(); */ } -static void MatchFinderMt_Normalize(CMatchFinderMt *p) -{ - MatchFinder_Normalize3(p->lzPos - p->historySize - 1, p->hash, p->fixedHashSize); - p->lzPos = p->historySize + 1; -} -static void MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p) +Z7_NO_INLINE +static UInt32 MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p) { - UInt32 blockIndex; - MtSync_GetNextBlock(&p->btSync); - blockIndex = ((p->btSync.numProcessedBlocks - 1) & kMtBtNumBlocksMask); - p->btBufPosLimit = p->btBufPos = blockIndex * kMtBtBlockSize; - p->btBufPosLimit += p->btBuf[p->btBufPos++]; - p->btNumAvailBytes = p->btBuf[p->btBufPos++]; - if (p->lzPos >= kMtMaxValForNormalize - kMtBtBlockSize) - MatchFinderMt_Normalize(p); + if (p->failure_LZ_BT) + p->btBufPos = p->failureBuf; + else + { + const UInt32 bi = MtSync_GetNextBlock(&p->btSync); + const UInt32 *bt = p->btBuf + GET_BT_BLOCK_OFFSET(bi); + { + const UInt32 numItems = bt[0]; + p->btBufPosLimit = bt + numItems; + p->btNumAvailBytes = bt[1]; + p->btBufPos = bt + 2; + if (numItems < 2 || numItems > kMtBtBlockSize) + { + p->failureBuf[0] = 0; + p->btBufPos = p->failureBuf; + p->btBufPosLimit = p->failureBuf + 1; + p->failure_LZ_BT = True; + // p->btNumAvailBytes = 0; + /* we don't want to decrease AvailBytes, that was load before. + that can be unxepected for the code that have loaded anopther value before */ + } + } + + if (p->lzPos >= (UInt32)kMtMaxValForNormalize - (UInt32)kMtBtBlockSize) + { + /* we don't check (lzPos) over exact avail bytes in (btBuf). 
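/* Editorial sketch, not part of the patch: MatchFinder_Normalize3() itself is
   not shown in this diff, but its effect matters for the two normalization
   sites in this file. When 32-bit positions approach kMtMaxValForNormalize,
   a subValue is subtracted from every stored position and entries that would
   fall out of the window collapse to kEmptyHashValue (0). A minimal
   equivalent, assuming the SDK's UInt32 typedef: */
static void Normalize_Sketch(UInt32 subValue, UInt32 *items, size_t numItems)
{
  size_t i;
  for (i = 0; i < numItems; i++)
  {
    const UInt32 v = items[i];
    items[i] = (v <= subValue) ? 0 /* kEmptyHashValue */ : (v - subValue);
  }
}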
+ (fixedHashSize) is small, so normalization is fast */ + const UInt32 subValue = (p->lzPos - p->historySize - 1); // & ~(UInt32)(kNormalizeAlign - 1); + p->lzPos -= subValue; + MatchFinder_Normalize3(subValue, p->hash, p->fixedHashSize); + } + } + return p->btNumAvailBytes; } -static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p) + + +static const Byte * MatchFinderMt_GetPointerToCurrentPos(void *_p) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; return p->pointerToCurPos; } + #define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p); -static UInt32 MatchFinderMt_GetNumAvailableBytes(CMatchFinderMt *p) + +static UInt32 MatchFinderMt_GetNumAvailableBytes(void *_p) { - GET_NEXT_BLOCK_IF_REQUIRED; - return p->btNumAvailBytes; + CMatchFinderMt *p = (CMatchFinderMt *)_p; + if (p->btBufPos != p->btBufPosLimit) + return p->btNumAvailBytes; + return MatchFinderMt_GetNextBlock_Bt(p); } -static UInt32 * MixMatches2(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *distances) + +// #define CHECK_FAILURE_LZ(_match_, _pos_) if (_match_ >= _pos_) { p->failure_LZ_LZ = True; return d; } +#define CHECK_FAILURE_LZ(_match_, _pos_) + +static UInt32 * MixMatches2(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) { - UInt32 h2, curMatch2; + UInt32 h2, c2; UInt32 *hash = p->hash; const Byte *cur = p->pointerToCurPos; - UInt32 lzPos = p->lzPos; + const UInt32 m = p->lzPos; MT_HASH2_CALC - curMatch2 = hash[h2]; - hash[h2] = lzPos; + c2 = hash[h2]; + hash[h2] = m; - if (curMatch2 >= matchMinPos) - if (cur[(ptrdiff_t)curMatch2 - lzPos] == cur[0]) + if (c2 >= matchMinPos) + { + CHECK_FAILURE_LZ(c2, m) + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) { - *distances++ = 2; - *distances++ = lzPos - curMatch2 - 1; + *d++ = 2; + *d++ = m - c2 - 1; } + } - return distances; + return d; } -static UInt32 * MixMatches3(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *distances) +static UInt32 * MixMatches3(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) { - UInt32 h2, h3, curMatch2, curMatch3; + UInt32 h2, h3, c2, c3; UInt32 *hash = p->hash; const Byte *cur = p->pointerToCurPos; - UInt32 lzPos = p->lzPos; + const UInt32 m = p->lzPos; MT_HASH3_CALC - curMatch2 = hash[ h2]; - curMatch3 = (hash + kFix3HashSize)[h3]; + c2 = hash[h2]; + c3 = (hash + kFix3HashSize)[h3]; - hash[ h2] = lzPos; - (hash + kFix3HashSize)[h3] = lzPos; + hash[h2] = m; + (hash + kFix3HashSize)[h3] = m; - if (curMatch2 >= matchMinPos && cur[(ptrdiff_t)curMatch2 - lzPos] == cur[0]) + if (c2 >= matchMinPos) { - distances[1] = lzPos - curMatch2 - 1; - if (cur[(ptrdiff_t)curMatch2 - lzPos + 2] == cur[2]) + CHECK_FAILURE_LZ(c2, m) + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) { - distances[0] = 3; - return distances + 2; + d[1] = m - c2 - 1; + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2]) + { + d[0] = 3; + return d + 2; + } + d[0] = 2; + d += 2; } - distances[0] = 2; - distances += 2; } - if (curMatch3 >= matchMinPos && cur[(ptrdiff_t)curMatch3 - lzPos] == cur[0]) + if (c3 >= matchMinPos) { - *distances++ = 3; - *distances++ = lzPos - curMatch3 - 1; + CHECK_FAILURE_LZ(c3, m) + if (cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0]) + { + *d++ = 3; + *d++ = m - c3 - 1; + } } - return distances; + return d; } + +#define INCREASE_LZ_POS p->lzPos++; p->pointerToCurPos++; + /* -static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *distances) +static +UInt32* MatchFinderMt_GetMatches_Bt4(CMatchFinderMt *p, UInt32 *d) { - UInt32 h2, h3, h4, curMatch2, curMatch3, curMatch4; + 
const UInt32 *bt = p->btBufPos; + const UInt32 len = *bt++; + const UInt32 *btLim = bt + len; + UInt32 matchMinPos; + UInt32 avail = p->btNumAvailBytes - 1; + p->btBufPos = btLim; + + { + p->btNumAvailBytes = avail; + + #define BT_HASH_BYTES_MAX 5 + + matchMinPos = p->lzPos; + + if (len != 0) + matchMinPos -= bt[1]; + else if (avail < (BT_HASH_BYTES_MAX - 1) - 1) + { + INCREASE_LZ_POS + return d; + } + else + { + const UInt32 hs = p->historySize; + if (matchMinPos > hs) + matchMinPos -= hs; + else + matchMinPos = 1; + } + } + + for (;;) + { + + UInt32 h2, h3, c2, c3; UInt32 *hash = p->hash; const Byte *cur = p->pointerToCurPos; - UInt32 lzPos = p->lzPos; - MT_HASH4_CALC - - curMatch2 = hash[ h2]; - curMatch3 = (hash + kFix3HashSize)[h3]; - curMatch4 = (hash + kFix4HashSize)[h4]; + UInt32 m = p->lzPos; + MT_HASH3_CALC + + c2 = hash[h2]; + c3 = (hash + kFix3HashSize)[h3]; + + hash[h2] = m; + (hash + kFix3HashSize)[h3] = m; + + if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) + { + d[1] = m - c2 - 1; + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2]) + { + d[0] = 3; + d += 2; + break; + } + // else + { + d[0] = 2; + d += 2; + } + } + if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0]) + { + *d++ = 3; + *d++ = m - c3 - 1; + } + break; + } + + if (len != 0) + { + do + { + const UInt32 v0 = bt[0]; + const UInt32 v1 = bt[1]; + bt += 2; + d[0] = v0; + d[1] = v1; + d += 2; + } + while (bt != btLim); + } + INCREASE_LZ_POS + return d; +} +*/ + + +static UInt32 * MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) +{ + UInt32 h2, h3, /* h4, */ c2, c3 /* , c4 */; + UInt32 *hash = p->hash; + const Byte *cur = p->pointerToCurPos; + const UInt32 m = p->lzPos; + MT_HASH3_CALC + // MT_HASH4_CALC + c2 = hash[h2]; + c3 = (hash + kFix3HashSize)[h3]; + // c4 = (hash + kFix4HashSize)[h4]; - hash[ h2] = lzPos; - (hash + kFix3HashSize)[h3] = lzPos; - (hash + kFix4HashSize)[h4] = lzPos; + hash[h2] = m; + (hash + kFix3HashSize)[h3] = m; + // (hash + kFix4HashSize)[h4] = m; - if (curMatch2 >= matchMinPos && cur[(ptrdiff_t)curMatch2 - lzPos] == cur[0]) + // #define BT5_USE_H2 + // #ifdef BT5_USE_H2 + if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) { - distances[1] = lzPos - curMatch2 - 1; - if (cur[(ptrdiff_t)curMatch2 - lzPos + 2] == cur[2]) + d[1] = m - c2 - 1; + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2]) { - distances[0] = (cur[(ptrdiff_t)curMatch2 - lzPos + 3] == cur[3]) ? 4 : 3; - return distances + 2; + // d[0] = (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 3] == cur[3]) ? 
4 : 3; + // return d + 2; + + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 3] == cur[3]) + { + d[0] = 4; + return d + 2; + } + d[0] = 3; + d += 2; + + #ifdef BT5_USE_H4 + if (c4 >= matchMinPos) + if ( + cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] && + cur[(ptrdiff_t)c4 - (ptrdiff_t)m + 3] == cur[3] + ) + { + *d++ = 4; + *d++ = m - c4 - 1; + } + #endif + return d; } - distances[0] = 2; - distances += 2; + d[0] = 2; + d += 2; } + // #endif - if (curMatch3 >= matchMinPos && cur[(ptrdiff_t)curMatch3 - lzPos] == cur[0]) + if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0]) { - distances[1] = lzPos - curMatch3 - 1; - if (cur[(ptrdiff_t)curMatch3 - lzPos + 3] == cur[3]) + d[1] = m - c3 - 1; + if (cur[(ptrdiff_t)c3 - (ptrdiff_t)m + 3] == cur[3]) { - distances[0] = 4; - return distances + 2; + d[0] = 4; + return d + 2; } - distances[0] = 3; - distances += 2; + d[0] = 3; + d += 2; } - if (curMatch4 >= matchMinPos) + #ifdef BT5_USE_H4 + if (c4 >= matchMinPos) if ( - cur[(ptrdiff_t)curMatch4 - lzPos] == cur[0] && - cur[(ptrdiff_t)curMatch4 - lzPos + 3] == cur[3] + cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] && + cur[(ptrdiff_t)c4 - (ptrdiff_t)m + 3] == cur[3] ) { - *distances++ = 4; - *distances++ = lzPos - curMatch4 - 1; + *d++ = 4; + *d++ = m - c4 - 1; } + #endif - return distances; + return d; } -*/ -#define INCREASE_LZ_POS p->lzPos++; p->pointerToCurPos++; -static UInt32 MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *distances) +static UInt32 * MatchFinderMt2_GetMatches(void *_p, UInt32 *d) { - const UInt32 *btBuf = p->btBuf + p->btBufPos; - UInt32 len = *btBuf++; - p->btBufPos += 1 + len; + CMatchFinderMt *p = (CMatchFinderMt *)_p; + const UInt32 *bt = p->btBufPos; + const UInt32 len = *bt++; + const UInt32 *btLim = bt + len; + p->btBufPos = btLim; p->btNumAvailBytes--; + INCREASE_LZ_POS { - UInt32 i; - for (i = 0; i < len; i += 2) + while (bt != btLim) { - UInt32 v0 = btBuf[0]; - UInt32 v1 = btBuf[1]; - btBuf += 2; - distances[0] = v0; - distances[1] = v1; - distances += 2; + const UInt32 v0 = bt[0]; + const UInt32 v1 = bt[1]; + bt += 2; + d[0] = v0; + d[1] = v1; + d += 2; } } - INCREASE_LZ_POS - return len; + return d; } -static UInt32 MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *distances) -{ - const UInt32 *btBuf = p->btBuf + p->btBufPos; - UInt32 len = *btBuf++; - p->btBufPos += 1 + len; + +static UInt32 * MatchFinderMt_GetMatches(void *_p, UInt32 *d) +{ + CMatchFinderMt *p = (CMatchFinderMt *)_p; + const UInt32 *bt = p->btBufPos; + UInt32 len = *bt++; + const UInt32 avail = p->btNumAvailBytes - 1; + p->btNumAvailBytes = avail; + p->btBufPos = bt + len; if (len == 0) { - /* change for bt5 ! */ - if (p->btNumAvailBytes-- >= 4) - len = (UInt32)(p->MixMatchesFunc(p, p->lzPos - p->historySize, distances) - (distances)); + #define BT_HASH_BYTES_MAX 5 + if (avail >= (BT_HASH_BYTES_MAX - 1) - 1) + { + UInt32 m = p->lzPos; + if (m > p->historySize) + m -= p->historySize; + else + m = 1; + d = p->MixMatchesFunc(p, m, d); + } } else { - /* Condition: there are matches in btBuf with length < p->numHashBytes */ - UInt32 *distances2; - p->btNumAvailBytes--; - distances2 = p->MixMatchesFunc(p, p->lzPos - btBuf[1], distances); + /* + first match pair from BinTree: (match_len, match_dist), + (match_len >= numHashBytes). 
+ MixMatchesFunc() inserts only hash matches that are nearer than (match_dist) + */ + d = p->MixMatchesFunc(p, p->lzPos - bt[1], d); + // if (d) // check for failure do { - UInt32 v0 = btBuf[0]; - UInt32 v1 = btBuf[1]; - btBuf += 2; - distances2[0] = v0; - distances2[1] = v1; - distances2 += 2; + const UInt32 v0 = bt[0]; + const UInt32 v1 = bt[1]; + bt += 2; + d[0] = v0; + d[1] = v1; + d += 2; } - while ((len -= 2) != 0); - len = (UInt32)(distances2 - (distances)); + while (len -= 2); } INCREASE_LZ_POS - return len; + return d; } #define SKIP_HEADER2_MT do { GET_NEXT_BLOCK_IF_REQUIRED #define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash; -#define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += p->btBuf[p->btBufPos] + 1; } while (--num != 0); +#define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0); -static void MatchFinderMt0_Skip(CMatchFinderMt *p, UInt32 num) +static void MatchFinderMt0_Skip(void *_p, UInt32 num) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; SKIP_HEADER2_MT { p->btNumAvailBytes--; SKIP_FOOTER_MT } -static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num) +static void MatchFinderMt2_Skip(void *_p, UInt32 num) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; SKIP_HEADER_MT(2) UInt32 h2; MT_HASH2_CALC @@ -791,8 +1345,9 @@ static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num) SKIP_FOOTER_MT } -static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num) +static void MatchFinderMt3_Skip(void *_p, UInt32 num) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; SKIP_HEADER_MT(3) UInt32 h2, h3; MT_HASH3_CALC @@ -803,12 +1358,16 @@ static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num) } /* +// MatchFinderMt4_Skip() is similar to MatchFinderMt3_Skip(). +// The difference is that MatchFinderMt3_Skip() updates hash for last 3 bytes of stream. 
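/* Editorial sketch, not part of the patch: how a caller typically consumes the
   output of MatchFinderMt_GetMatches() above. For one position the function
   appends (len, dist) pairs to d[] in order of increasing length, with dist
   stored as (delta - 1), and returns the end pointer. Names below are
   illustrative only. */
static UInt32 PickLongestMatch_Sketch(const UInt32 *d, const UInt32 *d_end, UInt32 *distRes)
{
  const size_t numValues = (size_t)(d_end - d);  /* always even: 2 values per match */
  if (numValues == 0)
    return 0;                                    /* no match at this position */
  *distRes = d_end[-1];                          /* distance of the longest match */
  return d_end[-2];                              /* its length */
}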
+ static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num) { SKIP_HEADER_MT(4) - UInt32 h2, h3, h4; - MT_HASH4_CALC - (hash + kFix4HashSize)[h4] = + UInt32 h2, h3; // h4 + MT_HASH3_CALC + // MT_HASH4_CALC + // (hash + kFix4HashSize)[h4] = (hash + kFix3HashSize)[h3] = hash[ h2] = p->lzPos; @@ -816,38 +1375,48 @@ static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num) } */ -void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder *vTable) +void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable) { - vTable->Init = (Mf_Init_Func)MatchFinderMt_Init; - vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinderMt_GetNumAvailableBytes; - vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinderMt_GetPointerToCurrentPos; - vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches; + vTable->Init = MatchFinderMt_Init; + vTable->GetNumAvailableBytes = MatchFinderMt_GetNumAvailableBytes; + vTable->GetPointerToCurrentPos = MatchFinderMt_GetPointerToCurrentPos; + vTable->GetMatches = MatchFinderMt_GetMatches; - switch (p->MatchFinder->numHashBytes) + switch (MF(p)->numHashBytes) { case 2: p->GetHeadsFunc = GetHeads2; - p->MixMatchesFunc = (Mf_Mix_Matches)NULL; - vTable->Skip = (Mf_Skip_Func)MatchFinderMt0_Skip; - vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt2_GetMatches; + p->MixMatchesFunc = NULL; + vTable->Skip = MatchFinderMt0_Skip; + vTable->GetMatches = MatchFinderMt2_GetMatches; break; case 3: - p->GetHeadsFunc = GetHeads3; - p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches2; - vTable->Skip = (Mf_Skip_Func)MatchFinderMt2_Skip; + p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3; + p->MixMatchesFunc = MixMatches2; + vTable->Skip = MatchFinderMt2_Skip; break; - default: - /* case 4: */ - p->GetHeadsFunc = p->MatchFinder->bigHash ? GetHeads4b : GetHeads4; - p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches3; - vTable->Skip = (Mf_Skip_Func)MatchFinderMt3_Skip; + case 4: + p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4; + + // it's fast inline version of GetMatches() + // vTable->GetMatches = MatchFinderMt_GetMatches_Bt4; + + p->MixMatchesFunc = MixMatches3; + vTable->Skip = MatchFinderMt3_Skip; break; - /* default: - p->GetHeadsFunc = GetHeads5; - p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches4; - vTable->Skip = (Mf_Skip_Func)MatchFinderMt4_Skip; + p->GetHeadsFunc = MF(p)->bigHash ? 
GetHeads5b : GetHeads5; + p->MixMatchesFunc = MixMatches4; + vTable->Skip = + MatchFinderMt3_Skip; + // MatchFinderMt4_Skip; break; - */ } } + +#undef RINOK_THREAD +#undef PRF +#undef MF +#undef GetUi24hi_from32 +#undef LOCK_BUFFER +#undef UNLOCK_BUFFER diff --git a/src/sdk/C/LzFindMt.h b/src/sdk/C/LzFindMt.h index ef431e3..89984f5 100644 --- a/src/sdk/C/LzFindMt.h +++ b/src/sdk/C/LzFindMt.h @@ -1,42 +1,42 @@ /* LzFindMt.h -- multithreaded Match finder for LZ algorithms -2018-07-04 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ -#ifndef __LZ_FIND_MT_H -#define __LZ_FIND_MT_H +#ifndef ZIP7_INC_LZ_FIND_MT_H +#define ZIP7_INC_LZ_FIND_MT_H #include "LzFind.h" #include "Threads.h" EXTERN_C_BEGIN -#define kMtHashBlockSize (1 << 13) -#define kMtHashNumBlocks (1 << 3) -#define kMtHashNumBlocksMask (kMtHashNumBlocks - 1) - -#define kMtBtBlockSize (1 << 14) -#define kMtBtNumBlocks (1 << 6) -#define kMtBtNumBlocksMask (kMtBtNumBlocks - 1) - -typedef struct _CMtSync +typedef struct { + UInt32 numProcessedBlocks; + Int32 affinityGroup; + UInt64 affinityInGroup; + UInt64 affinity; + CThread thread; + BoolInt wasCreated; BoolInt needStart; + BoolInt csWasInitialized; + BoolInt csWasEntered; + BoolInt exit; BoolInt stopWriting; - CThread thread; CAutoResetEvent canStart; - CAutoResetEvent wasStarted; CAutoResetEvent wasStopped; CSemaphore freeSemaphore; CSemaphore filledSemaphore; - BoolInt csWasInitialized; - BoolInt csWasEntered; CCriticalSection cs; - UInt32 numProcessedBlocks; + // UInt32 numBlocks_Sent; } CMtSync; -typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distances); + +struct CMatchFinderMt_; + +typedef UInt32 * (*Mf_Mix_Matches)(struct CMatchFinderMt_ *p, UInt32 matchMinPos, UInt32 *distances); /* kMtCacheLineDummy must be >= size_of_CPU_cache_line */ #define kMtCacheLineDummy 128 @@ -44,23 +44,28 @@ typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distance typedef void (*Mf_GetHeads)(const Byte *buffer, UInt32 pos, UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc); -typedef struct _CMatchFinderMt +typedef struct CMatchFinderMt_ { /* LZ */ const Byte *pointerToCurPos; UInt32 *btBuf; - UInt32 btBufPos; - UInt32 btBufPosLimit; + const UInt32 *btBufPos; + const UInt32 *btBufPosLimit; UInt32 lzPos; UInt32 btNumAvailBytes; UInt32 *hash; UInt32 fixedHashSize; + // UInt32 hash4Mask; UInt32 historySize; const UInt32 *crc; Mf_Mix_Matches MixMatchesFunc; - + UInt32 failure_LZ_BT; // failure in BT transfered to LZ + // UInt32 failure_LZ_LZ; // failure in LZ tables + UInt32 failureBuf[1]; + // UInt32 crc[256]; + /* LZ + BT */ CMtSync btSync; Byte btDummy[kMtCacheLineDummy]; @@ -70,6 +75,8 @@ typedef struct _CMatchFinderMt UInt32 hashBufPos; UInt32 hashBufPosLimit; UInt32 hashNumAvail; + UInt32 failure_BT; + CLzRef *son; UInt32 matchMaxLen; @@ -77,7 +84,7 @@ typedef struct _CMatchFinderMt UInt32 pos; const Byte *buffer; UInt32 cyclicBufferPos; - UInt32 cyclicBufferSize; /* it must be historySize + 1 */ + UInt32 cyclicBufferSize; /* it must be = (historySize + 1) */ UInt32 cutValue; /* BT + Hash */ @@ -87,13 +94,19 @@ typedef struct _CMatchFinderMt /* Hash */ Mf_GetHeads GetHeadsFunc; CMatchFinder *MatchFinder; + // CMatchFinder MatchFinder; } CMatchFinderMt; +// only for Mt part void MatchFinderMt_Construct(CMatchFinderMt *p); void MatchFinderMt_Destruct(CMatchFinderMt *p, ISzAllocPtr alloc); + SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddBufferBefore, UInt32 matchMaxLen, 
UInt32 keepAddBufferAfter, ISzAllocPtr alloc); -void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder *vTable); +void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable); + +/* call MatchFinderMt_InitMt() before IMatchFinder::Init() */ +SRes MatchFinderMt_InitMt(CMatchFinderMt *p); void MatchFinderMt_ReleaseStream(CMatchFinderMt *p); EXTERN_C_END diff --git a/src/sdk/C/LzFindOpt.c b/src/sdk/C/LzFindOpt.c new file mode 100644 index 0000000..85bdc13 --- /dev/null +++ b/src/sdk/C/LzFindOpt.c @@ -0,0 +1,578 @@ +/* LzFindOpt.c -- multithreaded Match finder for LZ algorithms +2023-04-02 : Igor Pavlov : Public domain */ + +#include "Precomp.h" + +#include "CpuArch.h" +#include "LzFind.h" + +// #include "LzFindMt.h" + +// #define LOG_ITERS + +// #define LOG_THREAD + +#ifdef LOG_THREAD +#include +#define PRF(x) x +#else +// #define PRF(x) +#endif + +#ifdef LOG_ITERS +#include +UInt64 g_NumIters_Tree; +UInt64 g_NumIters_Loop; +UInt64 g_NumIters_Bytes; +#define LOG_ITER(x) x +#else +#define LOG_ITER(x) +#endif + +// ---------- BT THREAD ---------- + +#define USE_SON_PREFETCH +#define USE_LONG_MATCH_OPT + +#define kEmptyHashValue 0 + +// #define CYC_TO_POS_OFFSET 0 + +// #define CYC_TO_POS_OFFSET 1 // for debug + +/* +Z7_NO_INLINE +UInt32 * Z7_FASTCALL GetMatchesSpecN_1(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, UInt32 *posRes) +{ + do + { + UInt32 delta; + if (hash == size) + break; + delta = *hash++; + + if (delta == 0 || delta > (UInt32)pos) + return NULL; + + lenLimit++; + + if (delta == (UInt32)pos) + { + CLzRef *ptr1 = son + ((size_t)pos << 1) - CYC_TO_POS_OFFSET * 2; + *d++ = 0; + ptr1[0] = kEmptyHashValue; + ptr1[1] = kEmptyHashValue; + } +else +{ + UInt32 *_distances = ++d; + + CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1; + CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; + + const Byte *len0 = cur, *len1 = cur; + UInt32 cutValue = _cutValue; + const Byte *maxLen = cur + _maxLen; + + for (LOG_ITER(g_NumIters_Tree++);;) + { + LOG_ITER(g_NumIters_Loop++); + { + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + CLzRef *pair = son + ((size_t)(((ptrdiff_t)pos - CYC_TO_POS_OFFSET) + diff) << 1); + const Byte *len = (len0 < len1 ? 
len0 : len1); + + #ifdef USE_SON_PREFETCH + const UInt32 pair0 = *pair; + #endif + + if (len[diff] == len[0]) + { + if (++len != lenLimit && len[diff] == len[0]) + while (++len != lenLimit) + { + LOG_ITER(g_NumIters_Bytes++); + if (len[diff] != len[0]) + break; + } + if (maxLen < len) + { + maxLen = len; + *d++ = (UInt32)(len - cur); + *d++ = delta - 1; + + if (len == lenLimit) + { + const UInt32 pair1 = pair[1]; + *ptr1 = + #ifdef USE_SON_PREFETCH + pair0; + #else + pair[0]; + #endif + *ptr0 = pair1; + + _distances[-1] = (UInt32)(d - _distances); + + #ifdef USE_LONG_MATCH_OPT + + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + + { + for (;;) + { + hash++; + pos++; + cur++; + lenLimit++; + { + CLzRef *ptr = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; + #if 0 + *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff]; + #else + const UInt32 p0 = ptr[0 + (diff * 2)]; + const UInt32 p1 = ptr[1 + (diff * 2)]; + ptr[0] = p0; + ptr[1] = p1; + // ptr[0] = ptr[0 + (diff * 2)]; + // ptr[1] = ptr[1 + (diff * 2)]; + #endif + } + // PrintSon(son + 2, pos - 1); + // printf("\npos = %x delta = %x\n", pos, delta); + len++; + *d++ = 2; + *d++ = (UInt32)(len - cur); + *d++ = delta - 1; + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + } + } + #endif + + break; + } + } + } + + { + const UInt32 curMatch = (UInt32)pos - delta; // (UInt32)(pos + diff); + if (len[diff] < len[0]) + { + delta = pair[1]; + if (delta >= curMatch) + return NULL; + *ptr1 = curMatch; + ptr1 = pair + 1; + len1 = len; + } + else + { + delta = *pair; + if (delta >= curMatch) + return NULL; + *ptr0 = curMatch; + ptr0 = pair; + len0 = len; + } + + delta = (UInt32)pos - delta; + + if (--cutValue == 0 || delta >= pos) + { + *ptr0 = *ptr1 = kEmptyHashValue; + _distances[-1] = (UInt32)(d - _distances); + break; + } + } + } + } // for (tree iterations) +} + pos++; + cur++; + } + while (d < limit); + *posRes = (UInt32)pos; + return d; +} +*/ + +/* define cbs if you use 2 functions. + GetMatchesSpecN_1() : (pos < _cyclicBufferSize) + GetMatchesSpecN_2() : (pos >= _cyclicBufferSize) + + do not define cbs if you use 1 function: + GetMatchesSpecN_2() +*/ + +// #define cbs _cyclicBufferSize + +/* + we use size_t for (pos) and (_cyclicBufferPos_ instead of UInt32 + to eliminate "movsx" BUG in old MSVC x64 compiler. 
+*/ + +UInt32 * Z7_FASTCALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes); + +Z7_NO_INLINE +UInt32 * Z7_FASTCALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes) +{ + do // while (hash != size) + { + UInt32 delta; + + #ifndef cbs + UInt32 cbs; + #endif + + if (hash == size) + break; + + delta = *hash++; + + if (delta == 0) + return NULL; + + lenLimit++; + + #ifndef cbs + cbs = _cyclicBufferSize; + if ((UInt32)pos < cbs) + { + if (delta > (UInt32)pos) + return NULL; + cbs = (UInt32)pos; + } + #endif + + if (delta >= cbs) + { + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + *d++ = 0; + ptr1[0] = kEmptyHashValue; + ptr1[1] = kEmptyHashValue; + } +else +{ + UInt32 *_distances = ++d; + + CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + + UInt32 cutValue = _cutValue; + const Byte *len0 = cur, *len1 = cur; + const Byte *maxLen = cur + _maxLen; + + // if (cutValue == 0) { *ptr0 = *ptr1 = kEmptyHashValue; } else + for (LOG_ITER(g_NumIters_Tree++);;) + { + LOG_ITER(g_NumIters_Loop++); + { + // SPEC code + CLzRef *pair = son + ((size_t)((ptrdiff_t)_cyclicBufferPos - (ptrdiff_t)delta + + (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0) + ) << 1); + + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + const Byte *len = (len0 < len1 ? len0 : len1); + + #ifdef USE_SON_PREFETCH + const UInt32 pair0 = *pair; + #endif + + if (len[diff] == len[0]) + { + if (++len != lenLimit && len[diff] == len[0]) + while (++len != lenLimit) + { + LOG_ITER(g_NumIters_Bytes++); + if (len[diff] != len[0]) + break; + } + if (maxLen < len) + { + maxLen = len; + *d++ = (UInt32)(len - cur); + *d++ = delta - 1; + + if (len == lenLimit) + { + const UInt32 pair1 = pair[1]; + *ptr1 = + #ifdef USE_SON_PREFETCH + pair0; + #else + pair[0]; + #endif + *ptr0 = pair1; + + _distances[-1] = (UInt32)(d - _distances); + + #ifdef USE_LONG_MATCH_OPT + + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + + { + for (;;) + { + *d++ = 2; + *d++ = (UInt32)(lenLimit - cur); + *d++ = delta - 1; + cur++; + lenLimit++; + // SPEC + _cyclicBufferPos++; + { + // SPEC code + CLzRef *dest = son + ((size_t)(_cyclicBufferPos) << 1); + const CLzRef *src = dest + ((diff + + (ptrdiff_t)(UInt32)((_cyclicBufferPos < delta) ? 
cbs : 0)) << 1); + // CLzRef *ptr = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; + #if 0 + *(UInt64 *)(void *)dest = *((const UInt64 *)(const void *)src); + #else + const UInt32 p0 = src[0]; + const UInt32 p1 = src[1]; + dest[0] = p0; + dest[1] = p1; + #endif + } + pos++; + hash++; + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + } // for() end for long matches + } + #endif + + break; // break from TREE iterations + } + } + } + { + const UInt32 curMatch = (UInt32)pos - delta; // (UInt32)(pos + diff); + if (len[diff] < len[0]) + { + delta = pair[1]; + *ptr1 = curMatch; + ptr1 = pair + 1; + len1 = len; + if (delta >= curMatch) + return NULL; + } + else + { + delta = *pair; + *ptr0 = curMatch; + ptr0 = pair; + len0 = len; + if (delta >= curMatch) + return NULL; + } + delta = (UInt32)pos - delta; + + if (--cutValue == 0 || delta >= cbs) + { + *ptr0 = *ptr1 = kEmptyHashValue; + _distances[-1] = (UInt32)(d - _distances); + break; + } + } + } + } // for (tree iterations) +} + pos++; + _cyclicBufferPos++; + cur++; + } + while (d < limit); + *posRes = (UInt32)pos; + return d; +} + + + +/* +typedef UInt32 uint32plus; // size_t + +UInt32 * Z7_FASTCALL GetMatchesSpecN_3(uint32plus lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, uint32plus _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes) +{ + do // while (hash != size) + { + UInt32 delta; + + #ifndef cbs + UInt32 cbs; + #endif + + if (hash == size) + break; + + delta = *hash++; + + if (delta == 0) + return NULL; + + #ifndef cbs + cbs = _cyclicBufferSize; + if ((UInt32)pos < cbs) + { + if (delta > (UInt32)pos) + return NULL; + cbs = (UInt32)pos; + } + #endif + + if (delta >= cbs) + { + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + *d++ = 0; + ptr1[0] = kEmptyHashValue; + ptr1[1] = kEmptyHashValue; + } +else +{ + CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + UInt32 *_distances = ++d; + uint32plus len0 = 0, len1 = 0; + UInt32 cutValue = _cutValue; + uint32plus maxLen = _maxLen; + // lenLimit++; // const Byte *lenLimit = cur + _lenLimit; + + for (LOG_ITER(g_NumIters_Tree++);;) + { + LOG_ITER(g_NumIters_Loop++); + { + // const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + CLzRef *pair = son + ((size_t)((ptrdiff_t)_cyclicBufferPos - delta + + (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0) + ) << 1); + const Byte *pb = cur - delta; + uint32plus len = (len0 < len1 ? 
len0 : len1); + + #ifdef USE_SON_PREFETCH + const UInt32 pair0 = *pair; + #endif + + if (pb[len] == cur[len]) + { + if (++len != lenLimit && pb[len] == cur[len]) + while (++len != lenLimit) + if (pb[len] != cur[len]) + break; + if (maxLen < len) + { + maxLen = len; + *d++ = (UInt32)len; + *d++ = delta - 1; + if (len == lenLimit) + { + { + const UInt32 pair1 = pair[1]; + *ptr0 = pair1; + *ptr1 = + #ifdef USE_SON_PREFETCH + pair0; + #else + pair[0]; + #endif + } + + _distances[-1] = (UInt32)(d - _distances); + + #ifdef USE_LONG_MATCH_OPT + + if (hash == size || *hash != delta || pb[lenLimit] != cur[lenLimit] || d >= limit) + break; + + { + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + for (;;) + { + *d++ = 2; + *d++ = (UInt32)lenLimit; + *d++ = delta - 1; + _cyclicBufferPos++; + { + CLzRef *dest = son + ((size_t)_cyclicBufferPos << 1); + const CLzRef *src = dest + ((diff + + (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0)) << 1); + #if 0 + *(UInt64 *)(void *)dest = *((const UInt64 *)(const void *)src); + #else + const UInt32 p0 = src[0]; + const UInt32 p1 = src[1]; + dest[0] = p0; + dest[1] = p1; + #endif + } + hash++; + pos++; + cur++; + pb++; + if (hash == size || *hash != delta || pb[lenLimit] != cur[lenLimit] || d >= limit) + break; + } + } + #endif + + break; + } + } + } + { + const UInt32 curMatch = (UInt32)pos - delta; + if (pb[len] < cur[len]) + { + delta = pair[1]; + *ptr1 = curMatch; + ptr1 = pair + 1; + len1 = len; + } + else + { + delta = *pair; + *ptr0 = curMatch; + ptr0 = pair; + len0 = len; + } + + { + if (delta >= curMatch) + return NULL; + delta = (UInt32)pos - delta; + if (delta >= cbs + // delta >= _cyclicBufferSize || delta >= pos + || --cutValue == 0) + { + *ptr0 = *ptr1 = kEmptyHashValue; + _distances[-1] = (UInt32)(d - _distances); + break; + } + } + } + } + } // for (tree iterations) +} + pos++; + _cyclicBufferPos++; + cur++; + } + while (d < limit); + *posRes = (UInt32)pos; + return d; +} +*/ diff --git a/src/sdk/C/LzHash.h b/src/sdk/C/LzHash.h index e7c9423..2b6290b 100644 --- a/src/sdk/C/LzHash.h +++ b/src/sdk/C/LzHash.h @@ -1,57 +1,34 @@ -/* LzHash.h -- HASH functions for LZ algorithms -2015-04-12 : Igor Pavlov : Public domain */ +/* LzHash.h -- HASH constants for LZ algorithms +2023-03-05 : Igor Pavlov : Public domain */ -#ifndef __LZ_HASH_H -#define __LZ_HASH_H +#ifndef ZIP7_INC_LZ_HASH_H +#define ZIP7_INC_LZ_HASH_H + +/* + (kHash2Size >= (1 << 8)) : Required + (kHash3Size >= (1 << 16)) : Required +*/ #define kHash2Size (1 << 10) #define kHash3Size (1 << 16) -#define kHash4Size (1 << 20) +// #define kHash4Size (1 << 20) #define kFix3HashSize (kHash2Size) #define kFix4HashSize (kHash2Size + kHash3Size) -#define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size) - -#define HASH2_CALC hv = cur[0] | ((UInt32)cur[1] << 8); - -#define HASH3_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; } - -#define HASH4_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - temp ^= ((UInt32)cur[2] << 8); \ - h3 = temp & (kHash3Size - 1); \ - hv = (temp ^ (p->crc[cur[3]] << 5)) & p->hashMask; } - -#define HASH5_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - temp ^= ((UInt32)cur[2] << 8); \ - h3 = temp & (kHash3Size - 1); \ - temp ^= (p->crc[cur[3]] << 5); \ - h4 = temp & (kHash4Size - 1); \ - hv = (temp ^ (p->crc[cur[4]] << 3)) & p->hashMask; } - -/* #define HASH_ZIP_CALC hv = ((cur[0] | 
((UInt32)cur[1] << 8)) ^ p->crc[cur[2]]) & 0xFFFF; */ -#define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF; - - -#define MT_HASH2_CALC \ - h2 = (p->crc[cur[0]] ^ cur[1]) & (kHash2Size - 1); - -#define MT_HASH3_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); } - -#define MT_HASH4_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - temp ^= ((UInt32)cur[2] << 8); \ - h3 = temp & (kHash3Size - 1); \ - h4 = (temp ^ (p->crc[cur[3]] << 5)) & (kHash4Size - 1); } +// #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size) + +/* + We use up to 3 crc values for hash: + crc0 + crc1 << Shift_1 + crc2 << Shift_2 + (Shift_1 = 5) and (Shift_2 = 10) is good tradeoff. + Small values for Shift are not good for collision rate. + Big value for Shift_2 increases the minimum size + of hash table, that will be slow for small files. +*/ + +#define kLzHash_CrcShift_1 5 +#define kLzHash_CrcShift_2 10 #endif diff --git a/src/sdk/C/Lzma2Dec.c b/src/sdk/C/Lzma2Dec.c index 4e138a4..8bf54e4 100644 --- a/src/sdk/C/Lzma2Dec.c +++ b/src/sdk/C/Lzma2Dec.c @@ -1,5 +1,5 @@ /* Lzma2Dec.c -- LZMA2 Decoder -2019-02-02 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ /* #define SHOW_DEBUG_INFO */ @@ -71,14 +71,14 @@ static SRes Lzma2Dec_GetOldProps(Byte prop, Byte *props) SRes Lzma2Dec_AllocateProbs(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc) { Byte props[LZMA_PROPS_SIZE]; - RINOK(Lzma2Dec_GetOldProps(prop, props)); + RINOK(Lzma2Dec_GetOldProps(prop, props)) return LzmaDec_AllocateProbs(&p->decoder, props, LZMA_PROPS_SIZE, alloc); } SRes Lzma2Dec_Allocate(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc) { Byte props[LZMA_PROPS_SIZE]; - RINOK(Lzma2Dec_GetOldProps(prop, props)); + RINOK(Lzma2Dec_GetOldProps(prop, props)) return LzmaDec_Allocate(&p->decoder, props, LZMA_PROPS_SIZE, alloc); } @@ -93,7 +93,8 @@ void Lzma2Dec_Init(CLzma2Dec *p) LzmaDec_Init(&p->decoder); } -static ELzma2State Lzma2Dec_UpdateState(CLzma2Dec *p, Byte b) +// ELzma2State +static unsigned Lzma2Dec_UpdateState(CLzma2Dec *p, Byte b) { switch (p->state) { @@ -156,8 +157,10 @@ static ELzma2State Lzma2Dec_UpdateState(CLzma2Dec *p, Byte b) p->decoder.prop.lp = (Byte)lp; return LZMA2_STATE_DATA; } + + default: + return LZMA2_STATE_ERROR; } - return LZMA2_STATE_ERROR; } static void LzmaDec_UpdateWithUncompressed(CLzmaDec *p, const Byte *src, SizeT size) @@ -473,8 +476,8 @@ SRes Lzma2Decode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, SizeT outSize = *destLen, inSize = *srcLen; *destLen = *srcLen = 0; *status = LZMA_STATUS_NOT_SPECIFIED; - Lzma2Dec_Construct(&p); - RINOK(Lzma2Dec_AllocateProbs(&p, prop, alloc)); + Lzma2Dec_CONSTRUCT(&p) + RINOK(Lzma2Dec_AllocateProbs(&p, prop, alloc)) p.decoder.dic = dest; p.decoder.dicBufSize = outSize; Lzma2Dec_Init(&p); @@ -486,3 +489,5 @@ SRes Lzma2Decode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, Lzma2Dec_FreeProbs(&p, alloc); return res; } + +#undef PRF diff --git a/src/sdk/C/Lzma2Dec.h b/src/sdk/C/Lzma2Dec.h index b8ddeac..1f5233a 100644 --- a/src/sdk/C/Lzma2Dec.h +++ b/src/sdk/C/Lzma2Dec.h @@ -1,8 +1,8 @@ /* Lzma2Dec.h -- LZMA2 Decoder -2018-02-19 : Igor Pavlov : Public domain */ +2023-03-03 : Igor Pavlov : Public domain */ -#ifndef __LZMA2_DEC_H -#define __LZMA2_DEC_H +#ifndef ZIP7_INC_LZMA2_DEC_H +#define ZIP7_INC_LZMA2_DEC_H #include "LzmaDec.h" @@ -22,9 +22,10 @@ typedef struct CLzmaDec decoder; } 
CLzma2Dec; -#define Lzma2Dec_Construct(p) LzmaDec_Construct(&(p)->decoder) -#define Lzma2Dec_FreeProbs(p, alloc) LzmaDec_FreeProbs(&(p)->decoder, alloc) -#define Lzma2Dec_Free(p, alloc) LzmaDec_Free(&(p)->decoder, alloc) +#define Lzma2Dec_CONSTRUCT(p) LzmaDec_CONSTRUCT(&(p)->decoder) +#define Lzma2Dec_Construct(p) Lzma2Dec_CONSTRUCT(p) +#define Lzma2Dec_FreeProbs(p, alloc) LzmaDec_FreeProbs(&(p)->decoder, alloc) +#define Lzma2Dec_Free(p, alloc) LzmaDec_Free(&(p)->decoder, alloc) SRes Lzma2Dec_AllocateProbs(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc); SRes Lzma2Dec_Allocate(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc); @@ -90,7 +91,7 @@ Lzma2Dec_GetUnpackExtra() returns the value that shows at current input positon. */ -#define Lzma2Dec_GetUnpackExtra(p) ((p)->isExtraMode ? (p)->unpackSize : 0); +#define Lzma2Dec_GetUnpackExtra(p) ((p)->isExtraMode ? (p)->unpackSize : 0) /* ---------- One Call Interface ---------- */ diff --git a/src/sdk/C/Lzma2DecMt.c b/src/sdk/C/Lzma2DecMt.c index 988643d..4bc4dde 100644 --- a/src/sdk/C/Lzma2DecMt.c +++ b/src/sdk/C/Lzma2DecMt.c @@ -1,44 +1,44 @@ /* Lzma2DecMt.c -- LZMA2 Decoder Multi-thread -2019-02-02 : Igor Pavlov : Public domain */ +2023-04-13 : Igor Pavlov : Public domain */ #include "Precomp.h" // #define SHOW_DEBUG_INFO +// #define Z7_ST #ifdef SHOW_DEBUG_INFO #include #endif -#ifdef SHOW_DEBUG_INFO -#define PRF(x) x -#else -#define PRF(x) -#endif - -#define PRF_STR(s) PRF(printf("\n" s "\n")) -#define PRF_STR_INT(s, d) PRF(printf("\n" s " %d\n", (unsigned)d)) -#define PRF_STR_INT_2(s, d1, d2) PRF(printf("\n" s " %d %d\n", (unsigned)d1, (unsigned)d2)) - -// #define _7ZIP_ST - #include "Alloc.h" #include "Lzma2Dec.h" #include "Lzma2DecMt.h" -#ifndef _7ZIP_ST +#ifndef Z7_ST #include "MtDec.h" + +#define LZMA2DECMT_OUT_BLOCK_MAX_DEFAULT (1 << 28) #endif -#define LZMA2DECMT_OUT_BLOCK_MAX_DEFAULT (1 << 28) +#ifndef Z7_ST +#ifdef SHOW_DEBUG_INFO +#define PRF(x) x +#else +#define PRF(x) +#endif +#define PRF_STR(s) PRF(printf("\n" s "\n");) +#define PRF_STR_INT_2(s, d1, d2) PRF(printf("\n" s " %d %d\n", (unsigned)d1, (unsigned)d2);) +#endif + void Lzma2DecMtProps_Init(CLzma2DecMtProps *p) { p->inBufSize_ST = 1 << 20; p->outStep_ST = 1 << 20; - #ifndef _7ZIP_ST + #ifndef Z7_ST p->numThreads = 1; p->inBufSize_MT = 1 << 18; p->outBlockMax = LZMA2DECMT_OUT_BLOCK_MAX_DEFAULT; @@ -48,7 +48,7 @@ void Lzma2DecMtProps_Init(CLzma2DecMtProps *p) -#ifndef _7ZIP_ST +#ifndef Z7_ST /* ---------- CLzma2DecMtThread ---------- */ @@ -81,7 +81,7 @@ typedef struct /* ---------- CLzma2DecMt ---------- */ -typedef struct +struct CLzma2DecMt { // ISzAllocPtr alloc; ISzAllocPtr allocMid; @@ -90,9 +90,9 @@ typedef struct CLzma2DecMtProps props; Byte prop; - ISeqInStream *inStream; - ISeqOutStream *outStream; - ICompressProgress *progress; + ISeqInStreamPtr inStream; + ISeqOutStreamPtr outStream; + ICompressProgressPtr progress; BoolInt finishMode; BoolInt outSize_Defined; @@ -111,14 +111,13 @@ typedef struct size_t inPos; size_t inLim; - #ifndef _7ZIP_ST + #ifndef Z7_ST UInt64 outProcessed_Parse; BoolInt mtc_WasConstructed; CMtDec mtc; - CLzma2DecMtThread coders[MTDEC__THREADS_MAX]; + CLzma2DecMtThread coders[MTDEC_THREADS_MAX]; #endif - -} CLzma2DecMt; +}; @@ -142,11 +141,11 @@ CLzma2DecMtHandle Lzma2DecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid) // Lzma2DecMtProps_Init(&p->props); - #ifndef _7ZIP_ST + #ifndef Z7_ST p->mtc_WasConstructed = False; { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CLzma2DecMtThread *t = 
&p->coders[i]; t->dec_created = False; @@ -156,16 +155,16 @@ CLzma2DecMtHandle Lzma2DecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid) } #endif - return p; + return (CLzma2DecMtHandle)(void *)p; } -#ifndef _7ZIP_ST +#ifndef Z7_ST static void Lzma2DecMt_FreeOutBufs(CLzma2DecMt *p) { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CLzma2DecMtThread *t = &p->coders[i]; if (t->outBuf) @@ -196,13 +195,15 @@ static void Lzma2DecMt_FreeSt(CLzma2DecMt *p) } -void Lzma2DecMt_Destroy(CLzma2DecMtHandle pp) +// #define GET_CLzma2DecMt_p CLzma2DecMt *p = (CLzma2DecMt *)(void *)pp; + +void Lzma2DecMt_Destroy(CLzma2DecMtHandle p) { - CLzma2DecMt *p = (CLzma2DecMt *)pp; + // GET_CLzma2DecMt_p Lzma2DecMt_FreeSt(p); - #ifndef _7ZIP_ST + #ifndef Z7_ST if (p->mtc_WasConstructed) { @@ -211,7 +212,7 @@ void Lzma2DecMt_Destroy(CLzma2DecMtHandle pp) } { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CLzma2DecMtThread *t = &p->coders[i]; if (t->dec_created) @@ -226,19 +227,19 @@ void Lzma2DecMt_Destroy(CLzma2DecMtHandle pp) #endif - ISzAlloc_Free(p->alignOffsetAlloc.baseAlloc, pp); + ISzAlloc_Free(p->alignOffsetAlloc.baseAlloc, p); } -#ifndef _7ZIP_ST +#ifndef Z7_ST static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCallbackInfo *cc) { CLzma2DecMt *me = (CLzma2DecMt *)obj; CLzma2DecMtThread *t = &me->coders[coderIndex]; - PRF_STR_INT_2("Parse", coderIndex, cc->srcSize); + PRF_STR_INT_2("Parse", coderIndex, cc->srcSize) cc->state = MTDEC_PARSE_CONTINUE; @@ -246,7 +247,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa { if (!t->dec_created) { - Lzma2Dec_Construct(&t->dec); + Lzma2Dec_CONSTRUCT(&t->dec) t->dec_created = True; AlignOffsetAlloc_CreateVTable(&t->alloc); { @@ -255,7 +256,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa const unsigned kNumAlignBits = 12; const unsigned kNumCacheLineBits = 7; /* <= kNumAlignBits */ t->alloc.numAlignBits = kNumAlignBits; - t->alloc.offset = ((UInt32)coderIndex * ((1 << 11) + (1 << 8) + (1 << 6))) & ((1 << kNumAlignBits) - (1 << kNumCacheLineBits)); + t->alloc.offset = ((UInt32)coderIndex * (((unsigned)1 << 11) + (1 << 8) + (1 << 6))) & (((unsigned)1 << kNumAlignBits) - ((unsigned)1 << kNumCacheLineBits)); t->alloc.baseAlloc = me->alignOffsetAlloc.baseAlloc; } } @@ -297,7 +298,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa // that must be finished at position <= outBlockMax. 
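/* Editorial sketch, not part of the patch: the LZMA2 chunk header layout that
   Lzma2Dec_Parse() walks while this Parse callback searches for a block
   boundary (a chunk that resets the dictionary). Layout follows the public
   LZMA2 format; struct and function names are illustrative and assume the
   SDK's Byte/UInt32 typedefs. */
typedef struct { unsigned ctrl; UInt32 unpackSize; UInt32 packSize; int dicReset; } Lzma2ChunkHdrSketch;

static int ParseLzma2ChunkHeader_Sketch(const Byte *p, size_t avail, Lzma2ChunkHdrSketch *h)
{
  if (avail < 1) return 0;
  h->ctrl = p[0];
  if (h->ctrl == 0)          /* end-of-stream marker */
  { h->unpackSize = h->packSize = 0; h->dicReset = 0; return 1; }
  if (h->ctrl < 0x80)        /* 1 or 2: uncompressed chunk (1 = with dictionary reset) */
  {
    if (h->ctrl > 2 || avail < 3) return 0;
    h->unpackSize = (((UInt32)p[1] << 8) | p[2]) + 1;
    h->packSize = h->unpackSize;               /* data is stored verbatim */
    h->dicReset = (h->ctrl == 1);
    return 1;
  }
  if (avail < 5) return 0;   /* LZMA chunk: 5 header bytes, plus 1 props byte if reset mode >= 2 */
  h->unpackSize = ((((UInt32)h->ctrl & 0x1F) << 16) | ((UInt32)p[1] << 8) | p[2]) + 1;
  h->packSize = (((UInt32)p[3] << 8) | p[4]) + 1;
  h->dicReset = (((h->ctrl >> 5) & 3) == 3);   /* reset mode 3 resets props and dictionary */
  return 1;
}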
{ - const SizeT srcOrig = cc->srcSize; + const size_t srcOrig = cc->srcSize; SizeT srcSize_Point = 0; SizeT dicPos_Point = 0; @@ -306,10 +307,10 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa for (;;) { - SizeT srcCur = srcOrig - cc->srcSize; + SizeT srcCur = (SizeT)(srcOrig - cc->srcSize); status = Lzma2Dec_Parse(&t->dec, - limit - t->dec.decoder.dicPos, + (SizeT)limit - t->dec.decoder.dicPos, cc->src + cc->srcSize, &srcCur, checkFinishBlock); @@ -333,7 +334,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa if (t->dec.decoder.dicPos >= (1 << 14)) break; dicPos_Point = t->dec.decoder.dicPos; - srcSize_Point = cc->srcSize; + srcSize_Point = (SizeT)cc->srcSize; continue; } @@ -391,7 +392,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa if (unpackRem != 0) { /* we also reserve space for max possible number of output bytes of current LZMA chunk */ - SizeT rem = limit - dicPos; + size_t rem = limit - dicPos; if (rem > unpackRem) rem = unpackRem; dicPos += rem; @@ -444,7 +445,7 @@ static SRes Lzma2DecMt_MtCallback_PreCode(void *pp, unsigned coderIndex) } t->dec.decoder.dic = dest; - t->dec.decoder.dicBufSize = t->outPreSize; + t->dec.decoder.dicBufSize = (SizeT)t->outPreSize; t->needInit = True; @@ -462,7 +463,7 @@ static SRes Lzma2DecMt_MtCallback_Code(void *pp, unsigned coderIndex, UNUSED_VAR(srcFinished) - PRF_STR_INT_2("Code", coderIndex, srcSize); + PRF_STR_INT_2("Code", coderIndex, srcSize) *inCodePos = t->inCodeSize; *outCodePos = 0; @@ -476,13 +477,13 @@ static SRes Lzma2DecMt_MtCallback_Code(void *pp, unsigned coderIndex, { ELzmaStatus status; - size_t srcProcessed = srcSize; + SizeT srcProcessed = (SizeT)srcSize; BoolInt blockWasFinished = ((int)t->parseStatus == LZMA_STATUS_FINISHED_WITH_MARK || t->parseStatus == LZMA2_PARSE_STATUS_NEW_BLOCK); SRes res = Lzma2Dec_DecodeToDic(&t->dec, - t->outPreSize, + (SizeT)t->outPreSize, src, &srcProcessed, blockWasFinished ? 
LZMA_FINISH_END : LZMA_FINISH_ANY, &status); @@ -527,7 +528,7 @@ static SRes Lzma2DecMt_MtCallback_Code(void *pp, unsigned coderIndex, static SRes Lzma2DecMt_MtCallback_Write(void *pp, unsigned coderIndex, BoolInt needWriteToStream, - const Byte *src, size_t srcSize, + const Byte *src, size_t srcSize, BoolInt isCross, BoolInt *needContinue, BoolInt *canRecode) { CLzma2DecMt *me = (CLzma2DecMt *)pp; @@ -536,12 +537,14 @@ static SRes Lzma2DecMt_MtCallback_Write(void *pp, unsigned coderIndex, const Byte *data = t->outBuf; BoolInt needContinue2 = True; - PRF_STR_INT_2("Write", coderIndex, srcSize); + UNUSED_VAR(src) + UNUSED_VAR(srcSize) + UNUSED_VAR(isCross) + + PRF_STR_INT_2("Write", coderIndex, srcSize) *needContinue = False; *canRecode = True; - UNUSED_VAR(src) - UNUSED_VAR(srcSize) if ( // t->parseStatus == LZMA_STATUS_FINISHED_WITH_MARK @@ -586,7 +589,7 @@ static SRes Lzma2DecMt_MtCallback_Write(void *pp, unsigned coderIndex, *needContinue = needContinue2; return SZ_OK; } - RINOK(MtProgress_ProgressAdd(&me->mtc.mtProgress, 0, 0)); + RINOK(MtProgress_ProgressAdd(&me->mtc.mtProgress, 0, 0)) } } @@ -609,11 +612,11 @@ static SRes Lzma2Dec_Prepare_ST(CLzma2DecMt *p) { if (!p->dec_created) { - Lzma2Dec_Construct(&p->dec); + Lzma2Dec_CONSTRUCT(&p->dec) p->dec_created = True; } - RINOK(Lzma2Dec_Allocate(&p->dec, p->prop, &p->alignOffsetAlloc.vt)); + RINOK(Lzma2Dec_Allocate(&p->dec, p->prop, &p->alignOffsetAlloc.vt)) if (!p->inBuf || p->inBufSize != p->props.inBufSize_ST) { @@ -632,7 +635,7 @@ static SRes Lzma2Dec_Prepare_ST(CLzma2DecMt *p) static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p - #ifndef _7ZIP_ST + #ifndef Z7_ST , BoolInt tMode #endif ) @@ -644,7 +647,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p CLzma2Dec *dec; - #ifndef _7ZIP_ST + #ifndef Z7_ST if (tMode) { Lzma2DecMt_FreeOutBufs(p); @@ -652,7 +655,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p } #endif - RINOK(Lzma2Dec_Prepare_ST(p)); + RINOK(Lzma2Dec_Prepare_ST(p)) dec = &p->dec; @@ -679,7 +682,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p if (inPos == inLim) { - #ifndef _7ZIP_ST + #ifndef Z7_ST if (tMode) { inData = MtDec_Read(&p->mtc, &inLim); @@ -696,7 +699,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p inPos = 0; inLim = p->inBufSize; inData = p->inBuf; - p->readRes = ISeqInStream_Read(p->inStream, (void *)inData, &inLim); + p->readRes = ISeqInStream_Read(p->inStream, (void *)(p->inBuf), &inLim); // p->readProcessed += inLim; // inLim -= 5; p->readWasFinished = True; // for test if (inLim == 0 || p->readRes != SZ_OK) @@ -708,7 +711,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p { SizeT next = dec->decoder.dicBufSize; if (next - wrPos > p->props.outStep_ST) - next = wrPos + p->props.outStep_ST; + next = wrPos + (SizeT)p->props.outStep_ST; size = next - dicPos; } @@ -724,7 +727,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p } } - inProcessed = inLim - inPos; + inProcessed = (SizeT)(inLim - inPos); res = Lzma2Dec_DecodeToDic(dec, dicPos + size, inData + inPos, &inProcessed, finishMode, &status); @@ -753,7 +756,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p dec->decoder.dicPos = 0; wrPos = dec->decoder.dicPos; - RINOK(res2); + RINOK(res2) if (needStop) { @@ -786,7 +789,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p UInt64 outDelta = p->outProcessed - outPrev; if (inDelta >= (1 << 22) || outDelta >= (1 << 22)) { - RINOK(ICompressProgress_Progress(p->progress, p->inProcessed, p->outProcessed)); + RINOK(ICompressProgress_Progress(p->progress, p->inProcessed, p->outProcessed)) inPrev = p->inProcessed; 
outPrev = p->outProcessed; } @@ -796,20 +799,20 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p -SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, +SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p, Byte prop, const CLzma2DecMtProps *props, - ISeqOutStream *outStream, const UInt64 *outDataSize, int finishMode, + ISeqOutStreamPtr outStream, const UInt64 *outDataSize, int finishMode, // Byte *outBuf, size_t *outBufSize, - ISeqInStream *inStream, + ISeqInStreamPtr inStream, // const Byte *inData, size_t inDataSize, UInt64 *inProcessed, // UInt64 *outProcessed, int *isMT, - ICompressProgress *progress) + ICompressProgressPtr progress) { - CLzma2DecMt *p = (CLzma2DecMt *)pp; - #ifndef _7ZIP_ST + // GET_CLzma2DecMt_p + #ifndef Z7_ST BoolInt tMode; #endif @@ -838,11 +841,12 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, p->inProcessed = 0; p->readWasFinished = False; + p->readRes = SZ_OK; *isMT = False; - #ifndef _7ZIP_ST + #ifndef Z7_ST tMode = False; @@ -856,7 +860,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, if (p->props.numThreads > 1) { - IMtDecCallback vt; + IMtDecCallback2 vt; Lzma2DecMt_FreeSt(p); @@ -936,7 +940,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, p->readWasFinished = p->mtc.readWasFinished; p->inProcessed = p->mtc.inProcessed; - PRF_STR("----- decoding ST -----"); + PRF_STR("----- decoding ST -----") } } @@ -947,7 +951,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, { SRes res = Lzma2Dec_Decode_ST(p - #ifndef _7ZIP_ST + #ifndef Z7_ST , tMode #endif ); @@ -955,11 +959,16 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, *inProcessed = p->inProcessed; // res = SZ_OK; // for test - if (res == SZ_OK && p->readRes != SZ_OK) + if (res == SZ_ERROR_INPUT_EOF) + { + if (p->readRes != SZ_OK) + res = p->readRes; + } + else if (res == SZ_OK && p->readRes != SZ_OK) res = p->readRes; /* - #ifndef _7ZIP_ST + #ifndef Z7_ST if (res == SZ_OK && tMode && p->mtc.parseRes != SZ_OK) res = p->mtc.parseRes; #endif @@ -972,13 +981,13 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, /* ---------- Read from CLzma2DecMtHandle Interface ---------- */ -SRes Lzma2DecMt_Init(CLzma2DecMtHandle pp, +SRes Lzma2DecMt_Init(CLzma2DecMtHandle p, Byte prop, const CLzma2DecMtProps *props, const UInt64 *outDataSize, int finishMode, - ISeqInStream *inStream) + ISeqInStreamPtr inStream) { - CLzma2DecMt *p = (CLzma2DecMt *)pp; + // GET_CLzma2DecMt_p if (prop > 40) return SZ_ERROR_UNSUPPORTED; @@ -1007,11 +1016,11 @@ SRes Lzma2DecMt_Init(CLzma2DecMtHandle pp, } -SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp, +SRes Lzma2DecMt_Read(CLzma2DecMtHandle p, Byte *data, size_t *outSize, UInt64 *inStreamProcessed) { - CLzma2DecMt *p = (CLzma2DecMt *)pp; + // GET_CLzma2DecMt_p ELzmaFinishMode finishMode; SRes readRes; size_t size = *outSize; @@ -1047,8 +1056,8 @@ SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp, readRes = ISeqInStream_Read(p->inStream, p->inBuf, &p->inLim); } - inCur = p->inLim - p->inPos; - outCur = size; + inCur = (SizeT)(p->inLim - p->inPos); + outCur = (SizeT)size; res = Lzma2Dec_DecodeToBuf(&p->dec, data, &outCur, p->inBuf + p->inPos, &inCur, finishMode, &status); @@ -1080,3 +1089,7 @@ SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp, return readRes; } } + +#undef PRF +#undef PRF_STR +#undef PRF_STR_INT_2 diff --git a/src/sdk/C/Lzma2DecMt.h b/src/sdk/C/Lzma2DecMt.h index 7791c31..93a5cd5 100644 --- a/src/sdk/C/Lzma2DecMt.h +++ b/src/sdk/C/Lzma2DecMt.h @@ -1,8 +1,8 @@ /* Lzma2DecMt.h -- LZMA2 Decoder Multi-thread -2018-02-17 : Igor Pavlov : Public domain */ +2023-04-13 : Igor Pavlov : Public domain */ -#ifndef __LZMA2_DEC_MT_H 
-#define __LZMA2_DEC_MT_H +#ifndef ZIP7_INC_LZMA2_DEC_MT_H +#define ZIP7_INC_LZMA2_DEC_MT_H #include "7zTypes.h" @@ -13,7 +13,7 @@ typedef struct size_t inBufSize_ST; size_t outStep_ST; - #ifndef _7ZIP_ST + #ifndef Z7_ST unsigned numThreads; size_t inBufSize_MT; size_t outBlockMax; @@ -38,7 +38,9 @@ void Lzma2DecMtProps_Init(CLzma2DecMtProps *p); SZ_ERROR_THREAD - error in multithreading functions (only for Mt version) */ -typedef void * CLzma2DecMtHandle; +typedef struct CLzma2DecMt CLzma2DecMt; +typedef CLzma2DecMt * CLzma2DecMtHandle; +// Z7_DECLARE_HANDLE(CLzma2DecMtHandle) CLzma2DecMtHandle Lzma2DecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid); void Lzma2DecMt_Destroy(CLzma2DecMtHandle p); @@ -46,11 +48,11 @@ void Lzma2DecMt_Destroy(CLzma2DecMtHandle p); SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p, Byte prop, const CLzma2DecMtProps *props, - ISeqOutStream *outStream, + ISeqOutStreamPtr outStream, const UInt64 *outDataSize, // NULL means undefined int finishMode, // 0 - partial unpacking is allowed, 1 - if lzma2 stream must be finished // Byte *outBuf, size_t *outBufSize, - ISeqInStream *inStream, + ISeqInStreamPtr inStream, // const Byte *inData, size_t inDataSize, // out variables: @@ -58,7 +60,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p, int *isMT, /* out: (*isMT == 0), if single thread decoding was used */ // UInt64 *outProcessed, - ICompressProgress *progress); + ICompressProgressPtr progress); /* ---------- Read from CLzma2DecMtHandle Interface ---------- */ @@ -67,7 +69,7 @@ SRes Lzma2DecMt_Init(CLzma2DecMtHandle pp, Byte prop, const CLzma2DecMtProps *props, const UInt64 *outDataSize, int finishMode, - ISeqInStream *inStream); + ISeqInStreamPtr inStream); SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp, Byte *data, size_t *outSize, diff --git a/src/sdk/C/Lzma2Enc.c b/src/sdk/C/Lzma2Enc.c index 5c1ad49..72aec69 100644 --- a/src/sdk/C/Lzma2Enc.c +++ b/src/sdk/C/Lzma2Enc.c @@ -1,18 +1,18 @@ /* Lzma2Enc.c -- LZMA2 Encoder -2018-07-04 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include <string.h> -/* #define _7ZIP_ST */ +/* #define Z7_ST */ #include "Lzma2Enc.h" -#ifndef _7ZIP_ST +#ifndef Z7_ST #include "MtCoder.h" #else -#define MTCODER__THREADS_MAX 1 +#define MTCODER_THREADS_MAX 1 #endif #define LZMA2_CONTROL_LZMA (1 << 7) @@ -40,7 +40,7 @@ typedef struct { ISeqInStream vt; - ISeqInStream *realStream; + ISeqInStreamPtr realStream; UInt64 limit; UInt64 processed; int finished; @@ -53,15 +53,15 @@ static void LimitedSeqInStream_Init(CLimitedSeqInStream *p) { p->finished = 0; } -static SRes LimitedSeqInStream_Read(const ISeqInStream *pp, void *data, size_t *size) +static SRes LimitedSeqInStream_Read(ISeqInStreamPtr pp, void *data, size_t *size) { - CLimitedSeqInStream *p = CONTAINER_FROM_VTBL(pp, CLimitedSeqInStream, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CLimitedSeqInStream) size_t size2 = *size; SRes res = SZ_OK; if (p->limit != (UInt64)(Int64)-1) { - UInt64 rem = p->limit - p->processed; + const UInt64 rem = p->limit - p->processed; if (size2 > rem) size2 = (size_t)rem; } @@ -95,8 +95,8 @@ static SRes Lzma2EncInt_InitStream(CLzma2EncInt *p, const CLzma2EncProps *props) { SizeT propsSize = LZMA_PROPS_SIZE; Byte propsEncoded[LZMA_PROPS_SIZE]; - RINOK(LzmaEnc_SetProps(p->enc, &props->lzmaProps)); - RINOK(LzmaEnc_WriteProperties(p->enc, propsEncoded, &propsSize)); + RINOK(LzmaEnc_SetProps(p->enc, &props->lzmaProps)) + RINOK(LzmaEnc_WriteProperties(p->enc, propsEncoded, &propsSize)) p->propsByte = propsEncoded[0]; p->propsAreSet = True; } @@
-111,23 +111,23 @@ static void Lzma2EncInt_InitBlock(CLzma2EncInt *p) } -SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle pp, ISeqInStream *inStream, UInt32 keepWindowSize, +SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle p, ISeqInStreamPtr inStream, UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig); -SRes LzmaEnc_MemPrepare(CLzmaEncHandle pp, const Byte *src, SizeT srcLen, +SRes LzmaEnc_MemPrepare(CLzmaEncHandle p, const Byte *src, SizeT srcLen, UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig); -SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, BoolInt reInit, +SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle p, BoolInt reInit, Byte *dest, size_t *destLen, UInt32 desiredPackSize, UInt32 *unpackSize); -const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle pp); -void LzmaEnc_Finish(CLzmaEncHandle pp); -void LzmaEnc_SaveState(CLzmaEncHandle pp); -void LzmaEnc_RestoreState(CLzmaEncHandle pp); +const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle p); +void LzmaEnc_Finish(CLzmaEncHandle p); +void LzmaEnc_SaveState(CLzmaEncHandle p); +void LzmaEnc_RestoreState(CLzmaEncHandle p); /* -UInt32 LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle pp); +UInt32 LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle p); */ static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf, - size_t *packSizeRes, ISeqOutStream *outStream) + size_t *packSizeRes, ISeqOutStreamPtr outStream) { size_t packSizeLimit = *packSizeRes; size_t packSize = packSizeLimit; @@ -167,7 +167,7 @@ static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf, while (unpackSize > 0) { - UInt32 u = (unpackSize < LZMA2_COPY_CHUNK_SIZE) ? unpackSize : LZMA2_COPY_CHUNK_SIZE; + const UInt32 u = (unpackSize < LZMA2_COPY_CHUNK_SIZE) ? unpackSize : LZMA2_COPY_CHUNK_SIZE; if (packSizeLimit - destPos < u + 3) return SZ_ERROR_OUTPUT_EOF; outBuf[destPos++] = (Byte)(p->srcPos == 0 ? LZMA2_CONTROL_COPY_RESET_DIC : LZMA2_CONTROL_COPY_NO_RESET); @@ -196,9 +196,9 @@ static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf, { size_t destPos = 0; - UInt32 u = unpackSize - 1; - UInt32 pm = (UInt32)(packSize - 1); - unsigned mode = (p->srcPos == 0) ? 3 : (p->needInitState ? (p->needInitProp ? 2 : 1) : 0); + const UInt32 u = unpackSize - 1; + const UInt32 pm = (UInt32)(packSize - 1); + const unsigned mode = (p->srcPos == 0) ? 3 : (p->needInitState ? (p->needInitProp ? 
2 : 1) : 0); PRF(printf(" ")); @@ -231,10 +231,11 @@ static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf, void Lzma2EncProps_Init(CLzma2EncProps *p) { LzmaEncProps_Init(&p->lzmaProps); - p->blockSize = LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO; + p->blockSize = LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO; p->numBlockThreads_Reduced = -1; p->numBlockThreads_Max = -1; p->numTotalThreads = -1; + p->numThreadGroups = 0; } void Lzma2EncProps_Normalize(CLzma2EncProps *p) @@ -251,8 +252,8 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p) t2 = p->numBlockThreads_Max; t3 = p->numTotalThreads; - if (t2 > MTCODER__THREADS_MAX) - t2 = MTCODER__THREADS_MAX; + if (t2 > MTCODER_THREADS_MAX) + t2 = MTCODER_THREADS_MAX; if (t3 <= 0) { @@ -268,8 +269,8 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p) t1 = 1; t2 = t3; } - if (t2 > MTCODER__THREADS_MAX) - t2 = MTCODER__THREADS_MAX; + if (t2 > MTCODER_THREADS_MAX) + t2 = MTCODER_THREADS_MAX; } else if (t1 <= 0) { @@ -286,8 +287,8 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p) fileSize = p->lzmaProps.reduceSize; - if ( p->blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID - && p->blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO + if ( p->blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID + && p->blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO && (p->blockSize < fileSize || fileSize == (UInt64)(Int64)-1)) p->lzmaProps.reduceSize = p->blockSize; @@ -297,19 +298,19 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p) t1 = p->lzmaProps.numThreads; - if (p->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID) + if (p->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID) { t2r = t2 = 1; t3 = t1; } - else if (p->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO && t2 <= 1) + else if (p->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO && t2 <= 1) { /* if there is no block multi-threading, we use SOLID block */ - p->blockSize = LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID; + p->blockSize = LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID; } else { - if (p->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO) + if (p->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO) { const UInt32 kMinSize = (UInt32)1 << 20; const UInt32 kMaxSize = (UInt32)1 << 28; @@ -330,7 +331,7 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p) numBlocks++; if (numBlocks < (unsigned)t2) { - t2r = (unsigned)numBlocks; + t2r = (int)numBlocks; if (t2r == 0) t2r = 1; t3 = t1 * t2r; @@ -344,7 +345,7 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p) } -static SRes Progress(ICompressProgress *p, UInt64 inSize, UInt64 outSize) +static SRes Progress(ICompressProgressPtr p, UInt64 inSize, UInt64 outSize) { return (p && ICompressProgress_Progress(p, inSize, outSize) != SZ_OK) ? 
SZ_ERROR_PROGRESS : SZ_OK; } @@ -352,7 +353,7 @@ static SRes Progress(ICompressProgress *p, UInt64 inSize, UInt64 outSize) /* ---------- Lzma2 ---------- */ -typedef struct +struct CLzma2Enc { Byte propEncoded; CLzma2EncProps props; @@ -363,23 +364,22 @@ typedef struct ISzAllocPtr alloc; ISzAllocPtr allocBig; - CLzma2EncInt coders[MTCODER__THREADS_MAX]; + CLzma2EncInt coders[MTCODER_THREADS_MAX]; - #ifndef _7ZIP_ST + #ifndef Z7_ST - ISeqOutStream *outStream; + ISeqOutStreamPtr outStream; Byte *outBuf; size_t outBuf_Rem; /* remainder in outBuf */ size_t outBufSize; /* size of allocated outBufs[i] */ - size_t outBufsDataSizes[MTCODER__BLOCKS_MAX]; + size_t outBufsDataSizes[MTCODER_BLOCKS_MAX]; BoolInt mtCoder_WasConstructed; CMtCoder mtCoder; - Byte *outBufs[MTCODER__BLOCKS_MAX]; + Byte *outBufs[MTCODER_BLOCKS_MAX]; #endif - -} CLzma2Enc; +}; @@ -396,30 +396,30 @@ CLzma2EncHandle Lzma2Enc_Create(ISzAllocPtr alloc, ISzAllocPtr allocBig) p->allocBig = allocBig; { unsigned i; - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) p->coders[i].enc = NULL; } - #ifndef _7ZIP_ST + #ifndef Z7_ST p->mtCoder_WasConstructed = False; { unsigned i; - for (i = 0; i < MTCODER__BLOCKS_MAX; i++) + for (i = 0; i < MTCODER_BLOCKS_MAX; i++) p->outBufs[i] = NULL; p->outBufSize = 0; } #endif - return p; + return (CLzma2EncHandle)p; } -#ifndef _7ZIP_ST +#ifndef Z7_ST static void Lzma2Enc_FreeOutBufs(CLzma2Enc *p) { unsigned i; - for (i = 0; i < MTCODER__BLOCKS_MAX; i++) + for (i = 0; i < MTCODER_BLOCKS_MAX; i++) if (p->outBufs[i]) { ISzAlloc_Free(p->alloc, p->outBufs[i]); @@ -430,12 +430,13 @@ static void Lzma2Enc_FreeOutBufs(CLzma2Enc *p) #endif +// #define GET_CLzma2Enc_p CLzma2Enc *p = (CLzma2Enc *)(void *)p; -void Lzma2Enc_Destroy(CLzma2EncHandle pp) +void Lzma2Enc_Destroy(CLzma2EncHandle p) { - CLzma2Enc *p = (CLzma2Enc *)pp; + // GET_CLzma2Enc_p unsigned i; - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) { CLzma2EncInt *t = &p->coders[i]; if (t->enc) @@ -446,7 +447,7 @@ void Lzma2Enc_Destroy(CLzma2EncHandle pp) } - #ifndef _7ZIP_ST + #ifndef Z7_ST if (p->mtCoder_WasConstructed) { MtCoder_Destruct(&p->mtCoder); @@ -458,13 +459,13 @@ void Lzma2Enc_Destroy(CLzma2EncHandle pp) ISzAlloc_Free(p->alloc, p->tempBufLzma); p->tempBufLzma = NULL; - ISzAlloc_Free(p->alloc, pp); + ISzAlloc_Free(p->alloc, p); } -SRes Lzma2Enc_SetProps(CLzma2EncHandle pp, const CLzma2EncProps *props) +SRes Lzma2Enc_SetProps(CLzma2EncHandle p, const CLzma2EncProps *props) { - CLzma2Enc *p = (CLzma2Enc *)pp; + // GET_CLzma2Enc_p CLzmaEncProps lzmaProps = props->lzmaProps; LzmaEncProps_Normalize(&lzmaProps); if (lzmaProps.lc + lzmaProps.lp > LZMA2_LCLP_MAX) @@ -475,16 +476,16 @@ SRes Lzma2Enc_SetProps(CLzma2EncHandle pp, const CLzma2EncProps *props) } -void Lzma2Enc_SetDataSize(CLzmaEncHandle pp, UInt64 expectedDataSiize) +void Lzma2Enc_SetDataSize(CLzma2EncHandle p, UInt64 expectedDataSiize) { - CLzma2Enc *p = (CLzma2Enc *)pp; + // GET_CLzma2Enc_p p->expectedDataSize = expectedDataSiize; } -Byte Lzma2Enc_WriteProperties(CLzma2EncHandle pp) +Byte Lzma2Enc_WriteProperties(CLzma2EncHandle p) { - CLzma2Enc *p = (CLzma2Enc *)pp; + // GET_CLzma2Enc_p unsigned i; UInt32 dicSize = LzmaEncProps_GetDictSize(&p->props.lzmaProps); for (i = 0; i < 40; i++) @@ -497,12 +498,12 @@ Byte Lzma2Enc_WriteProperties(CLzma2EncHandle pp) static SRes Lzma2Enc_EncodeMt1( CLzma2Enc *me, CLzma2EncInt *p, - ISeqOutStream *outStream, + ISeqOutStreamPtr outStream, Byte *outBuf, size_t *outBufSize, 
- ISeqInStream *inStream, + ISeqInStreamPtr inStream, const Byte *inData, size_t inDataSize, int finished, - ICompressProgress *progress) + ICompressProgressPtr progress) { UInt64 unpackTotal = 0; UInt64 packTotal = 0; @@ -540,12 +541,12 @@ static SRes Lzma2Enc_EncodeMt1( } } - RINOK(Lzma2EncInt_InitStream(p, &me->props)); + RINOK(Lzma2EncInt_InitStream(p, &me->props)) for (;;) { SRes res = SZ_OK; - size_t inSizeCur = 0; + SizeT inSizeCur = 0; Lzma2EncInt_InitBlock(p); @@ -559,7 +560,7 @@ static SRes Lzma2Enc_EncodeMt1( if (me->expectedDataSize != (UInt64)(Int64)-1 && me->expectedDataSize >= unpackTotal) expected = me->expectedDataSize - unpackTotal; - if (me->props.blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID + if (me->props.blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID && expected > me->props.blockSize) expected = (size_t)me->props.blockSize; @@ -569,14 +570,14 @@ static SRes Lzma2Enc_EncodeMt1( &limitedInStream.vt, LZMA2_KEEP_WINDOW_SIZE, me->alloc, - me->allocBig)); + me->allocBig)) } else { - inSizeCur = inDataSize - (size_t)unpackTotal; - if (me->props.blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID + inSizeCur = (SizeT)(inDataSize - (size_t)unpackTotal); + if (me->props.blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID && inSizeCur > me->props.blockSize) - inSizeCur = (size_t)me->props.blockSize; + inSizeCur = (SizeT)(size_t)me->props.blockSize; // LzmaEnc_SetDataSize(p->enc, inSizeCur); @@ -584,7 +585,7 @@ static SRes Lzma2Enc_EncodeMt1( inData + (size_t)unpackTotal, inSizeCur, LZMA2_KEEP_WINDOW_SIZE, me->alloc, - me->allocBig)); + me->allocBig)) } for (;;) @@ -621,7 +622,7 @@ static SRes Lzma2Enc_EncodeMt1( unpackTotal += p->srcPos; - RINOK(res); + RINOK(res) if (p->srcPos != (inStream ? limitedInStream.processed : inSizeCur)) return SZ_ERROR_FAIL; @@ -632,15 +633,15 @@ static SRes Lzma2Enc_EncodeMt1( { if (outBuf) { - size_t destPos = *outBufSize; + const size_t destPos = *outBufSize; if (destPos >= outLim) return SZ_ERROR_OUTPUT_EOF; - outBuf[destPos] = 0; + outBuf[destPos] = LZMA2_CONTROL_EOF; // 0 *outBufSize = destPos + 1; } else { - Byte b = 0; + const Byte b = LZMA2_CONTROL_EOF; // 0; if (ISeqOutStream_Write(outStream, &b, 1) != 1) return SZ_ERROR_WRITE; } @@ -652,12 +653,12 @@ static SRes Lzma2Enc_EncodeMt1( -#ifndef _7ZIP_ST +#ifndef Z7_ST -static SRes Lzma2Enc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBufIndex, +static SRes Lzma2Enc_MtCallback_Code(void *p, unsigned coderIndex, unsigned outBufIndex, const Byte *src, size_t srcSize, int finished) { - CLzma2Enc *me = (CLzma2Enc *)pp; + CLzma2Enc *me = (CLzma2Enc *)p; size_t destSize = me->outBufSize; SRes res; CMtProgressThunk progressThunk; @@ -692,9 +693,9 @@ static SRes Lzma2Enc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned out } -static SRes Lzma2Enc_MtCallback_Write(void *pp, unsigned outBufIndex) +static SRes Lzma2Enc_MtCallback_Write(void *p, unsigned outBufIndex) { - CLzma2Enc *me = (CLzma2Enc *)pp; + CLzma2Enc *me = (CLzma2Enc *)p; size_t size = me->outBufsDataSizes[outBufIndex]; const Byte *data = me->outBufs[outBufIndex]; @@ -713,14 +714,14 @@ static SRes Lzma2Enc_MtCallback_Write(void *pp, unsigned outBufIndex) -SRes Lzma2Enc_Encode2(CLzma2EncHandle pp, - ISeqOutStream *outStream, +SRes Lzma2Enc_Encode2(CLzma2EncHandle p, + ISeqOutStreamPtr outStream, Byte *outBuf, size_t *outBufSize, - ISeqInStream *inStream, + ISeqInStreamPtr inStream, const Byte *inData, size_t inDataSize, - ICompressProgress *progress) + ICompressProgressPtr progress) { - CLzma2Enc *p = (CLzma2Enc *)pp; + // 
GET_CLzma2Enc_p if (inStream && inData) return SZ_ERROR_PARAM; @@ -730,11 +731,11 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle pp, { unsigned i; - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) p->coders[i].propsAreSet = False; } - #ifndef _7ZIP_ST + #ifndef Z7_ST if (p->props.numBlockThreads_Reduced > 1) { @@ -772,7 +773,7 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle pp, return SZ_ERROR_PARAM; /* SZ_ERROR_MEM */ { - size_t destBlockSize = p->mtCoder.blockSize + (p->mtCoder.blockSize >> 10) + 16; + const size_t destBlockSize = p->mtCoder.blockSize + (p->mtCoder.blockSize >> 10) + 16; if (destBlockSize < p->mtCoder.blockSize) return SZ_ERROR_PARAM; if (p->outBufSize != destBlockSize) @@ -780,13 +781,14 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle pp, p->outBufSize = destBlockSize; } - p->mtCoder.numThreadsMax = p->props.numBlockThreads_Max; + p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max; + p->mtCoder.numThreadGroups = p->props.numThreadGroups; p->mtCoder.expectedDataSize = p->expectedDataSize; { - SRes res = MtCoder_Code(&p->mtCoder); + const SRes res = MtCoder_Code(&p->mtCoder); if (!outStream) - *outBufSize = p->outBuf - outBuf; + *outBufSize = (size_t)(p->outBuf - outBuf); return res; } } @@ -801,3 +803,5 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle pp, True, /* finished */ progress); } + +#undef PRF diff --git a/src/sdk/C/Lzma2Enc.h b/src/sdk/C/Lzma2Enc.h index 6a6110f..1e6b50c 100644 --- a/src/sdk/C/Lzma2Enc.h +++ b/src/sdk/C/Lzma2Enc.h @@ -1,15 +1,15 @@ /* Lzma2Enc.h -- LZMA2 Encoder -2017-07-27 : Igor Pavlov : Public domain */ +2023-04-13 : Igor Pavlov : Public domain */ -#ifndef __LZMA2_ENC_H -#define __LZMA2_ENC_H +#ifndef ZIP7_INC_LZMA2_ENC_H +#define ZIP7_INC_LZMA2_ENC_H #include "LzmaEnc.h" EXTERN_C_BEGIN -#define LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO 0 -#define LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID ((UInt64)(Int64)-1) +#define LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO 0 +#define LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID ((UInt64)(Int64)-1) typedef struct { @@ -18,6 +18,7 @@ typedef struct int numBlockThreads_Reduced; int numBlockThreads_Max; int numTotalThreads; + unsigned numThreadGroups; // 0 : no groups } CLzma2EncProps; void Lzma2EncProps_Init(CLzma2EncProps *p); @@ -36,7 +37,9 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p); SZ_ERROR_THREAD - error in multithreading functions (only for Mt version) */ -typedef void * CLzma2EncHandle; +typedef struct CLzma2Enc CLzma2Enc; +typedef CLzma2Enc * CLzma2EncHandle; +// Z7_DECLARE_HANDLE(CLzma2EncHandle) CLzma2EncHandle Lzma2Enc_Create(ISzAllocPtr alloc, ISzAllocPtr allocBig); void Lzma2Enc_Destroy(CLzma2EncHandle p); @@ -44,11 +47,11 @@ SRes Lzma2Enc_SetProps(CLzma2EncHandle p, const CLzma2EncProps *props); void Lzma2Enc_SetDataSize(CLzma2EncHandle p, UInt64 expectedDataSiize); Byte Lzma2Enc_WriteProperties(CLzma2EncHandle p); SRes Lzma2Enc_Encode2(CLzma2EncHandle p, - ISeqOutStream *outStream, + ISeqOutStreamPtr outStream, Byte *outBuf, size_t *outBufSize, - ISeqInStream *inStream, + ISeqInStreamPtr inStream, const Byte *inData, size_t inDataSize, - ICompressProgress *progress); + ICompressProgressPtr progress); EXTERN_C_END diff --git a/src/sdk/C/Lzma86.h b/src/sdk/C/Lzma86.h index bebed5c..e7707e2 100644 --- a/src/sdk/C/Lzma86.h +++ b/src/sdk/C/Lzma86.h @@ -1,8 +1,8 @@ /* Lzma86.h -- LZMA + x86 (BCJ) Filter -2013-01-18 : Igor Pavlov : Public domain */ +2023-03-03 : Igor Pavlov : Public domain */ -#ifndef __LZMA86_H -#define __LZMA86_H +#ifndef ZIP7_INC_LZMA86_H +#define ZIP7_INC_LZMA86_H #include 
"7zTypes.h" diff --git a/src/sdk/C/Lzma86Dec.c b/src/sdk/C/Lzma86Dec.c index 2103174..f094d4c 100644 --- a/src/sdk/C/Lzma86Dec.c +++ b/src/sdk/C/Lzma86Dec.c @@ -1,5 +1,5 @@ /* Lzma86Dec.c -- LZMA + x86 (BCJ) Filter Decoder -2016-05-16 : Igor Pavlov : Public domain */ +2023-03-03 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -46,9 +46,8 @@ SRes Lzma86_Decode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen) return res; if (useFilter == 1) { - UInt32 x86State; - x86_Convert_Init(x86State); - x86_Convert(dest, *destLen, 0, &x86State, 0); + UInt32 x86State = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL; + z7_BranchConvSt_X86_Dec(dest, *destLen, 0, &x86State); } return SZ_OK; } diff --git a/src/sdk/C/Lzma86Enc.c b/src/sdk/C/Lzma86Enc.c index 2617bab..0cdde1c 100644 --- a/src/sdk/C/Lzma86Enc.c +++ b/src/sdk/C/Lzma86Enc.c @@ -1,5 +1,5 @@ /* Lzma86Enc.c -- LZMA + x86 (BCJ) Filter Encoder -2018-07-04 : Igor Pavlov : Public domain */ +2023-03-03 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -11,8 +11,6 @@ #include "Bra.h" #include "LzmaEnc.h" -#define SZE_OUT_OVERFLOW SZE_DATA_ERROR - int Lzma86_Encode(Byte *dest, size_t *destLen, const Byte *src, size_t srcLen, int level, UInt32 dictSize, int filterMode) { @@ -48,9 +46,8 @@ int Lzma86_Encode(Byte *dest, size_t *destLen, const Byte *src, size_t srcLen, memcpy(filteredStream, src, srcLen); } { - UInt32 x86State; - x86_Convert_Init(x86State); - x86_Convert(filteredStream, srcLen, 0, &x86State, 1); + UInt32 x86State = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL; + z7_BranchConvSt_X86_Enc(filteredStream, srcLen, 0, &x86State); } } diff --git a/src/sdk/C/LzmaDec.c b/src/sdk/C/LzmaDec.c index ba3e1dd..69bb8bb 100644 --- a/src/sdk/C/LzmaDec.c +++ b/src/sdk/C/LzmaDec.c @@ -1,5 +1,5 @@ /* LzmaDec.c -- LZMA Decoder -2018-07-04 : Igor Pavlov : Public domain */ +2023-04-07 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -8,29 +8,31 @@ /* #include "CpuArch.h" */ #include "LzmaDec.h" -#define kNumTopBits 24 -#define kTopValue ((UInt32)1 << kNumTopBits) +// #define kNumTopBits 24 +#define kTopValue ((UInt32)1 << 24) #define kNumBitModelTotalBits 11 #define kBitModelTotal (1 << kNumBitModelTotalBits) -#define kNumMoveBits 5 #define RC_INIT_SIZE 5 +#ifndef Z7_LZMA_DEC_OPT + +#define kNumMoveBits 5 #define NORMALIZE if (range < kTopValue) { range <<= 8; code = (code << 8) | (*buf++); } #define IF_BIT_0(p) ttt = *(p); NORMALIZE; bound = (range >> kNumBitModelTotalBits) * (UInt32)ttt; if (code < bound) #define UPDATE_0(p) range = bound; *(p) = (CLzmaProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); #define UPDATE_1(p) range -= bound; code -= bound; *(p) = (CLzmaProb)(ttt - (ttt >> kNumMoveBits)); #define GET_BIT2(p, i, A0, A1) IF_BIT_0(p) \ - { UPDATE_0(p); i = (i + i); A0; } else \ - { UPDATE_1(p); i = (i + i) + 1; A1; } + { UPDATE_0(p) i = (i + i); A0; } else \ + { UPDATE_1(p) i = (i + i) + 1; A1; } #define TREE_GET_BIT(probs, i) { GET_BIT2(probs + i, i, ;, ;); } #define REV_BIT(p, i, A0, A1) IF_BIT_0(p + i) \ - { UPDATE_0(p + i); A0; } else \ - { UPDATE_1(p + i); A1; } + { UPDATE_0(p + i) A0; } else \ + { UPDATE_1(p + i) A1; } #define REV_BIT_VAR( p, i, m) REV_BIT(p, i, i += m; m += m, m += m; i += m; ) #define REV_BIT_CONST(p, i, m) REV_BIT(p, i, i += m; , i += m * 2; ) #define REV_BIT_LAST( p, i, m) REV_BIT(p, i, i -= m , ; ) @@ -38,19 +40,19 @@ #define TREE_DECODE(probs, limit, i) \ { i = 1; do { TREE_GET_BIT(probs, i); } while (i < limit); i -= limit; } -/* #define _LZMA_SIZE_OPT */ +/* #define Z7_LZMA_SIZE_OPT */ -#ifdef 
_LZMA_SIZE_OPT +#ifdef Z7_LZMA_SIZE_OPT #define TREE_6_DECODE(probs, i) TREE_DECODE(probs, (1 << 6), i) #else #define TREE_6_DECODE(probs, i) \ { i = 1; \ - TREE_GET_BIT(probs, i); \ - TREE_GET_BIT(probs, i); \ - TREE_GET_BIT(probs, i); \ - TREE_GET_BIT(probs, i); \ - TREE_GET_BIT(probs, i); \ - TREE_GET_BIT(probs, i); \ + TREE_GET_BIT(probs, i) \ + TREE_GET_BIT(probs, i) \ + TREE_GET_BIT(probs, i) \ + TREE_GET_BIT(probs, i) \ + TREE_GET_BIT(probs, i) \ + TREE_GET_BIT(probs, i) \ i -= 0x40; } #endif @@ -62,24 +64,25 @@ probLit = prob + (offs + bit + symbol); \ GET_BIT2(probLit, symbol, offs ^= bit; , ;) +#endif // Z7_LZMA_DEC_OPT -#define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_ERROR; range <<= 8; code = (code << 8) | (*buf++); } +#define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_INPUT_EOF; range <<= 8; code = (code << 8) | (*buf++); } -#define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK; bound = (range >> kNumBitModelTotalBits) * (UInt32)ttt; if (code < bound) +#define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK bound = (range >> kNumBitModelTotalBits) * (UInt32)ttt; if (code < bound) #define UPDATE_0_CHECK range = bound; #define UPDATE_1_CHECK range -= bound; code -= bound; #define GET_BIT2_CHECK(p, i, A0, A1) IF_BIT_0_CHECK(p) \ - { UPDATE_0_CHECK; i = (i + i); A0; } else \ - { UPDATE_1_CHECK; i = (i + i) + 1; A1; } + { UPDATE_0_CHECK i = (i + i); A0; } else \ + { UPDATE_1_CHECK i = (i + i) + 1; A1; } #define GET_BIT_CHECK(p, i) GET_BIT2_CHECK(p, i, ; , ;) #define TREE_DECODE_CHECK(probs, limit, i) \ { i = 1; do { GET_BIT_CHECK(probs + i, i) } while (i < limit); i -= limit; } #define REV_BIT_CHECK(p, i, m) IF_BIT_0_CHECK(p + i) \ - { UPDATE_0_CHECK; i += m; m += m; } else \ - { UPDATE_1_CHECK; m += m; i += m; } + { UPDATE_0_CHECK i += m; m += m; } else \ + { UPDATE_1_CHECK m += m; i += m; } #define kNumPosBitsMax 4 @@ -114,6 +117,9 @@ #define kMatchMinLen 2 #define kMatchSpecLenStart (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols) +#define kMatchSpecLen_Error_Data (1 << 9) +#define kMatchSpecLen_Error_Fail (kMatchSpecLen_Error_Data - 1) + /* External ASM code needs same CLzmaProb array layout. So don't change it. */ /* (probs_1664) is faster and better for code size at some platforms */ @@ -166,10 +172,12 @@ /* p->remainLen : shows status of LZMA decoder: - < kMatchSpecLenStart : normal remain - = kMatchSpecLenStart : finished - = kMatchSpecLenStart + 1 : need init range coder - = kMatchSpecLenStart + 2 : need init range coder and state + < kMatchSpecLenStart : the number of bytes to be copied with (p->rep0) offset + = kMatchSpecLenStart : the LZMA stream was finished with end mark + = kMatchSpecLenStart + 1 : need init range coder + = kMatchSpecLenStart + 2 : need init range coder and state + = kMatchSpecLen_Error_Fail : Internal Code Failure + = kMatchSpecLen_Error_Data + [0 ... 273] : LZMA Data Error */ /* ---------- LZMA_DECODE_REAL ---------- */ @@ -188,34 +196,42 @@ LZMA_DECODE_REAL() { LzmaDec_TryDummy() was called before to exclude LITERAL and MATCH-REP cases. So first symbol can be only MATCH-NON-REP. And if that MATCH-NON-REP symbol - is not END_OF_PAYALOAD_MARKER, then function returns error code. + is not END_OF_PAYALOAD_MARKER, then the function doesn't write any byte to dictionary, + the function returns SZ_OK, and the caller can use (p->remainLen) and (p->reps[0]) later. 
} Processing: - first LZMA symbol will be decoded in any case - All checks for limits are at the end of main loop, - It will decode new LZMA-symbols while (p->buf < bufLimit && dicPos < limit), + The first LZMA symbol will be decoded in any case. + All main checks for limits are at the end of main loop, + It decodes additional LZMA-symbols while (p->buf < bufLimit && dicPos < limit), RangeCoder is still without last normalization when (p->buf < bufLimit) is being checked. + But if (p->buf < bufLimit), the caller provided at least (LZMA_REQUIRED_INPUT_MAX + 1) bytes for + next iteration before limit (bufLimit + LZMA_REQUIRED_INPUT_MAX), + that is enough for worst case LZMA symbol with one additional RangeCoder normalization for one bit. + So that function never reads bufLimit [LZMA_REQUIRED_INPUT_MAX] byte. Out: RangeCoder is normalized Result: SZ_OK - OK - SZ_ERROR_DATA - Error - p->remainLen: - < kMatchSpecLenStart : normal remain - = kMatchSpecLenStart : finished + p->remainLen: + < kMatchSpecLenStart : the number of bytes to be copied with (p->reps[0]) offset + = kMatchSpecLenStart : the LZMA stream was finished with end mark + + SZ_ERROR_DATA - error, when the MATCH-Symbol refers out of dictionary + p->remainLen : undefined + p->reps[*] : undefined */ -#ifdef _LZMA_DEC_OPT +#ifdef Z7_LZMA_DEC_OPT -int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit); +int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit); #else static -int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit) +int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit) { CLzmaProb *probs = GET_PROBS; unsigned state = (unsigned)p->state; @@ -247,7 +263,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit IF_BIT_0(prob) { unsigned symbol; - UPDATE_0(prob); + UPDATE_0(prob) prob = probs + Literal; if (processedPos != 0 || checkDicSize != 0) prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc); @@ -257,7 +273,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit { state -= (state < 4) ? state : 3; symbol = 1; - #ifdef _LZMA_SIZE_OPT + #ifdef Z7_LZMA_SIZE_OPT do { NORMAL_LITER_DEC } while (symbol < 0x100); #else NORMAL_LITER_DEC @@ -276,7 +292,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit unsigned offs = 0x100; state -= (state < 10) ? 3 : 6; symbol = 1; - #ifdef _LZMA_SIZE_OPT + #ifdef Z7_LZMA_SIZE_OPT do { unsigned bit; @@ -305,60 +321,62 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit } { - UPDATE_1(prob); + UPDATE_1(prob) prob = probs + IsRep + state; IF_BIT_0(prob) { - UPDATE_0(prob); + UPDATE_0(prob) state += kNumStates; prob = probs + LenCoder; } else { - UPDATE_1(prob); - /* - // that case was checked before with kBadRepCode - if (checkDicSize == 0 && processedPos == 0) - return SZ_ERROR_DATA; - */ + UPDATE_1(prob) prob = probs + IsRepG0 + state; IF_BIT_0(prob) { - UPDATE_0(prob); + UPDATE_0(prob) prob = probs + IsRep0Long + COMBINED_PS_STATE; IF_BIT_0(prob) { - UPDATE_0(prob); + UPDATE_0(prob) + + // that case was checked before with kBadRepCode + // if (checkDicSize == 0 && processedPos == 0) { len = kMatchSpecLen_Error_Data + 1; break; } + // The caller doesn't allow (dicPos == limit) case here + // so we don't need the following check: + // if (dicPos == limit) { state = state < kNumLitStates ? 
9 : 11; len = 1; break; } + dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; dicPos++; processedPos++; state = state < kNumLitStates ? 9 : 11; continue; } - UPDATE_1(prob); + UPDATE_1(prob) } else { UInt32 distance; - UPDATE_1(prob); + UPDATE_1(prob) prob = probs + IsRepG1 + state; IF_BIT_0(prob) { - UPDATE_0(prob); + UPDATE_0(prob) distance = rep1; } else { - UPDATE_1(prob); + UPDATE_1(prob) prob = probs + IsRepG2 + state; IF_BIT_0(prob) { - UPDATE_0(prob); + UPDATE_0(prob) distance = rep2; } else { - UPDATE_1(prob); + UPDATE_1(prob) distance = rep3; rep3 = rep2; } @@ -371,37 +389,37 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit prob = probs + RepLenCoder; } - #ifdef _LZMA_SIZE_OPT + #ifdef Z7_LZMA_SIZE_OPT { unsigned lim, offset; CLzmaProb *probLen = prob + LenChoice; IF_BIT_0(probLen) { - UPDATE_0(probLen); + UPDATE_0(probLen) probLen = prob + LenLow + GET_LEN_STATE; offset = 0; lim = (1 << kLenNumLowBits); } else { - UPDATE_1(probLen); + UPDATE_1(probLen) probLen = prob + LenChoice2; IF_BIT_0(probLen) { - UPDATE_0(probLen); + UPDATE_0(probLen) probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits); offset = kLenNumLowSymbols; lim = (1 << kLenNumLowBits); } else { - UPDATE_1(probLen); + UPDATE_1(probLen) probLen = prob + LenHigh; offset = kLenNumLowSymbols * 2; lim = (1 << kLenNumHighBits); } } - TREE_DECODE(probLen, lim, len); + TREE_DECODE(probLen, lim, len) len += offset; } #else @@ -409,32 +427,32 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit CLzmaProb *probLen = prob + LenChoice; IF_BIT_0(probLen) { - UPDATE_0(probLen); + UPDATE_0(probLen) probLen = prob + LenLow + GET_LEN_STATE; len = 1; - TREE_GET_BIT(probLen, len); - TREE_GET_BIT(probLen, len); - TREE_GET_BIT(probLen, len); + TREE_GET_BIT(probLen, len) + TREE_GET_BIT(probLen, len) + TREE_GET_BIT(probLen, len) len -= 8; } else { - UPDATE_1(probLen); + UPDATE_1(probLen) probLen = prob + LenChoice2; IF_BIT_0(probLen) { - UPDATE_0(probLen); + UPDATE_0(probLen) probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits); len = 1; - TREE_GET_BIT(probLen, len); - TREE_GET_BIT(probLen, len); - TREE_GET_BIT(probLen, len); + TREE_GET_BIT(probLen, len) + TREE_GET_BIT(probLen, len) + TREE_GET_BIT(probLen, len) } else { - UPDATE_1(probLen); + UPDATE_1(probLen) probLen = prob + LenHigh; - TREE_DECODE(probLen, (1 << kLenNumHighBits), len); + TREE_DECODE(probLen, (1 << kLenNumHighBits), len) len += kLenNumLowSymbols * 2; } } @@ -446,7 +464,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit UInt32 distance; prob = probs + PosSlot + ((len < kNumLenToPosStates ? 
len : kNumLenToPosStates - 1) << kNumPosSlotBits); - TREE_6_DECODE(prob, distance); + TREE_6_DECODE(prob, distance) if (distance >= kStartPosModelIndex) { unsigned posSlot = (unsigned)distance; @@ -461,7 +479,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit distance++; do { - REV_BIT_VAR(prob, distance, m); + REV_BIT_VAR(prob, distance, m) } while (--numDirectBits); distance -= m; @@ -496,10 +514,10 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit distance <<= kNumAlignBits; { unsigned i = 1; - REV_BIT_CONST(prob, i, 1); - REV_BIT_CONST(prob, i, 2); - REV_BIT_CONST(prob, i, 4); - REV_BIT_LAST (prob, i, 8); + REV_BIT_CONST(prob, i, 1) + REV_BIT_CONST(prob, i, 2) + REV_BIT_CONST(prob, i, 4) + REV_BIT_LAST (prob, i, 8) distance |= i; } if (distance == (UInt32)0xFFFFFFFF) @@ -518,8 +536,10 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize)) { - p->dicPos = dicPos; - return SZ_ERROR_DATA; + len += kMatchSpecLen_Error_Data + kMatchMinLen; + // len = kMatchSpecLen_Error_Data; + // len += kMatchMinLen; + break; } } @@ -532,8 +552,13 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit if ((rem = limit - dicPos) == 0) { - p->dicPos = dicPos; - return SZ_ERROR_DATA; + /* + We stop decoding and return SZ_OK, and we can resume decoding later. + Any error conditions can be tested later in caller code. + For more strict mode we can stop decoding with error + // len += kMatchSpecLen_Error_Data; + */ + break; } curLen = ((rem < len) ? (unsigned)rem : len); @@ -567,12 +592,12 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit } while (dicPos < limit && buf < bufLimit); - NORMALIZE; + NORMALIZE p->buf = buf; p->range = range; p->code = code; - p->remainLen = (UInt32)len; + p->remainLen = (UInt32)len; // & (kMatchSpecLen_Error_Data - 1); // we can write real length for error matches too. 
p->dicPos = dicPos; p->processedPos = processedPos; p->reps[0] = rep0; @@ -580,40 +605,61 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit p->reps[2] = rep2; p->reps[3] = rep3; p->state = (UInt32)state; - + if (len >= kMatchSpecLen_Error_Data) + return SZ_ERROR_DATA; return SZ_OK; } #endif -static void MY_FAST_CALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit) + + +static void Z7_FASTCALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit) { - if (p->remainLen != 0 && p->remainLen < kMatchSpecLenStart) + unsigned len = (unsigned)p->remainLen; + if (len == 0 /* || len >= kMatchSpecLenStart */) + return; { - Byte *dic = p->dic; SizeT dicPos = p->dicPos; - SizeT dicBufSize = p->dicBufSize; - unsigned len = (unsigned)p->remainLen; - SizeT rep0 = p->reps[0]; /* we use SizeT to avoid the BUG of VC14 for AMD64 */ - SizeT rem = limit - dicPos; - if (rem < len) - len = (unsigned)(rem); + Byte *dic; + SizeT dicBufSize; + SizeT rep0; /* we use SizeT to avoid the BUG of VC14 for AMD64 */ + { + SizeT rem = limit - dicPos; + if (rem < len) + { + len = (unsigned)(rem); + if (len == 0) + return; + } + } if (p->checkDicSize == 0 && p->prop.dicSize - p->processedPos <= len) p->checkDicSize = p->prop.dicSize; p->processedPos += (UInt32)len; p->remainLen -= (UInt32)len; - while (len != 0) + dic = p->dic; + rep0 = p->reps[0]; + dicBufSize = p->dicBufSize; + do { - len--; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; dicPos++; } + while (--len); p->dicPos = dicPos; } } +/* +At staring of new stream we have one of the following symbols: + - Literal - is allowed + - Non-Rep-Match - is allowed only if it's end marker symbol + - Rep-Match - is not allowed +We use early check of (RangeCoder:Code) over kBadRepCode to simplify main decoding code +*/ + #define kRange0 0xFFFFFFFF #define kBound0 ((kRange0 >> kNumBitModelTotalBits) << (kNumBitModelTotalBits - 1)) #define kBadRepCode (kBound0 + (((kRange0 - kBound0) >> kNumBitModelTotalBits) << (kNumBitModelTotalBits - 1))) @@ -621,69 +667,77 @@ static void MY_FAST_CALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit) #error Stop_Compiling_Bad_LZMA_Check #endif -static int MY_FAST_CALL LzmaDec_DecodeReal2(CLzmaDec *p, SizeT limit, const Byte *bufLimit) -{ - do - { - SizeT limit2 = limit; - if (p->checkDicSize == 0) - { - UInt32 rem = p->prop.dicSize - p->processedPos; - if (limit - p->dicPos > rem) - limit2 = p->dicPos + rem; - if (p->processedPos == 0) - if (p->code >= kBadRepCode) - return SZ_ERROR_DATA; - } +/* +LzmaDec_DecodeReal2(): + It calls LZMA_DECODE_REAL() and it adjusts limit according (p->checkDicSize). 
- RINOK(LZMA_DECODE_REAL(p, limit2, bufLimit)); - +We correct (p->checkDicSize) after LZMA_DECODE_REAL() and in LzmaDec_WriteRem(), +and we support the following state of (p->checkDicSize): + if (total_processed < p->prop.dicSize) then + { + (total_processed == p->processedPos) + (p->checkDicSize == 0) + } + else + (p->checkDicSize == p->prop.dicSize) +*/ + +static int Z7_FASTCALL LzmaDec_DecodeReal2(CLzmaDec *p, SizeT limit, const Byte *bufLimit) +{ + if (p->checkDicSize == 0) + { + UInt32 rem = p->prop.dicSize - p->processedPos; + if (limit - p->dicPos > rem) + limit = p->dicPos + rem; + } + { + int res = LZMA_DECODE_REAL(p, limit, bufLimit); if (p->checkDicSize == 0 && p->processedPos >= p->prop.dicSize) p->checkDicSize = p->prop.dicSize; - - LzmaDec_WriteRem(p, limit); + return res; } - while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart); - - return 0; } + + typedef enum { - DUMMY_ERROR, /* unexpected end of input stream */ + DUMMY_INPUT_EOF, /* need more input data */ DUMMY_LIT, DUMMY_MATCH, DUMMY_REP } ELzmaDummy; -static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inSize) + +#define IS_DUMMY_END_MARKER_POSSIBLE(dummyRes) ((dummyRes) == DUMMY_MATCH) + +static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, const Byte **bufOut) { UInt32 range = p->range; UInt32 code = p->code; - const Byte *bufLimit = buf + inSize; + const Byte *bufLimit = *bufOut; const CLzmaProb *probs = GET_PROBS; unsigned state = (unsigned)p->state; ELzmaDummy res; + for (;;) { const CLzmaProb *prob; UInt32 bound; unsigned ttt; - unsigned posState = CALC_POS_STATE(p->processedPos, (1 << p->prop.pb) - 1); + unsigned posState = CALC_POS_STATE(p->processedPos, ((unsigned)1 << p->prop.pb) - 1); prob = probs + IsMatch + COMBINED_PS_STATE; IF_BIT_0_CHECK(prob) { UPDATE_0_CHECK - /* if (bufLimit - buf >= 7) return DUMMY_LIT; */ - prob = probs + Literal; if (p->checkDicSize != 0 || p->processedPos != 0) prob += ((UInt32)LZMA_LIT_SIZE * - ((((p->processedPos) & ((1 << (p->prop.lp)) - 1)) << p->prop.lc) + - (p->dic[(p->dicPos == 0 ? p->dicBufSize : p->dicPos) - 1] >> (8 - p->prop.lc)))); + ((((p->processedPos) & (((unsigned)1 << (p->prop.lp)) - 1)) << p->prop.lc) + + ((unsigned)p->dic[(p->dicPos == 0 ? 
p->dicBufSize : p->dicPos) - 1] >> (8 - p->prop.lc)))); if (state < kNumLitStates) { @@ -713,55 +767,54 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS else { unsigned len; - UPDATE_1_CHECK; + UPDATE_1_CHECK prob = probs + IsRep + state; IF_BIT_0_CHECK(prob) { - UPDATE_0_CHECK; + UPDATE_0_CHECK state = 0; prob = probs + LenCoder; res = DUMMY_MATCH; } else { - UPDATE_1_CHECK; + UPDATE_1_CHECK res = DUMMY_REP; prob = probs + IsRepG0 + state; IF_BIT_0_CHECK(prob) { - UPDATE_0_CHECK; + UPDATE_0_CHECK prob = probs + IsRep0Long + COMBINED_PS_STATE; IF_BIT_0_CHECK(prob) { - UPDATE_0_CHECK; - NORMALIZE_CHECK; - return DUMMY_REP; + UPDATE_0_CHECK + break; } else { - UPDATE_1_CHECK; + UPDATE_1_CHECK } } else { - UPDATE_1_CHECK; + UPDATE_1_CHECK prob = probs + IsRepG1 + state; IF_BIT_0_CHECK(prob) { - UPDATE_0_CHECK; + UPDATE_0_CHECK } else { - UPDATE_1_CHECK; + UPDATE_1_CHECK prob = probs + IsRepG2 + state; IF_BIT_0_CHECK(prob) { - UPDATE_0_CHECK; + UPDATE_0_CHECK } else { - UPDATE_1_CHECK; + UPDATE_1_CHECK } } } @@ -773,31 +826,31 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS const CLzmaProb *probLen = prob + LenChoice; IF_BIT_0_CHECK(probLen) { - UPDATE_0_CHECK; + UPDATE_0_CHECK probLen = prob + LenLow + GET_LEN_STATE; offset = 0; limit = 1 << kLenNumLowBits; } else { - UPDATE_1_CHECK; + UPDATE_1_CHECK probLen = prob + LenChoice2; IF_BIT_0_CHECK(probLen) { - UPDATE_0_CHECK; + UPDATE_0_CHECK probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits); offset = kLenNumLowSymbols; limit = 1 << kLenNumLowBits; } else { - UPDATE_1_CHECK; + UPDATE_1_CHECK probLen = prob + LenHigh; offset = kLenNumLowSymbols * 2; limit = 1 << kLenNumHighBits; } } - TREE_DECODE_CHECK(probLen, limit, len); + TREE_DECODE_CHECK(probLen, limit, len) len += offset; } @@ -807,13 +860,11 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS prob = probs + PosSlot + ((len < kNumLenToPosStates - 1 ? len : kNumLenToPosStates - 1) << kNumPosSlotBits); - TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); + TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot) if (posSlot >= kStartPosModelIndex) { unsigned numDirectBits = ((posSlot >> 1) - 1); - /* if (bufLimit - buf >= 8) return DUMMY_MATCH; */ - if (posSlot < kEndPosModelIndex) { prob = probs + SpecPos + ((2 | (posSlot & 1)) << numDirectBits); @@ -837,19 +888,22 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS unsigned m = 1; do { - REV_BIT_CHECK(prob, i, m); + REV_BIT_CHECK(prob, i, m) } while (--numDirectBits); } } } } + break; } - NORMALIZE_CHECK; + NORMALIZE_CHECK + + *bufOut = buf; return res; } - +void LzmaDec_InitDicAndState(CLzmaDec *p, BoolInt initDic, BoolInt initState); void LzmaDec_InitDicAndState(CLzmaDec *p, BoolInt initDic, BoolInt initState) { p->remainLen = kMatchSpecLenStart + 1; @@ -872,16 +926,41 @@ void LzmaDec_Init(CLzmaDec *p) } +/* +LZMA supports optional end_marker. +So the decoder can lookahead for one additional LZMA-Symbol to check end_marker. +That additional LZMA-Symbol can require up to LZMA_REQUIRED_INPUT_MAX bytes in input stream. 
+When the decoder reaches dicLimit, it looks (finishMode) parameter: + if (finishMode == LZMA_FINISH_ANY), the decoder doesn't lookahead + if (finishMode != LZMA_FINISH_ANY), the decoder lookahead, if end_marker is possible for current position + +When the decoder lookahead, and the lookahead symbol is not end_marker, we have two ways: + 1) Strict mode (default) : the decoder returns SZ_ERROR_DATA. + 2) The relaxed mode (alternative mode) : we could return SZ_OK, and the caller + must check (status) value. The caller can show the error, + if the end of stream is expected, and the (status) is noit + LZMA_STATUS_FINISHED_WITH_MARK or LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK. +*/ + + +#define RETURN_NOT_FINISHED_FOR_FINISH \ + *status = LZMA_STATUS_NOT_FINISHED; \ + return SZ_ERROR_DATA; // for strict mode + // return SZ_OK; // for relaxed mode + + SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status) { SizeT inSize = *srcLen; (*srcLen) = 0; - *status = LZMA_STATUS_NOT_SPECIFIED; if (p->remainLen > kMatchSpecLenStart) { + if (p->remainLen > kMatchSpecLenStart + 2) + return p->remainLen == kMatchSpecLen_Error_Fail ? SZ_ERROR_FAIL : SZ_ERROR_DATA; + for (; inSize > 0 && p->tempBufSize < RC_INIT_SIZE; (*srcLen)++, inSize--) p->tempBuf[p->tempBufSize++] = *src++; if (p->tempBufSize != 0 && p->tempBuf[0] != 0) @@ -896,6 +975,12 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr | ((UInt32)p->tempBuf[2] << 16) | ((UInt32)p->tempBuf[3] << 8) | ((UInt32)p->tempBuf[4]); + + if (p->checkDicSize == 0 + && p->processedPos == 0 + && p->code >= kBadRepCode) + return SZ_ERROR_DATA; + p->range = 0xFFFFFFFF; p->tempBufSize = 0; @@ -913,10 +998,21 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr p->remainLen = 0; } - LzmaDec_WriteRem(p, dicLimit); - - while (p->remainLen != kMatchSpecLenStart) + for (;;) { + if (p->remainLen == kMatchSpecLenStart) + { + if (p->code != 0) + return SZ_ERROR_DATA; + *status = LZMA_STATUS_FINISHED_WITH_MARK; + return SZ_OK; + } + + LzmaDec_WriteRem(p, dicLimit); + + { + // (p->remainLen == 0 || p->dicPos == dicLimit) + int checkEndMarkNow = 0; if (p->dicPos >= dicLimit) @@ -933,92 +1029,174 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr } if (p->remainLen != 0) { - *status = LZMA_STATUS_NOT_FINISHED; - return SZ_ERROR_DATA; + RETURN_NOT_FINISHED_FOR_FINISH } checkEndMarkNow = 1; } + // (p->remainLen == 0) + if (p->tempBufSize == 0) { - SizeT processed; const Byte *bufLimit; + int dummyProcessed = -1; + if (inSize < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) { - int dummyRes = LzmaDec_TryDummy(p, src, inSize); - if (dummyRes == DUMMY_ERROR) + const Byte *bufOut = src + inSize; + + ELzmaDummy dummyRes = LzmaDec_TryDummy(p, src, &bufOut); + + if (dummyRes == DUMMY_INPUT_EOF) { - memcpy(p->tempBuf, src, inSize); - p->tempBufSize = (unsigned)inSize; + size_t i; + if (inSize >= LZMA_REQUIRED_INPUT_MAX) + break; (*srcLen) += inSize; + p->tempBufSize = (unsigned)inSize; + for (i = 0; i < inSize; i++) + p->tempBuf[i] = src[i]; *status = LZMA_STATUS_NEEDS_MORE_INPUT; return SZ_OK; } - if (checkEndMarkNow && dummyRes != DUMMY_MATCH) + + dummyProcessed = (int)(bufOut - src); + if ((unsigned)dummyProcessed > LZMA_REQUIRED_INPUT_MAX) + break; + + if (checkEndMarkNow && !IS_DUMMY_END_MARKER_POSSIBLE(dummyRes)) { - *status = LZMA_STATUS_NOT_FINISHED; - return SZ_ERROR_DATA; + unsigned i; + (*srcLen) += 
(unsigned)dummyProcessed; + p->tempBufSize = (unsigned)dummyProcessed; + for (i = 0; i < (unsigned)dummyProcessed; i++) + p->tempBuf[i] = src[i]; + // p->remainLen = kMatchSpecLen_Error_Data; + RETURN_NOT_FINISHED_FOR_FINISH } + bufLimit = src; + // we will decode only one iteration } else bufLimit = src + inSize - LZMA_REQUIRED_INPUT_MAX; + p->buf = src; - if (LzmaDec_DecodeReal2(p, dicLimit, bufLimit) != 0) - return SZ_ERROR_DATA; - processed = (SizeT)(p->buf - src); - (*srcLen) += processed; - src += processed; - inSize -= processed; + + { + int res = LzmaDec_DecodeReal2(p, dicLimit, bufLimit); + + SizeT processed = (SizeT)(p->buf - src); + + if (dummyProcessed < 0) + { + if (processed > inSize) + break; + } + else if ((unsigned)dummyProcessed != processed) + break; + + src += processed; + inSize -= processed; + (*srcLen) += processed; + + if (res != SZ_OK) + { + p->remainLen = kMatchSpecLen_Error_Data; + return SZ_ERROR_DATA; + } + } + continue; } - else + { - unsigned rem = p->tempBufSize, lookAhead = 0; - while (rem < LZMA_REQUIRED_INPUT_MAX && lookAhead < inSize) - p->tempBuf[rem++] = src[lookAhead++]; - p->tempBufSize = rem; + // we have some data in (p->tempBuf) + // in strict mode: tempBufSize is not enough for one Symbol decoding. + // in relaxed mode: tempBufSize not larger than required for one Symbol decoding. + + unsigned rem = p->tempBufSize; + unsigned ahead = 0; + int dummyProcessed = -1; + + while (rem < LZMA_REQUIRED_INPUT_MAX && ahead < inSize) + p->tempBuf[rem++] = src[ahead++]; + + // ahead - the size of new data copied from (src) to (p->tempBuf) + // rem - the size of temp buffer including new data from (src) + if (rem < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) { - int dummyRes = LzmaDec_TryDummy(p, p->tempBuf, (SizeT)rem); - if (dummyRes == DUMMY_ERROR) + const Byte *bufOut = p->tempBuf + rem; + + ELzmaDummy dummyRes = LzmaDec_TryDummy(p, p->tempBuf, &bufOut); + + if (dummyRes == DUMMY_INPUT_EOF) { - (*srcLen) += (SizeT)lookAhead; + if (rem >= LZMA_REQUIRED_INPUT_MAX) + break; + p->tempBufSize = rem; + (*srcLen) += (SizeT)ahead; *status = LZMA_STATUS_NEEDS_MORE_INPUT; return SZ_OK; } - if (checkEndMarkNow && dummyRes != DUMMY_MATCH) + + dummyProcessed = (int)(bufOut - p->tempBuf); + + if ((unsigned)dummyProcessed < p->tempBufSize) + break; + + if (checkEndMarkNow && !IS_DUMMY_END_MARKER_POSSIBLE(dummyRes)) { - *status = LZMA_STATUS_NOT_FINISHED; - return SZ_ERROR_DATA; + (*srcLen) += (unsigned)dummyProcessed - p->tempBufSize; + p->tempBufSize = (unsigned)dummyProcessed; + // p->remainLen = kMatchSpecLen_Error_Data; + RETURN_NOT_FINISHED_FOR_FINISH } } + p->buf = p->tempBuf; - if (LzmaDec_DecodeReal2(p, dicLimit, p->buf) != 0) - return SZ_ERROR_DATA; { - unsigned kkk = (unsigned)(p->buf - p->tempBuf); - if (rem < kkk) - return SZ_ERROR_FAIL; /* some internal error */ - rem -= kkk; - if (lookAhead < rem) - return SZ_ERROR_FAIL; /* some internal error */ - lookAhead -= rem; + // we decode one symbol from (p->tempBuf) here, so the (bufLimit) is equal to (p->buf) + int res = LzmaDec_DecodeReal2(p, dicLimit, p->buf); + + SizeT processed = (SizeT)(p->buf - p->tempBuf); + rem = p->tempBufSize; + + if (dummyProcessed < 0) + { + if (processed > LZMA_REQUIRED_INPUT_MAX) + break; + if (processed < rem) + break; + } + else if ((unsigned)dummyProcessed != processed) + break; + + processed -= rem; + + src += processed; + inSize -= processed; + (*srcLen) += processed; + p->tempBufSize = 0; + + if (res != SZ_OK) + { + p->remainLen = kMatchSpecLen_Error_Data; + return 
SZ_ERROR_DATA; + } } - (*srcLen) += (SizeT)lookAhead; - src += lookAhead; - inSize -= (SizeT)lookAhead; - p->tempBufSize = 0; } + } } - - if (p->code != 0) - return SZ_ERROR_DATA; - *status = LZMA_STATUS_FINISHED_WITH_MARK; - return SZ_OK; + + /* Some unexpected error: internal error of code, memory corruption or hardware failure */ + p->remainLen = kMatchSpecLen_Error_Fail; + return SZ_ERROR_FAIL; } + SRes LzmaDec_DecodeToBuf(CLzmaDec *p, Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status) { SizeT outSize = *destLen; @@ -1121,8 +1299,8 @@ static SRes LzmaDec_AllocateProbs2(CLzmaDec *p, const CLzmaProps *propNew, ISzAl SRes LzmaDec_AllocateProbs(CLzmaDec *p, const Byte *props, unsigned propsSize, ISzAllocPtr alloc) { CLzmaProps propNew; - RINOK(LzmaProps_Decode(&propNew, props, propsSize)); - RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); + RINOK(LzmaProps_Decode(&propNew, props, propsSize)) + RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)) p->prop = propNew; return SZ_OK; } @@ -1131,14 +1309,14 @@ SRes LzmaDec_Allocate(CLzmaDec *p, const Byte *props, unsigned propsSize, ISzAll { CLzmaProps propNew; SizeT dicBufSize; - RINOK(LzmaProps_Decode(&propNew, props, propsSize)); - RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); + RINOK(LzmaProps_Decode(&propNew, props, propsSize)) + RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)) { UInt32 dictSize = propNew.dicSize; SizeT mask = ((UInt32)1 << 12) - 1; if (dictSize >= ((UInt32)1 << 30)) mask = ((UInt32)1 << 22) - 1; - else if (dictSize >= ((UInt32)1 << 22)) mask = ((UInt32)1 << 20) - 1;; + else if (dictSize >= ((UInt32)1 << 22)) mask = ((UInt32)1 << 20) - 1; dicBufSize = ((SizeT)dictSize + mask) & ~mask; if (dicBufSize < dictSize) dicBufSize = dictSize; @@ -1170,8 +1348,8 @@ SRes LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, *status = LZMA_STATUS_NOT_SPECIFIED; if (inSize < RC_INIT_SIZE) return SZ_ERROR_INPUT_EOF; - LzmaDec_Construct(&p); - RINOK(LzmaDec_AllocateProbs(&p, propData, propSize, alloc)); + LzmaDec_CONSTRUCT(&p) + RINOK(LzmaDec_AllocateProbs(&p, propData, propSize, alloc)) p.dic = dest; p.dicBufSize = outSize; LzmaDec_Init(&p); diff --git a/src/sdk/C/LzmaDec.h b/src/sdk/C/LzmaDec.h index 1f0927a..b0ce28f 100644 --- a/src/sdk/C/LzmaDec.h +++ b/src/sdk/C/LzmaDec.h @@ -1,19 +1,19 @@ /* LzmaDec.h -- LZMA Decoder -2018-04-21 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ -#ifndef __LZMA_DEC_H -#define __LZMA_DEC_H +#ifndef ZIP7_INC_LZMA_DEC_H +#define ZIP7_INC_LZMA_DEC_H #include "7zTypes.h" EXTERN_C_BEGIN -/* #define _LZMA_PROB32 */ -/* _LZMA_PROB32 can increase the speed on some CPUs, +/* #define Z7_LZMA_PROB32 */ +/* Z7_LZMA_PROB32 can increase the speed on some CPUs, but memory usage for CLzmaDec::probs will be doubled in that case */ typedef -#ifdef _LZMA_PROB32 +#ifdef Z7_LZMA_PROB32 UInt32 #else UInt16 @@ -25,7 +25,7 @@ typedef #define LZMA_PROPS_SIZE 5 -typedef struct _CLzmaProps +typedef struct { Byte lc; Byte lp; @@ -73,7 +73,8 @@ typedef struct Byte tempBuf[LZMA_REQUIRED_INPUT_MAX]; } CLzmaDec; -#define LzmaDec_Construct(p) { (p)->dic = NULL; (p)->probs = NULL; } +#define LzmaDec_CONSTRUCT(p) { (p)->dic = NULL; (p)->probs = NULL; } +#define LzmaDec_Construct(p) LzmaDec_CONSTRUCT(p) void LzmaDec_Init(CLzmaDec *p); @@ -181,6 +182,7 @@ void LzmaDec_Free(CLzmaDec *p, ISzAllocPtr alloc); LZMA_STATUS_NEEDS_MORE_INPUT LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK SZ_ERROR_DATA - Data error + SZ_ERROR_FAIL - Some 
unexpected error: internal error of code, memory corruption or hardware failure */ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, @@ -223,6 +225,7 @@ SRes LzmaDec_DecodeToBuf(CLzmaDec *p, Byte *dest, SizeT *destLen, SZ_ERROR_MEM - Memory allocation error SZ_ERROR_UNSUPPORTED - Unsupported properties SZ_ERROR_INPUT_EOF - It needs more bytes in input buffer (src). + SZ_ERROR_FAIL - Some unexpected error: internal error of code, memory corruption or hardware failure */ SRes LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, diff --git a/src/sdk/C/LzmaEnc.c b/src/sdk/C/LzmaEnc.c index 1159ca2..d6b9e6e 100644 --- a/src/sdk/C/LzmaEnc.c +++ b/src/sdk/C/LzmaEnc.c @@ -1,5 +1,5 @@ /* LzmaEnc.c -- LZMA Encoder -2019-01-10: Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #include "Precomp.h" @@ -12,22 +12,36 @@ #include <stdio.h> #endif +#include "CpuArch.h" #include "LzmaEnc.h" #include "LzFind.h" -#ifndef _7ZIP_ST +#ifndef Z7_ST #include "LzFindMt.h" #endif +/* the following LzmaEnc_* declarations is internal LZMA interface for LZMA2 encoder */ + +SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle p, ISeqInStreamPtr inStream, UInt32 keepWindowSize, + ISzAllocPtr alloc, ISzAllocPtr allocBig); +SRes LzmaEnc_MemPrepare(CLzmaEncHandle p, const Byte *src, SizeT srcLen, + UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig); +SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle p, BoolInt reInit, + Byte *dest, size_t *destLen, UInt32 desiredPackSize, UInt32 *unpackSize); +const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle p); +void LzmaEnc_Finish(CLzmaEncHandle p); +void LzmaEnc_SaveState(CLzmaEncHandle p); +void LzmaEnc_RestoreState(CLzmaEncHandle p); + #ifdef SHOW_STAT static unsigned g_STAT_OFFSET = 0; #endif -#define kLzmaMaxHistorySize ((UInt32)3 << 29) -/* #define kLzmaMaxHistorySize ((UInt32)7 << 29) */ +/* for good normalization speed we still reserve 256 MB before 4 GB range */ +#define kLzmaMaxHistorySize ((UInt32)15 << 28) -#define kNumTopBits 24 -#define kTopValue ((UInt32)1 << kNumTopBits) +// #define kNumTopBits 24 +#define kTopValue ((UInt32)1 << 24) #define kNumBitModelTotalBits 11 #define kBitModelTotal (1 << kNumBitModelTotalBits) @@ -36,7 +50,7 @@ static unsigned g_STAT_OFFSET = 0; #define kNumMoveReducingBits 4 #define kNumBitPriceShiftBits 4 -#define kBitPrice (1 << kNumBitPriceShiftBits) +// #define kBitPrice (1 << kNumBitPriceShiftBits) #define REP_LEN_COUNT 64 @@ -46,7 +60,11 @@ void LzmaEncProps_Init(CLzmaEncProps *p) { p->dictSize = p->mc = 0; p->reduceSize = (UInt64)(Int64)-1; p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1; + p->numHashOutBits = 0; p->writeEndMark = 0; + p->affinityGroup = -1; + p->affinity = 0; + p->affinityInGroup = 0; } void LzmaEncProps_Normalize(CLzmaEncProps *p) @@ -55,31 +73,36 @@ if (level < 0) level = 5; p->level = level; - if (p->dictSize == 0) p->dictSize = (level <= 5 ? (1 << (level * 2 + 14)) : (level <= 7 ? (1 << 25) : (1 << 26))); + if (p->dictSize == 0) + p->dictSize = (unsigned)level <= 4 ? + (UInt32)1 << (level * 2 + 16) : + (unsigned)level <= sizeof(size_t) / 2 + 4 ?
+ (UInt32)1 << (level + 20) : + (UInt32)1 << (sizeof(size_t) / 2 + 24); + if (p->dictSize > p->reduceSize) { - unsigned i; - UInt32 reduceSize = (UInt32)p->reduceSize; - for (i = 11; i <= 30; i++) - { - if (reduceSize <= ((UInt32)2 << i)) { p->dictSize = ((UInt32)2 << i); break; } - if (reduceSize <= ((UInt32)3 << i)) { p->dictSize = ((UInt32)3 << i); break; } - } + UInt32 v = (UInt32)p->reduceSize; + const UInt32 kReduceMin = ((UInt32)1 << 12); + if (v < kReduceMin) + v = kReduceMin; + if (p->dictSize > v) + p->dictSize = v; } if (p->lc < 0) p->lc = 3; if (p->lp < 0) p->lp = 0; if (p->pb < 0) p->pb = 2; - if (p->algo < 0) p->algo = (level < 5 ? 0 : 1); - if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); + if (p->algo < 0) p->algo = (unsigned)level < 5 ? 0 : 1; + if (p->fb < 0) p->fb = (unsigned)level < 7 ? 32 : 64; if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); - if (p->numHashBytes < 0) p->numHashBytes = 4; - if (p->mc == 0) p->mc = (16 + (p->fb >> 1)) >> (p->btMode ? 0 : 1); + if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5); + if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1); if (p->numThreads < 0) p->numThreads = - #ifndef _7ZIP_ST + #ifndef Z7_ST ((p->btMode && p->algo) ? 2 : 1); #else 1; @@ -93,30 +116,97 @@ UInt32 LzmaEncProps_GetDictSize(const CLzmaEncProps *props2) return props.dictSize; } -#if (_MSC_VER >= 1400) -/* BSR code is fast for some new CPUs */ -/* #define LZMA_LOG_BSR */ + +/* +x86/x64: + +BSR: + IF (SRC == 0) ZF = 1, DEST is undefined; + AMD : DEST is unchanged; + IF (SRC != 0) ZF = 0; DEST is index of top non-zero bit + BSR is slow in some processors + +LZCNT: + IF (SRC == 0) CF = 1, DEST is size_in_bits_of_register(src) (32 or 64) + IF (SRC != 0) CF = 0, DEST = num_lead_zero_bits + IF (DEST == 0) ZF = 1; + +LZCNT works only in new processors starting from Haswell. +if LZCNT is not supported by processor, then it's executed as BSR. +LZCNT can be faster than BSR, if supported. 
+*/ + +// #define LZMA_LOG_BSR + +#if defined(MY_CPU_ARM_OR_ARM64) /* || defined(MY_CPU_X86_OR_AMD64) */ + + #if (defined(__clang__) && (__clang_major__ >= 6)) \ + || (defined(__GNUC__) && (__GNUC__ >= 6)) + #define LZMA_LOG_BSR + #elif defined(_MSC_VER) && (_MSC_VER >= 1300) + // #if defined(MY_CPU_ARM_OR_ARM64) + #define LZMA_LOG_BSR + // #endif + #endif #endif +// #include + #ifdef LZMA_LOG_BSR -#define kDicLogSizeMaxCompress 32 +#if defined(__clang__) \ + || defined(__GNUC__) + +/* + C code: : (30 - __builtin_clz(x)) + gcc9/gcc10 for x64 /x86 : 30 - (bsr(x) xor 31) + clang10 for x64 : 31 + (bsr(x) xor -32) +*/ + + #define MY_clz(x) ((unsigned)__builtin_clz(x)) + // __lzcnt32 + // __builtin_ia32_lzcnt_u32 -#define BSR2_RET(pos, res) { unsigned long zz; _BitScanReverse(&zz, (pos)); res = (zz + zz) + ((pos >> (zz - 1)) & 1); } +#else // #if defined(_MSC_VER) -static unsigned GetPosSlot1(UInt32 pos) + #ifdef MY_CPU_ARM_OR_ARM64 + + #define MY_clz _CountLeadingZeros + + #else // if defined(MY_CPU_X86_OR_AMD64) + + // #define MY_clz __lzcnt // we can use lzcnt (unsupported by old CPU) + // _BitScanReverse code is not optimal for some MSVC compilers + #define BSR2_RET(pos, res) { unsigned long zz; _BitScanReverse(&zz, (pos)); zz--; \ + res = (zz + zz) + (pos >> zz); } + + #endif // MY_CPU_X86_OR_AMD64 + +#endif // _MSC_VER + + +#ifndef BSR2_RET + + #define BSR2_RET(pos, res) { unsigned zz = 30 - MY_clz(pos); \ + res = (zz + zz) + (pos >> zz); } + +#endif + + +unsigned GetPosSlot1(UInt32 pos); +unsigned GetPosSlot1(UInt32 pos) { unsigned res; - BSR2_RET(pos, res); + BSR2_RET(pos, res) return res; } -#define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } -#define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res); } +#define GetPosSlot2(pos, res) { BSR2_RET(pos, res) } +#define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res) } -#else -#define kNumLogBits (9 + sizeof(size_t) / 2) -/* #define kNumLogBits (11 + sizeof(size_t) / 8 * 3) */ +#else // ! 
LZMA_LOG_BSR + +#define kNumLogBits (11 + sizeof(size_t) / 8 * 3) #define kDicLogSizeMaxCompress ((kNumLogBits - 1) * 2 + 7) @@ -163,7 +253,7 @@ static void LzmaEnc_FastPosInit(Byte *g_FastPos) #define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } #define GetPosSlot(pos, res) { if (pos < kNumFullDistances) res = p->g_FastPos[pos & (kNumFullDistances - 1)]; else BSR2_RET(pos, res); } -#endif +#endif // LZMA_LOG_BSR #define LZMA_NUM_REPS 4 @@ -193,7 +283,7 @@ typedef struct #define kNumLenToPosStates 4 #define kNumPosSlotBits 6 -#define kDicLogSizeMin 0 +// #define kDicLogSizeMin 0 #define kDicLogSizeMax 32 #define kDistTableSizeMax (kDicLogSizeMax * 2) @@ -206,7 +296,7 @@ typedef struct #define kNumFullDistances (1 << (kEndPosModelIndex >> 1)) typedef -#ifdef _LZMA_PROB32 +#ifdef Z7_LZMA_PROB32 UInt32 #else UInt16 @@ -263,7 +353,7 @@ typedef struct Byte *buf; Byte *bufLim; Byte *bufBase; - ISeqOutStream *outStream; + ISeqOutStreamPtr outStream; UInt64 processed; SRes res; } CRangeEnc; @@ -296,10 +386,10 @@ typedef struct typedef UInt32 CProbPrice; -typedef struct +struct CLzmaEnc { void *matchFinderObj; - IMatchFinder matchFinder; + IMatchFinder2 matchFinder; unsigned optCur; unsigned optEnd; @@ -339,24 +429,30 @@ typedef struct UInt32 dictSize; SRes result; - #ifndef _7ZIP_ST + #ifndef Z7_ST BoolInt mtMode; // begin of CMatchFinderMt is used in LZ thread CMatchFinderMt matchFinderMt; // end of CMatchFinderMt is used in BT and HASH threads + // #else + // CMatchFinder matchFinderBase; #endif - CMatchFinder matchFinderBase; - #ifndef _7ZIP_ST + + // we suppose that we have 8-bytes alignment after CMatchFinder + + #ifndef Z7_ST Byte pad[128]; #endif // LZ thread CProbPrice ProbPrices[kBitModelTotal >> kNumMoveReducingBits]; - UInt32 matches[LZMA_MATCH_LEN_MAX * 2 + 2 + 1]; + // we want {len , dist} pairs to be 8-bytes aligned in matches array + UInt32 matches[LZMA_MATCH_LEN_MAX * 2 + 2]; + // we want 8-bytes alignment here UInt32 alignPrices[kAlignTableSize]; UInt32 posSlotPrices[kNumLenToPosStates][kDistTableSizeMax]; UInt32 distancesPrices[kNumLenToPosStates][kNumFullDistances]; @@ -385,113 +481,115 @@ typedef struct CSaveState saveState; - #ifndef _7ZIP_ST + // BoolInt mf_Failure; + #ifndef Z7_ST Byte pad2[128]; #endif -} CLzmaEnc; - +}; -#define COPY_ARR(dest, src, arr) memcpy(dest->arr, src->arr, sizeof(src->arr)); +#define MFB (p->matchFinderBase) +/* +#ifndef Z7_ST +#define MFB (p->matchFinderMt.MatchFinder) +#endif +*/ -void LzmaEnc_SaveState(CLzmaEncHandle pp) -{ - CLzmaEnc *p = (CLzmaEnc *)pp; - CSaveState *dest = &p->saveState; - - dest->state = p->state; - - dest->lenProbs = p->lenProbs; - dest->repLenProbs = p->repLenProbs; - - COPY_ARR(dest, p, reps); - - COPY_ARR(dest, p, posAlignEncoder); - COPY_ARR(dest, p, isRep); - COPY_ARR(dest, p, isRepG0); - COPY_ARR(dest, p, isRepG1); - COPY_ARR(dest, p, isRepG2); - COPY_ARR(dest, p, isMatch); - COPY_ARR(dest, p, isRep0Long); - COPY_ARR(dest, p, posSlotEncoder); - COPY_ARR(dest, p, posEncoders); - - memcpy(dest->litProbs, p->litProbs, ((UInt32)0x300 << p->lclp) * sizeof(CLzmaProb)); +// #define GET_CLzmaEnc_p CLzmaEnc *p = (CLzmaEnc*)(void *)p; +// #define GET_const_CLzmaEnc_p const CLzmaEnc *p = (const CLzmaEnc*)(const void *)p; + +#define COPY_ARR(dest, src, arr) memcpy((dest)->arr, (src)->arr, sizeof((src)->arr)); + +#define COPY_LZMA_ENC_STATE(d, s, p) \ + (d)->state = (s)->state; \ + COPY_ARR(d, s, reps) \ + COPY_ARR(d, s, posAlignEncoder) \ + COPY_ARR(d, s, isRep) \ + COPY_ARR(d, s, isRepG0) \ + COPY_ARR(d, s, isRepG1) \ + 
COPY_ARR(d, s, isRepG2) \ + COPY_ARR(d, s, isMatch) \ + COPY_ARR(d, s, isRep0Long) \ + COPY_ARR(d, s, posSlotEncoder) \ + COPY_ARR(d, s, posEncoders) \ + (d)->lenProbs = (s)->lenProbs; \ + (d)->repLenProbs = (s)->repLenProbs; \ + memcpy((d)->litProbs, (s)->litProbs, ((size_t)0x300 * sizeof(CLzmaProb)) << (p)->lclp); + +void LzmaEnc_SaveState(CLzmaEncHandle p) +{ + // GET_CLzmaEnc_p + CSaveState *v = &p->saveState; + COPY_LZMA_ENC_STATE(v, p, p) } - -void LzmaEnc_RestoreState(CLzmaEncHandle pp) +void LzmaEnc_RestoreState(CLzmaEncHandle p) { - CLzmaEnc *dest = (CLzmaEnc *)pp; - const CSaveState *p = &dest->saveState; - - dest->state = p->state; - - dest->lenProbs = p->lenProbs; - dest->repLenProbs = p->repLenProbs; - - COPY_ARR(dest, p, reps); - - COPY_ARR(dest, p, posAlignEncoder); - COPY_ARR(dest, p, isRep); - COPY_ARR(dest, p, isRepG0); - COPY_ARR(dest, p, isRepG1); - COPY_ARR(dest, p, isRepG2); - COPY_ARR(dest, p, isMatch); - COPY_ARR(dest, p, isRep0Long); - COPY_ARR(dest, p, posSlotEncoder); - COPY_ARR(dest, p, posEncoders); - - memcpy(dest->litProbs, p->litProbs, ((UInt32)0x300 << dest->lclp) * sizeof(CLzmaProb)); + // GET_CLzmaEnc_p + const CSaveState *v = &p->saveState; + COPY_LZMA_ENC_STATE(p, v, p) } - -SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) +Z7_NO_INLINE +SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props2) { - CLzmaEnc *p = (CLzmaEnc *)pp; + // GET_CLzmaEnc_p CLzmaEncProps props = *props2; LzmaEncProps_Normalize(&props); if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX - || props.pb > LZMA_PB_MAX - || props.dictSize > ((UInt64)1 << kDicLogSizeMaxCompress) - || props.dictSize > kLzmaMaxHistorySize) + || props.pb > LZMA_PB_MAX) return SZ_ERROR_PARAM; + + if (props.dictSize > kLzmaMaxHistorySize) + props.dictSize = kLzmaMaxHistorySize; + + #ifndef LZMA_LOG_BSR + { + const UInt64 dict64 = props.dictSize; + if (dict64 > ((UInt64)1 << kDicLogSizeMaxCompress)) + return SZ_ERROR_PARAM; + } + #endif + p->dictSize = props.dictSize; { - unsigned fb = props.fb; + unsigned fb = (unsigned)props.fb; if (fb < 5) fb = 5; if (fb > LZMA_MATCH_LEN_MAX) fb = LZMA_MATCH_LEN_MAX; p->numFastBytes = fb; } - p->lc = props.lc; - p->lp = props.lp; - p->pb = props.pb; + p->lc = (unsigned)props.lc; + p->lp = (unsigned)props.lp; + p->pb = (unsigned)props.pb; p->fastMode = (props.algo == 0); // p->_maxMode = True; - p->matchFinderBase.btMode = (Byte)(props.btMode ? 1 : 0); + MFB.btMode = (Byte)(props.btMode ? 
1 : 0); + // MFB.btMode = (Byte)(props.btMode); { unsigned numHashBytes = 4; if (props.btMode) { - if (props.numHashBytes < 2) - numHashBytes = 2; - else if (props.numHashBytes < 4) - numHashBytes = props.numHashBytes; + if (props.numHashBytes < 2) numHashBytes = 2; + else if (props.numHashBytes < 4) numHashBytes = (unsigned)props.numHashBytes; } - p->matchFinderBase.numHashBytes = numHashBytes; + if (props.numHashBytes >= 5) numHashBytes = 5; + + MFB.numHashBytes = numHashBytes; + // MFB.numHashBytes_Min = 2; + MFB.numHashOutBits = (Byte)props.numHashOutBits; } - p->matchFinderBase.cutValue = props.mc; + MFB.cutValue = props.mc; - p->writeEndMark = props.writeEndMark; + p->writeEndMark = (BoolInt)props.writeEndMark; - #ifndef _7ZIP_ST + #ifndef Z7_ST /* if (newMultiThread != _multiThread) { @@ -500,16 +598,22 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) } */ p->multiThread = (props.numThreads > 1); + p->matchFinderMt.btSync.affinity = + p->matchFinderMt.hashSync.affinity = props.affinity; + p->matchFinderMt.btSync.affinityGroup = + p->matchFinderMt.hashSync.affinityGroup = props.affinityGroup; + p->matchFinderMt.btSync.affinityInGroup = + p->matchFinderMt.hashSync.affinityInGroup = props.affinityInGroup; #endif return SZ_OK; } -void LzmaEnc_SetDataSize(CLzmaEncHandle pp, UInt64 expectedDataSiize) +void LzmaEnc_SetDataSize(CLzmaEncHandle p, UInt64 expectedDataSiize) { - CLzmaEnc *p = (CLzmaEnc *)pp; - p->matchFinderBase.expectedDataSize = expectedDataSiize; + // GET_CLzmaEnc_p + MFB.expectedDataSize = expectedDataSiize; } @@ -536,8 +640,8 @@ static void RangeEnc_Construct(CRangeEnc *p) p->bufBase = NULL; } -#define RangeEnc_GetProcessed(p) ((p)->processed + ((p)->buf - (p)->bufBase) + (p)->cacheSize) -#define RangeEnc_GetProcessed_sizet(p) ((size_t)(p)->processed + ((p)->buf - (p)->bufBase) + (size_t)(p)->cacheSize) +#define RangeEnc_GetProcessed(p) ( (p)->processed + (size_t)((p)->buf - (p)->bufBase) + (p)->cacheSize) +#define RangeEnc_GetProcessed_sizet(p) ((size_t)(p)->processed + (size_t)((p)->buf - (p)->bufBase) + (size_t)(p)->cacheSize) #define RC_BUF_SIZE (1 << 16) @@ -556,12 +660,11 @@ static int RangeEnc_Alloc(CRangeEnc *p, ISzAllocPtr alloc) static void RangeEnc_Free(CRangeEnc *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->bufBase); - p->bufBase = 0; + p->bufBase = NULL; } static void RangeEnc_Init(CRangeEnc *p) { - /* Stream.Init(); */ p->range = 0xFFFFFFFF; p->cache = 0; p->low = 0; @@ -573,19 +676,19 @@ static void RangeEnc_Init(CRangeEnc *p) p->res = SZ_OK; } -MY_NO_INLINE static void RangeEnc_FlushStream(CRangeEnc *p) +Z7_NO_INLINE static void RangeEnc_FlushStream(CRangeEnc *p) { - size_t num; - if (p->res != SZ_OK) - return; - num = p->buf - p->bufBase; - if (num != ISeqOutStream_Write(p->outStream, p->bufBase, num)) - p->res = SZ_ERROR_WRITE; + const size_t num = (size_t)(p->buf - p->bufBase); + if (p->res == SZ_OK) + { + if (num != ISeqOutStream_Write(p->outStream, p->bufBase, num)) + p->res = SZ_ERROR_WRITE; + } p->processed += num; p->buf = p->bufBase; } -MY_NO_INLINE static void MY_FAST_CALL RangeEnc_ShiftLow(CRangeEnc *p) +Z7_NO_INLINE static void Z7_FASTCALL RangeEnc_ShiftLow(CRangeEnc *p) { UInt32 low = (UInt32)p->low; unsigned high = (unsigned)(p->low >> 32); @@ -630,9 +733,9 @@ static void RangeEnc_FlushData(CRangeEnc *p) ttt = *(prob); \ newBound = (range >> kNumBitModelTotalBits) * ttt; -// #define _LZMA_ENC_USE_BRANCH +// #define Z7_LZMA_ENC_USE_BRANCH -#ifdef _LZMA_ENC_USE_BRANCH +#ifdef Z7_LZMA_ENC_USE_BRANCH #define RC_BIT(p, 
prob, bit) { \ RC_BIT_PRE(p, prob) \ @@ -656,7 +759,7 @@ static void RangeEnc_FlushData(CRangeEnc *p) range += newBound & mask; \ mask &= (kBitModelTotal - ((1 << kNumMoveBits) - 1)); \ mask += ((1 << kNumMoveBits) - 1); \ - ttt += (Int32)(mask - ttt) >> kNumMoveBits; \ + ttt += (UInt32)((Int32)(mask - ttt) >> kNumMoveBits); \ *(prob) = (CLzmaProb)ttt; \ RC_NORM(p) \ } @@ -700,7 +803,7 @@ static void LitEnc_Encode(CRangeEnc *p, CLzmaProb *probs, UInt32 sym) CLzmaProb *prob = probs + (sym >> 8); UInt32 bit = (sym >> 7) & 1; sym <<= 1; - RC_BIT(p, prob, bit); + RC_BIT(p, prob, bit) } while (sym < 0x10000); p->range = range; @@ -722,7 +825,7 @@ static void LitEnc_EncodeMatched(CRangeEnc *p, CLzmaProb *probs, UInt32 sym, UIn bit = (sym >> 7) & 1; sym <<= 1; offs &= ~(matchByte ^ sym); - RC_BIT(p, prob, bit); + RC_BIT(p, prob, bit) } while (sym < 0x10000); p->range = range; @@ -749,17 +852,17 @@ static void LzmaEnc_InitPriceTables(CProbPrice *ProbPrices) bitCount++; } } - ProbPrices[i] = (CProbPrice)((kNumBitModelTotalBits << kCyclesBits) - 15 - bitCount); + ProbPrices[i] = (CProbPrice)(((unsigned)kNumBitModelTotalBits << kCyclesBits) - 15 - bitCount); // printf("\n%3d: %5d", i, ProbPrices[i]); } } #define GET_PRICE(prob, bit) \ - p->ProbPrices[((prob) ^ (unsigned)(((-(int)(bit))) & (kBitModelTotal - 1))) >> kNumMoveReducingBits]; + p->ProbPrices[((prob) ^ (unsigned)(((-(int)(bit))) & (kBitModelTotal - 1))) >> kNumMoveReducingBits] #define GET_PRICEa(prob, bit) \ - ProbPrices[((prob) ^ (unsigned)((-((int)(bit))) & (kBitModelTotal - 1))) >> kNumMoveReducingBits]; + ProbPrices[((prob) ^ (unsigned)((-((int)(bit))) & (kBitModelTotal - 1))) >> kNumMoveReducingBits] #define GET_PRICE_0(prob) p->ProbPrices[(prob) >> kNumMoveReducingBits] #define GET_PRICE_1(prob) p->ProbPrices[((prob) ^ (kBitModelTotal - 1)) >> kNumMoveReducingBits] @@ -810,7 +913,7 @@ static void RcTree_ReverseEncode(CRangeEnc *rc, CLzmaProb *probs, unsigned numBi unsigned bit = sym & 1; // RangeEnc_EncodeBit(rc, probs + m, bit); sym >>= 1; - RC_BIT(rc, probs + m, bit); + RC_BIT(rc, probs + m, bit) m = (m << 1) | bit; } while (--numBits); @@ -833,15 +936,15 @@ static void LenEnc_Encode(CLenEnc *p, CRangeEnc *rc, unsigned sym, unsigned posS UInt32 range, ttt, newBound; CLzmaProb *probs = p->low; range = rc->range; - RC_BIT_PRE(rc, probs); + RC_BIT_PRE(rc, probs) if (sym >= kLenNumLowSymbols) { - RC_BIT_1(rc, probs); + RC_BIT_1(rc, probs) probs += kLenNumLowSymbols; - RC_BIT_PRE(rc, probs); + RC_BIT_PRE(rc, probs) if (sym >= kLenNumLowSymbols * 2) { - RC_BIT_1(rc, probs); + RC_BIT_1(rc, probs) rc->range = range; // RcTree_Encode(rc, p->high, kLenNumHighBits, sym - kLenNumLowSymbols * 2); LitEnc_Encode(rc, p->high, sym - kLenNumLowSymbols * 2); @@ -854,11 +957,11 @@ static void LenEnc_Encode(CLenEnc *p, CRangeEnc *rc, unsigned sym, unsigned posS { unsigned m; unsigned bit; - RC_BIT_0(rc, probs); + RC_BIT_0(rc, probs) probs += (posState << (1 + kLenNumLowBits)); - bit = (sym >> 2) ; RC_BIT(rc, probs + 1, bit); m = (1 << 1) + bit; - bit = (sym >> 1) & 1; RC_BIT(rc, probs + m, bit); m = (m << 1) + bit; - bit = sym & 1; RC_BIT(rc, probs + m, bit); + bit = (sym >> 2) ; RC_BIT(rc, probs + 1, bit) m = (1 << 1) + bit; + bit = (sym >> 1) & 1; RC_BIT(rc, probs + m, bit) m = (m << 1) + bit; + bit = sym & 1; RC_BIT(rc, probs + m, bit) rc->range = range; } } @@ -879,7 +982,7 @@ static void SetPrices_3(const CLzmaProb *probs, UInt32 startPrice, UInt32 *price } -MY_NO_INLINE static void MY_FAST_CALL LenPriceEnc_UpdateTables( +Z7_NO_INLINE static void 
Z7_FASTCALL LenPriceEnc_UpdateTables( CLenPriceEnc *p, unsigned numPosStates, const CLenEnc *enc, @@ -943,14 +1046,14 @@ MY_NO_INLINE static void MY_FAST_CALL LenPriceEnc_UpdateTables( UInt32 price = b; do { - unsigned bit = sym & 1; + const unsigned bit = sym & 1; sym >>= 1; price += GET_PRICEa(probs[sym], bit); } while (sym >= 2); { - unsigned prob = probs[(size_t)i + (1 << (kLenNumHighBits - 1))]; + const unsigned prob = probs[(size_t)i + (1 << (kLenNumHighBits - 1))]; prices[(size_t)i * 2 ] = price + GET_PRICEa_0(prob); prices[(size_t)i * 2 + 1] = price + GET_PRICEa_1(prob); } @@ -959,7 +1062,7 @@ MY_NO_INLINE static void MY_FAST_CALL LenPriceEnc_UpdateTables( { unsigned posState; - size_t num = (p->tableSize - kLenNumLowSymbols * 2) * sizeof(p->prices[0][0]); + const size_t num = (p->tableSize - kLenNumLowSymbols * 2) * sizeof(p->prices[0][0]); for (posState = 1; posState < numPosStates; posState++) memcpy(p->prices[posState] + kLenNumLowSymbols * 2, p->prices[0] + kLenNumLowSymbols * 2, num); } @@ -985,7 +1088,11 @@ static unsigned ReadMatchDistances(CLzmaEnc *p, unsigned *numPairsRes) p->additionalOffset++; p->numAvail = p->matchFinder.GetNumAvailableBytes(p->matchFinderObj); - numPairs = p->matchFinder.GetMatches(p->matchFinderObj, p->matches); + { + const UInt32 *d = p->matchFinder.GetMatches(p->matchFinderObj, p->matches); + // if (!d) { p->mf_Failure = True; *numPairsRes = 0; return 0; } + numPairs = (unsigned)(d - p->matches); + } *numPairsRes = numPairs; #ifdef SHOW_STAT @@ -1001,7 +1108,7 @@ static unsigned ReadMatchDistances(CLzmaEnc *p, unsigned *numPairsRes) if (numPairs == 0) return 0; { - unsigned len = p->matches[(size_t)numPairs - 2]; + const unsigned len = p->matches[(size_t)numPairs - 2]; if (len != p->numFastBytes) return len; { @@ -1011,7 +1118,7 @@ static unsigned ReadMatchDistances(CLzmaEnc *p, unsigned *numPairsRes) { const Byte *p1 = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1; const Byte *p2 = p1 + len; - ptrdiff_t dif = (ptrdiff_t)-1 - p->matches[(size_t)numPairs - 1]; + const ptrdiff_t dif = (ptrdiff_t)-1 - (ptrdiff_t)p->matches[(size_t)numPairs - 1]; const Byte *lim = p1 + numAvail; for (; p2 != lim && *p2 == p2[dif]; p2++) {} @@ -1037,7 +1144,7 @@ static unsigned ReadMatchDistances(CLzmaEnc *p, unsigned *numPairsRes) + GET_PRICE_1(p->isRep[state]) \ + GET_PRICE_0(p->isRepG0[state]) -MY_FORCE_INLINE +Z7_FORCE_INLINE static UInt32 GetPrice_PureRep(const CLzmaEnc *p, unsigned repIndex, size_t state, size_t posState) { UInt32 price; @@ -1167,6 +1274,8 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) repLens[i] = len; if (len > repLens[repMaxIndex]) repMaxIndex = i; + if (len == LZMA_MATCH_LEN_MAX) // 21.03 : optimization + break; } if (repLens[repMaxIndex] >= p->numFastBytes) @@ -1179,10 +1288,12 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) } matches = p->matches; + #define MATCHES matches + // #define MATCHES p->matches if (mainLen >= p->numFastBytes) { - p->backRes = matches[(size_t)numPairs - 1] + LZMA_NUM_REPS; + p->backRes = MATCHES[(size_t)numPairs - 1] + LZMA_NUM_REPS; MOVE_POS(p, mainLen - 1) return mainLen; } @@ -1212,7 +1323,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) LitEnc_GetPrice(probs, curByte, p->ProbPrices)); } - MakeAs_Lit(&p->opt[1]); + MakeAs_Lit(&p->opt[1]) matchPrice = GET_PRICE_1(p->isMatch[p->state][posState]); repMatchPrice = matchPrice + GET_PRICE_1(p->isRep[p->state]); @@ -1224,7 +1335,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) if (shortRepPrice < 
p->opt[1].price) { p->opt[1].price = shortRepPrice; - MakeAs_ShortRep(&p->opt[1]); + MakeAs_ShortRep(&p->opt[1]) } if (last < 2) { @@ -1276,13 +1387,13 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) if (len < 2) len = 2; else - while (len > matches[offs]) + while (len > MATCHES[offs]) offs += 2; for (; ; len++) { COptimal *opt; - UInt32 dist = matches[(size_t)offs + 1]; + UInt32 dist = MATCHES[(size_t)offs + 1]; UInt32 price = normalMatchPrice + GET_PRICE_LEN(&p->lenEnc, posState, len); unsigned lenToPosState = GetLenToPosState(len); @@ -1291,7 +1402,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) else { unsigned slot; - GetPosSlot2(dist, slot); + GetPosSlot2(dist, slot) price += p->alignPrices[dist & kAlignMask]; price += p->posSlotPrices[lenToPosState][slot]; } @@ -1306,7 +1417,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) opt->extra = 0; } - if (len == matches[offs]) + if (len == MATCHES[offs]) { offs += 2; if (offs == numPairs) @@ -1367,7 +1478,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) unsigned delta = best - cur; if (delta != 0) { - MOVE_POS(p, delta); + MOVE_POS(p, delta) } } cur = best; @@ -1514,7 +1625,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) { nextOpt->price = litPrice; nextOpt->len = 1; - MakeAs_Lit(nextOpt); + MakeAs_Lit(nextOpt) nextIsLit = True; } } @@ -1548,7 +1659,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) { nextOpt->price = shortRepPrice; nextOpt->len = 1; - MakeAs_ShortRep(nextOpt); + MakeAs_ShortRep(nextOpt) nextIsLit = False; } } @@ -1727,8 +1838,8 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) if (newLen > numAvail) { newLen = numAvail; - for (numPairs = 0; newLen > matches[numPairs]; numPairs += 2); - matches[numPairs] = (UInt32)newLen; + for (numPairs = 0; newLen > MATCHES[numPairs]; numPairs += 2); + MATCHES[numPairs] = (UInt32)newLen; numPairs += 2; } @@ -1747,12 +1858,12 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) } offs = 0; - while (startLen > matches[offs]) + while (startLen > MATCHES[offs]) offs += 2; - dist = matches[(size_t)offs + 1]; + dist = MATCHES[(size_t)offs + 1]; // if (dist >= kNumFullDistances) - GetPosSlot2(dist, posSlot); + GetPosSlot2(dist, posSlot) for (len = /*2*/ startLen; ; len++) { @@ -1776,7 +1887,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) } } - if (len == matches[offs]) + if (len == MATCHES[offs]) { // if (p->_maxMode) { // MATCH : LIT : REP_0 @@ -1841,9 +1952,9 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) offs += 2; if (offs == numPairs) break; - dist = matches[(size_t)offs + 1]; + dist = MATCHES[(size_t)offs + 1]; // if (dist >= kNumFullDistances) - GetPosSlot2(dist, posSlot); + GetPosSlot2(dist, posSlot) } } } @@ -2019,7 +2130,7 @@ static void WriteEndMarker(CLzmaEnc *p, unsigned posState) { UInt32 ttt, newBound; RC_BIT_PRE(p, probs + m) - RC_BIT_1(&p->rc, probs + m); + RC_BIT_1(&p->rc, probs + m) m = (m << 1) + 1; } while (m < (1 << kNumPosSlotBits)); @@ -2044,7 +2155,7 @@ static void WriteEndMarker(CLzmaEnc *p, unsigned posState) { UInt32 ttt, newBound; RC_BIT_PRE(p, probs + m) - RC_BIT_1(&p->rc, probs + m); + RC_BIT_1(&p->rc, probs + m) m = (m << 1) + 1; } while (m < kAlignTableSize); @@ -2059,15 +2170,30 @@ static SRes CheckErrors(CLzmaEnc *p) return p->result; if (p->rc.res != SZ_OK) p->result = SZ_ERROR_WRITE; - if (p->matchFinderBase.result != SZ_OK) + + #ifndef Z7_ST + if ( + // p->mf_Failure || + (p->mtMode && + ( // p->matchFinderMt.failure_LZ_LZ || + 
p->matchFinderMt.failure_LZ_BT)) + ) + { + p->result = MY_HRES_ERROR_INTERNAL_ERROR; + // printf("\nCheckErrors p->matchFinderMt.failureLZ\n"); + } + #endif + + if (MFB.result != SZ_OK) p->result = SZ_ERROR_READ; + if (p->result != SZ_OK) p->finished = True; return p->result; } -MY_NO_INLINE static SRes Flush(CLzmaEnc *p, UInt32 nowPos) +Z7_NO_INLINE static SRes Flush(CLzmaEnc *p, UInt32 nowPos) { /* ReleaseMFStream(); */ p->finished = True; @@ -2079,7 +2205,7 @@ MY_NO_INLINE static SRes Flush(CLzmaEnc *p, UInt32 nowPos) } -MY_NO_INLINE static void FillAlignPrices(CLzmaEnc *p) +Z7_NO_INLINE static void FillAlignPrices(CLzmaEnc *p) { unsigned i; const CProbPrice *ProbPrices = p->ProbPrices; @@ -2103,7 +2229,7 @@ MY_NO_INLINE static void FillAlignPrices(CLzmaEnc *p) } -MY_NO_INLINE static void FillDistancesPrices(CLzmaEnc *p) +Z7_NO_INLINE static void FillDistancesPrices(CLzmaEnc *p) { // int y; for (y = 0; y < 100; y++) { @@ -2198,20 +2324,20 @@ MY_NO_INLINE static void FillDistancesPrices(CLzmaEnc *p) -void LzmaEnc_Construct(CLzmaEnc *p) +static void LzmaEnc_Construct(CLzmaEnc *p) { RangeEnc_Construct(&p->rc); - MatchFinder_Construct(&p->matchFinderBase); + MatchFinder_Construct(&MFB); - #ifndef _7ZIP_ST + #ifndef Z7_ST + p->matchFinderMt.MatchFinder = &MFB; MatchFinderMt_Construct(&p->matchFinderMt); - p->matchFinderMt.MatchFinder = &p->matchFinderBase; #endif { CLzmaEncProps props; LzmaEncProps_Init(&props); - LzmaEnc_SetProps(p, &props); + LzmaEnc_SetProps((CLzmaEncHandle)(void *)p, &props); } #ifndef LZMA_LOG_BSR @@ -2221,7 +2347,6 @@ void LzmaEnc_Construct(CLzmaEnc *p) LzmaEnc_InitPriceTables(p->ProbPrices); p->litProbs = NULL; p->saveState.litProbs = NULL; - } CLzmaEncHandle LzmaEnc_Create(ISzAllocPtr alloc) @@ -2233,7 +2358,7 @@ CLzmaEncHandle LzmaEnc_Create(ISzAllocPtr alloc) return p; } -void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAllocPtr alloc) +static void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->litProbs); ISzAlloc_Free(alloc, p->saveState.litProbs); @@ -2241,20 +2366,21 @@ void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAllocPtr alloc) p->saveState.litProbs = NULL; } -void LzmaEnc_Destruct(CLzmaEnc *p, ISzAllocPtr alloc, ISzAllocPtr allocBig) +static void LzmaEnc_Destruct(CLzmaEnc *p, ISzAllocPtr alloc, ISzAllocPtr allocBig) { - #ifndef _7ZIP_ST + #ifndef Z7_ST MatchFinderMt_Destruct(&p->matchFinderMt, allocBig); #endif - MatchFinder_Free(&p->matchFinderBase, allocBig); + MatchFinder_Free(&MFB, allocBig); LzmaEnc_FreeLits(p, alloc); RangeEnc_Free(&p->rc, alloc); } void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAllocPtr alloc, ISzAllocPtr allocBig) { - LzmaEnc_Destruct((CLzmaEnc *)p, alloc, allocBig); + // GET_CLzmaEnc_p + LzmaEnc_Destruct(p, alloc, allocBig); ISzAlloc_Free(alloc, p); } @@ -2265,13 +2391,19 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac UInt32 nowPos32, startPos32; if (p->needInit) { + #ifndef Z7_ST + if (p->mtMode) + { + RINOK(MatchFinderMt_InitMt(&p->matchFinderMt)) + } + #endif p->matchFinder.Init(p->matchFinderObj); p->needInit = 0; } if (p->finished) return p->result; - RINOK(CheckErrors(p)); + RINOK(CheckErrors(p)) nowPos32 = (UInt32)p->nowPos64; startPos32 = nowPos32; @@ -2334,7 +2466,7 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac const Byte *data; unsigned state; - RC_BIT_0(&p->rc, probs); + RC_BIT_0(&p->rc, probs) p->rc.range = range; data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - p->additionalOffset; probs = LIT_PROBS(nowPos32, *(data - 1)); @@ 
-2348,53 +2480,53 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac } else { - RC_BIT_1(&p->rc, probs); + RC_BIT_1(&p->rc, probs) probs = &p->isRep[p->state]; RC_BIT_PRE(&p->rc, probs) if (dist < LZMA_NUM_REPS) { - RC_BIT_1(&p->rc, probs); + RC_BIT_1(&p->rc, probs) probs = &p->isRepG0[p->state]; RC_BIT_PRE(&p->rc, probs) if (dist == 0) { - RC_BIT_0(&p->rc, probs); + RC_BIT_0(&p->rc, probs) probs = &p->isRep0Long[p->state][posState]; RC_BIT_PRE(&p->rc, probs) if (len != 1) { - RC_BIT_1_BASE(&p->rc, probs); + RC_BIT_1_BASE(&p->rc, probs) } else { - RC_BIT_0_BASE(&p->rc, probs); + RC_BIT_0_BASE(&p->rc, probs) p->state = kShortRepNextStates[p->state]; } } else { - RC_BIT_1(&p->rc, probs); + RC_BIT_1(&p->rc, probs) probs = &p->isRepG1[p->state]; RC_BIT_PRE(&p->rc, probs) if (dist == 1) { - RC_BIT_0_BASE(&p->rc, probs); + RC_BIT_0_BASE(&p->rc, probs) dist = p->reps[1]; } else { - RC_BIT_1(&p->rc, probs); + RC_BIT_1(&p->rc, probs) probs = &p->isRepG2[p->state]; RC_BIT_PRE(&p->rc, probs) if (dist == 2) { - RC_BIT_0_BASE(&p->rc, probs); + RC_BIT_0_BASE(&p->rc, probs) dist = p->reps[2]; } else { - RC_BIT_1_BASE(&p->rc, probs); + RC_BIT_1_BASE(&p->rc, probs) dist = p->reps[3]; p->reps[3] = p->reps[2]; } @@ -2418,7 +2550,7 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac else { unsigned posSlot; - RC_BIT_0(&p->rc, probs); + RC_BIT_0(&p->rc, probs) p->rc.range = range; p->state = kMatchNextStates[p->state]; @@ -2432,7 +2564,7 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac p->reps[0] = dist + 1; p->matchPriceCount++; - GetPosSlot(dist, posSlot); + GetPosSlot(dist, posSlot) // RcTree_Encode_PosSlot(&p->rc, p->posSlotEncoder[GetLenToPosState(len)], posSlot); { UInt32 sym = (UInt32)posSlot + (1 << kNumPosSlotBits); @@ -2443,7 +2575,7 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac CLzmaProb *prob = probs + (sym >> kNumPosSlotBits); UInt32 bit = (sym >> (kNumPosSlotBits - 1)) & 1; sym <<= 1; - RC_BIT(&p->rc, prob, bit); + RC_BIT(&p->rc, prob, bit) } while (sym < (1 << kNumPosSlotBits * 2)); p->rc.range = range; @@ -2487,10 +2619,10 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac { unsigned m = 1; unsigned bit; - bit = dist & 1; dist >>= 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit); m = (m << 1) + bit; - bit = dist & 1; dist >>= 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit); m = (m << 1) + bit; - bit = dist & 1; dist >>= 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit); m = (m << 1) + bit; - bit = dist & 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit); + bit = dist & 1; dist >>= 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit) m = (m << 1) + bit; + bit = dist & 1; dist >>= 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit) m = (m << 1) + bit; + bit = dist & 1; dist >>= 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit) m = (m << 1) + bit; + bit = dist & 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit) p->rc.range = range; // p->alignPriceCount++; } @@ -2522,12 +2654,12 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac // { int y; for (y = 0; y < 100; y++) { FillDistancesPrices(p); // }} - LenPriceEnc_UpdateTables(&p->lenEnc, 1 << p->pb, &p->lenProbs, p->ProbPrices); + LenPriceEnc_UpdateTables(&p->lenEnc, (unsigned)1 << p->pb, &p->lenProbs, p->ProbPrices); } if (p->repLenEncCounter <= 0) { p->repLenEncCounter = REP_LEN_COUNT; - LenPriceEnc_UpdateTables(&p->repLenEnc, 1 << p->pb, &p->repLenProbs, p->ProbPrices); + 
LenPriceEnc_UpdateTables(&p->repLenEnc, (unsigned)1 << p->pb, &p->repLenProbs, p->ProbPrices); } } @@ -2560,20 +2692,22 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac static SRes LzmaEnc_Alloc(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig) { UInt32 beforeSize = kNumOpts; + UInt32 dictSize; + if (!RangeEnc_Alloc(&p->rc, alloc)) return SZ_ERROR_MEM; - #ifndef _7ZIP_ST - p->mtMode = (p->multiThread && !p->fastMode && (p->matchFinderBase.btMode != 0)); + #ifndef Z7_ST + p->mtMode = (p->multiThread && !p->fastMode && (MFB.btMode != 0)); #endif { - unsigned lclp = p->lc + p->lp; + const unsigned lclp = p->lc + p->lp; if (!p->litProbs || !p->saveState.litProbs || p->lclp != lclp) { LzmaEnc_FreeLits(p, alloc); - p->litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((UInt32)0x300 << lclp) * sizeof(CLzmaProb)); - p->saveState.litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((UInt32)0x300 << lclp) * sizeof(CLzmaProb)); + p->litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((size_t)0x300 * sizeof(CLzmaProb)) << lclp); + p->saveState.litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((size_t)0x300 * sizeof(CLzmaProb)) << lclp); if (!p->litProbs || !p->saveState.litProbs) { LzmaEnc_FreeLits(p, alloc); @@ -2583,36 +2717,55 @@ static SRes LzmaEnc_Alloc(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc, } } - p->matchFinderBase.bigHash = (Byte)(p->dictSize > kBigHashDicLimit ? 1 : 0); + MFB.bigHash = (Byte)(p->dictSize > kBigHashDicLimit ? 1 : 0); - if (beforeSize + p->dictSize < keepWindowSize) - beforeSize = keepWindowSize - p->dictSize; - #ifndef _7ZIP_ST + dictSize = p->dictSize; + if (dictSize == ((UInt32)2 << 30) || + dictSize == ((UInt32)3 << 30)) + { + /* 21.03 : here we reduce the dictionary for 2 reasons: + 1) we don't want 32-bit back_distance matches in decoder for 2 GB dictionary. + 2) we want to elimate useless last MatchFinder_Normalize3() for corner cases, + where data size is aligned for 1 GB: 5/6/8 GB. + That reducing must be >= 1 for such corner cases. */ + dictSize -= 1; + } + + if (beforeSize + dictSize < keepWindowSize) + beforeSize = keepWindowSize - dictSize; + + /* in worst case we can look ahead for + max(LZMA_MATCH_LEN_MAX, numFastBytes + 1 + numFastBytes) bytes. + we send larger value for (keepAfter) to MantchFinder_Create(): + (numFastBytes + LZMA_MATCH_LEN_MAX + 1) + */ + + #ifndef Z7_ST if (p->mtMode) { - RINOK(MatchFinderMt_Create(&p->matchFinderMt, p->dictSize, beforeSize, p->numFastBytes, - LZMA_MATCH_LEN_MAX - + 1 /* 18.04 */ - , allocBig)); + RINOK(MatchFinderMt_Create(&p->matchFinderMt, dictSize, beforeSize, + p->numFastBytes, LZMA_MATCH_LEN_MAX + 1 /* 18.04 */ + , allocBig)) p->matchFinderObj = &p->matchFinderMt; - p->matchFinderBase.bigHash = (Byte)( - (p->dictSize > kBigHashDicLimit && p->matchFinderBase.hashMask >= 0xFFFFFF) ? 1 : 0); + MFB.bigHash = (Byte)(MFB.hashMask >= 0xFFFFFF ? 
1 : 0); MatchFinderMt_CreateVTable(&p->matchFinderMt, &p->matchFinder); } else #endif { - if (!MatchFinder_Create(&p->matchFinderBase, p->dictSize, beforeSize, p->numFastBytes, LZMA_MATCH_LEN_MAX, allocBig)) + if (!MatchFinder_Create(&MFB, dictSize, beforeSize, + p->numFastBytes, LZMA_MATCH_LEN_MAX + 1 /* 21.03 */ + , allocBig)) return SZ_ERROR_MEM; - p->matchFinderObj = &p->matchFinderBase; - MatchFinder_CreateVTable(&p->matchFinderBase, &p->matchFinder); + p->matchFinderObj = &MFB; + MatchFinder_CreateVTable(&MFB, &p->matchFinder); } return SZ_OK; } -void LzmaEnc_Init(CLzmaEnc *p) +static void LzmaEnc_Init(CLzmaEnc *p) { unsigned i; p->state = 0; @@ -2655,8 +2808,8 @@ void LzmaEnc_Init(CLzmaEnc *p) } { - UInt32 num = (UInt32)0x300 << (p->lp + p->lc); - UInt32 k; + const size_t num = (size_t)0x300 << (p->lp + p->lc); + size_t k; CLzmaProb *probs = p->litProbs; for (k = 0; k < num; k++) probs[k] = kProbInitValue; @@ -2676,12 +2829,14 @@ void LzmaEnc_Init(CLzmaEnc *p) p->additionalOffset = 0; - p->pbMask = (1 << p->pb) - 1; + p->pbMask = ((unsigned)1 << p->pb) - 1; p->lpMask = ((UInt32)0x100 << p->lp) - ((unsigned)0x100 >> p->lc); + + // p->mf_Failure = False; } -void LzmaEnc_InitPrices(CLzmaEnc *p) +static void LzmaEnc_InitPrices(CLzmaEnc *p) { if (!p->fastMode) { @@ -2695,8 +2850,8 @@ void LzmaEnc_InitPrices(CLzmaEnc *p) p->repLenEncCounter = REP_LEN_COUNT; - LenPriceEnc_UpdateTables(&p->lenEnc, 1 << p->pb, &p->lenProbs, p->ProbPrices); - LenPriceEnc_UpdateTables(&p->repLenEnc, 1 << p->pb, &p->repLenProbs, p->ProbPrices); + LenPriceEnc_UpdateTables(&p->lenEnc, (unsigned)1 << p->pb, &p->lenProbs, p->ProbPrices); + LenPriceEnc_UpdateTables(&p->repLenEnc, (unsigned)1 << p->pb, &p->repLenProbs, p->ProbPrices); } static SRes LzmaEnc_AllocAndInit(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig) @@ -2709,59 +2864,59 @@ static SRes LzmaEnc_AllocAndInit(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr p->finished = False; p->result = SZ_OK; - RINOK(LzmaEnc_Alloc(p, keepWindowSize, alloc, allocBig)); + p->nowPos64 = 0; + p->needInit = 1; + RINOK(LzmaEnc_Alloc(p, keepWindowSize, alloc, allocBig)) LzmaEnc_Init(p); LzmaEnc_InitPrices(p); - p->nowPos64 = 0; return SZ_OK; } -SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, +SRes LzmaEnc_Prepare(CLzmaEncHandle p, + ISeqOutStreamPtr outStream, + ISeqInStreamPtr inStream, ISzAllocPtr alloc, ISzAllocPtr allocBig) { - CLzmaEnc *p = (CLzmaEnc *)pp; - p->matchFinderBase.stream = inStream; - p->needInit = 1; + // GET_CLzmaEnc_p + MatchFinder_SET_STREAM(&MFB, inStream) p->rc.outStream = outStream; return LzmaEnc_AllocAndInit(p, 0, alloc, allocBig); } -SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle pp, - ISeqInStream *inStream, UInt32 keepWindowSize, - ISzAllocPtr alloc, ISzAllocPtr allocBig) +BoolInt LzmaEnc_IsFinished(CLzmaEncHandle pp) { CLzmaEnc *p = (CLzmaEnc *)pp; - p->matchFinderBase.stream = inStream; - p->needInit = 1; - return LzmaEnc_AllocAndInit(p, keepWindowSize, alloc, allocBig); + return p->finished; } -static void LzmaEnc_SetInputBuf(CLzmaEnc *p, const Byte *src, SizeT srcLen) +SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle p, + ISeqInStreamPtr inStream, UInt32 keepWindowSize, + ISzAllocPtr alloc, ISzAllocPtr allocBig) { - p->matchFinderBase.directInput = 1; - p->matchFinderBase.bufferBase = (Byte *)src; - p->matchFinderBase.directInputRem = srcLen; + // GET_CLzmaEnc_p + MatchFinder_SET_STREAM(&MFB, inStream) + return LzmaEnc_AllocAndInit(p, keepWindowSize, alloc, allocBig); } -SRes 
LzmaEnc_MemPrepare(CLzmaEncHandle pp, const Byte *src, SizeT srcLen, - UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig) +SRes LzmaEnc_MemPrepare(CLzmaEncHandle p, + const Byte *src, SizeT srcLen, + UInt32 keepWindowSize, + ISzAllocPtr alloc, ISzAllocPtr allocBig) { - CLzmaEnc *p = (CLzmaEnc *)pp; - LzmaEnc_SetInputBuf(p, src, srcLen); - p->needInit = 1; - - LzmaEnc_SetDataSize(pp, srcLen); + // GET_CLzmaEnc_p + MatchFinder_SET_DIRECT_INPUT_BUF(&MFB, src, srcLen) + LzmaEnc_SetDataSize(p, srcLen); return LzmaEnc_AllocAndInit(p, keepWindowSize, alloc, allocBig); } -void LzmaEnc_Finish(CLzmaEncHandle pp) +void LzmaEnc_Finish(CLzmaEncHandle p) { - #ifndef _7ZIP_ST - CLzmaEnc *p = (CLzmaEnc *)pp; + #ifndef Z7_ST + // GET_CLzmaEnc_p if (p->mtMode) MatchFinderMt_ReleaseStream(&p->matchFinderMt); #else - UNUSED_VAR(pp); + UNUSED_VAR(p) #endif } @@ -2770,43 +2925,48 @@ typedef struct { ISeqOutStream vt; Byte *data; - SizeT rem; + size_t rem; BoolInt overflow; } CLzmaEnc_SeqOutStreamBuf; -static size_t SeqOutStreamBuf_Write(const ISeqOutStream *pp, const void *data, size_t size) +static size_t SeqOutStreamBuf_Write(ISeqOutStreamPtr pp, const void *data, size_t size) { - CLzmaEnc_SeqOutStreamBuf *p = CONTAINER_FROM_VTBL(pp, CLzmaEnc_SeqOutStreamBuf, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CLzmaEnc_SeqOutStreamBuf) if (p->rem < size) { size = p->rem; p->overflow = True; } - memcpy(p->data, data, size); - p->rem -= size; - p->data += size; + if (size != 0) + { + memcpy(p->data, data, size); + p->rem -= size; + p->data += size; + } return size; } -UInt32 LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle pp) +/* +UInt32 LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle p) { - const CLzmaEnc *p = (CLzmaEnc *)pp; + GET_const_CLzmaEnc_p return p->matchFinder.GetNumAvailableBytes(p->matchFinderObj); } +*/ - -const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle pp) +const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle p) { - const CLzmaEnc *p = (CLzmaEnc *)pp; + // GET_const_CLzmaEnc_p return p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - p->additionalOffset; } -SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, BoolInt reInit, +// (desiredPackSize == 0) is not allowed +SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle p, BoolInt reInit, Byte *dest, size_t *destLen, UInt32 desiredPackSize, UInt32 *unpackSize) { - CLzmaEnc *p = (CLzmaEnc *)pp; + // GET_CLzmaEnc_p UInt64 nowPos64; SRes res; CLzmaEnc_SeqOutStreamBuf outStream; @@ -2823,14 +2983,10 @@ SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, BoolInt reInit, if (reInit) LzmaEnc_Init(p); LzmaEnc_InitPrices(p); - - nowPos64 = p->nowPos64; RangeEnc_Init(&p->rc); p->rc.outStream = &outStream.vt; - - if (desiredPackSize == 0) - return SZ_ERROR_OUTPUT_EOF; - + nowPos64 = p->nowPos64; + res = LzmaEnc_CodeOneBlock(p, desiredPackSize, *unpackSize); *unpackSize = (UInt32)(p->nowPos64 - nowPos64); @@ -2842,11 +2998,12 @@ SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, BoolInt reInit, } -static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgress *progress) +Z7_NO_INLINE +static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgressPtr progress) { SRes res = SZ_OK; - #ifndef _7ZIP_ST + #ifndef Z7_ST Byte allocaDummy[0x300]; allocaDummy[0] = 0; allocaDummy[1] = allocaDummy[0]; @@ -2868,10 +3025,10 @@ static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgress *progress) } } - LzmaEnc_Finish(p); + LzmaEnc_Finish((CLzmaEncHandle)(void *)p); /* - if (res == SZ_OK && !Inline_MatchFinder_IsFinishedOK(&p->matchFinderBase)) + if (res == SZ_OK && 
!Inline_MatchFinder_IsFinishedOK(&MFB)) res = SZ_ERROR_FAIL; } */ @@ -2880,53 +3037,63 @@ static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgress *progress) } -SRes LzmaEnc_Encode(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, ICompressProgress *progress, +SRes LzmaEnc_Encode(CLzmaEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig) { - RINOK(LzmaEnc_Prepare(pp, outStream, inStream, alloc, allocBig)); - return LzmaEnc_Encode2((CLzmaEnc *)pp, progress); + // GET_CLzmaEnc_p + RINOK(LzmaEnc_Prepare(p, outStream, inStream, alloc, allocBig)) + return LzmaEnc_Encode2(p, progress); } -SRes LzmaEnc_WriteProperties(CLzmaEncHandle pp, Byte *props, SizeT *size) +SRes LzmaEnc_WriteProperties(CLzmaEncHandle p, Byte *props, SizeT *size) { - CLzmaEnc *p = (CLzmaEnc *)pp; - unsigned i; - UInt32 dictSize = p->dictSize; if (*size < LZMA_PROPS_SIZE) return SZ_ERROR_PARAM; *size = LZMA_PROPS_SIZE; - props[0] = (Byte)((p->pb * 5 + p->lp) * 9 + p->lc); - - if (dictSize >= ((UInt32)1 << 22)) { - UInt32 kDictMask = ((UInt32)1 << 20) - 1; - if (dictSize < (UInt32)0xFFFFFFFF - kDictMask) - dictSize = (dictSize + kDictMask) & ~kDictMask; - } - else for (i = 11; i <= 30; i++) - { - if (dictSize <= ((UInt32)2 << i)) { dictSize = (2 << i); break; } - if (dictSize <= ((UInt32)3 << i)) { dictSize = (3 << i); break; } - } + // GET_CLzmaEnc_p + const UInt32 dictSize = p->dictSize; + UInt32 v; + props[0] = (Byte)((p->pb * 5 + p->lp) * 9 + p->lc); + + // we write aligned dictionary value to properties for lzma decoder + if (dictSize >= ((UInt32)1 << 21)) + { + const UInt32 kDictMask = ((UInt32)1 << 20) - 1; + v = (dictSize + kDictMask) & ~kDictMask; + if (v < dictSize) + v = dictSize; + } + else + { + unsigned i = 11 * 2; + do + { + v = (UInt32)(2 + (i & 1)) << (i >> 1); + i++; + } + while (v < dictSize); + } - for (i = 0; i < 4; i++) - props[1 + i] = (Byte)(dictSize >> (8 * i)); - return SZ_OK; + SetUi32(props + 1, v) + return SZ_OK; + } } -unsigned LzmaEnc_IsWriteEndMark(CLzmaEncHandle pp) +unsigned LzmaEnc_IsWriteEndMark(CLzmaEncHandle p) { - return ((CLzmaEnc *)pp)->writeEndMark; + // GET_CLzmaEnc_p + return (unsigned)p->writeEndMark; } -SRes LzmaEnc_MemEncode(CLzmaEncHandle pp, Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, - int writeEndMark, ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig) +SRes LzmaEnc_MemEncode(CLzmaEncHandle p, Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, + int writeEndMark, ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig) { SRes res; - CLzmaEnc *p = (CLzmaEnc *)pp; + // GET_CLzmaEnc_p CLzmaEnc_SeqOutStreamBuf outStream; @@ -2938,7 +3105,7 @@ SRes LzmaEnc_MemEncode(CLzmaEncHandle pp, Byte *dest, SizeT *destLen, const Byte p->writeEndMark = writeEndMark; p->rc.outStream = &outStream.vt; - res = LzmaEnc_MemPrepare(pp, src, srcLen, 0, alloc, allocBig); + res = LzmaEnc_MemPrepare(p, src, srcLen, 0, alloc, allocBig); if (res == SZ_OK) { @@ -2947,7 +3114,7 @@ SRes LzmaEnc_MemEncode(CLzmaEncHandle pp, Byte *dest, SizeT *destLen, const Byte res = SZ_ERROR_FAIL; } - *destLen -= outStream.rem; + *destLen -= (SizeT)outStream.rem; if (outStream.overflow) return SZ_ERROR_OUTPUT_EOF; return res; @@ -2956,9 +3123,9 @@ SRes LzmaEnc_MemEncode(CLzmaEncHandle pp, Byte *dest, SizeT *destLen, const Byte SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, const CLzmaEncProps *props, Byte *propsEncoded, SizeT *propsSize, 
int writeEndMark, - ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig) + ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig) { - CLzmaEnc *p = (CLzmaEnc *)LzmaEnc_Create(alloc); + CLzmaEncHandle p = LzmaEnc_Create(alloc); SRes res; if (!p) return SZ_ERROR_MEM; @@ -2976,8 +3143,14 @@ SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, return res; } -BoolInt LzmaEnc_IsFinished(CLzmaEncHandle pp) + +/* +#ifndef Z7_ST +void LzmaEnc_GetLzThreads(CLzmaEncHandle p, HANDLE lz_threads[2]) { - CLzmaEnc *p = (CLzmaEnc *)pp; - return p->finished; + GET_const_CLzmaEnc_p + lz_threads[0] = p->matchFinderMt.hashSync.thread; + lz_threads[1] = p->matchFinderMt.btSync.thread; } +#endif +*/ diff --git a/src/sdk/C/LzmaEnc.h b/src/sdk/C/LzmaEnc.h index 37a0906..2071f42 100644 --- a/src/sdk/C/LzmaEnc.h +++ b/src/sdk/C/LzmaEnc.h @@ -1,8 +1,8 @@ /* LzmaEnc.h -- LZMA Encoder -2017-07-27 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ -#ifndef __LZMA_ENC_H -#define __LZMA_ENC_H +#ifndef ZIP7_INC_LZMA_ENC_H +#define ZIP7_INC_LZMA_ENC_H #include "7zTypes.h" @@ -10,7 +10,7 @@ EXTERN_C_BEGIN #define LZMA_PROPS_SIZE 5 -typedef struct _CLzmaEncProps +typedef struct { int level; /* 0 <= level <= 9 */ UInt32 dictSize; /* (1 << 12) <= dictSize <= (1 << 27) for 32-bit version @@ -23,12 +23,19 @@ typedef struct _CLzmaEncProps int fb; /* 5 <= fb <= 273, default = 32 */ int btMode; /* 0 - hashChain Mode, 1 - binTree mode - normal, default = 1 */ int numHashBytes; /* 2, 3 or 4, default = 4 */ + unsigned numHashOutBits; /* default = ? */ UInt32 mc; /* 1 <= mc <= (1 << 30), default = 32 */ unsigned writeEndMark; /* 0 - do not write EOPM, 1 - write EOPM, default = 0 */ int numThreads; /* 1 or 2, default = 2 */ + // int _pad; + Int32 affinityGroup; + UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1. 
Encoder uses this value to reduce dictionary size */ + + UInt64 affinity; + UInt64 affinityInGroup; } CLzmaEncProps; void LzmaEncProps_Init(CLzmaEncProps *p); @@ -49,7 +56,9 @@ UInt32 LzmaEncProps_GetDictSize(const CLzmaEncProps *props2); SZ_ERROR_THREAD - error in multithreading functions (only for Mt version) */ -typedef void * CLzmaEncHandle; +typedef struct CLzmaEnc CLzmaEnc; +typedef CLzmaEnc * CLzmaEncHandle; +// Z7_DECLARE_HANDLE(CLzmaEncHandle) CLzmaEncHandle LzmaEnc_Create(ISzAllocPtr alloc); void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAllocPtr alloc, ISzAllocPtr allocBig); @@ -59,23 +68,23 @@ void LzmaEnc_SetDataSize(CLzmaEncHandle p, UInt64 expectedDataSiize); SRes LzmaEnc_WriteProperties(CLzmaEncHandle p, Byte *properties, SizeT *size); unsigned LzmaEnc_IsWriteEndMark(CLzmaEncHandle p); -SRes LzmaEnc_Encode(CLzmaEncHandle p, ISeqOutStream *outStream, ISeqInStream *inStream, - ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); +SRes LzmaEnc_Encode(CLzmaEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, + ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); SRes LzmaEnc_MemEncode(CLzmaEncHandle p, Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, - int writeEndMark, ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); + int writeEndMark, ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); /* ---------- One Call Interface ---------- */ SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, const CLzmaEncProps *props, Byte *propsEncoded, SizeT *propsSize, int writeEndMark, - ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); + ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); EXTERN_C_END /* ---------- Streaming Interface ---------- */ -SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, ISzAllocPtr alloc, ISzAllocPtr allocBig); +SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, ISzAllocPtr alloc, ISzAllocPtr allocBig); SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpackSize); BoolInt LzmaEnc_IsFinished(CLzmaEncHandle pp); void LzmaEnc_Finish(CLzmaEncHandle pp); diff --git a/src/sdk/C/LzmaLib.c b/src/sdk/C/LzmaLib.c index 706e9e5..785e884 100644 --- a/src/sdk/C/LzmaLib.c +++ b/src/sdk/C/LzmaLib.c @@ -1,12 +1,14 @@ /* LzmaLib.c -- LZMA library wrapper -2015-06-13 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ + +#include "Precomp.h" #include "Alloc.h" #include "LzmaDec.h" #include "LzmaEnc.h" #include "LzmaLib.h" -MY_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen, +Z7_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen, unsigned char *outProps, size_t *outPropsSize, int level, /* 0 <= level <= 9, default = 5 */ unsigned dictSize, /* use (1 << N) or (3 << N). 
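A rough driver-loop sketch for the streaming interface declared at the end of the LzmaEnc.h hunk above, assuming the caller supplies its own ISeqInStream/ISeqOutStream objects and uses the SDK's default allocators g_Alloc/g_BigAlloc from Alloc.h; writing the 5-byte props header via LzmaEnc_WriteProperties is left out:

#include "Alloc.h"
#include "LzmaEnc.h"

static SRes StreamEncodeSketch(ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream)
{
    CLzmaEncProps props;
    SRes res;
    CLzmaEncHandle enc = LzmaEnc_Create(&g_Alloc);
    if (!enc)
        return SZ_ERROR_MEM;
    LzmaEncProps_Init(&props);
    props.level = 5;                  /* keep defaults for everything else */
    res = LzmaEnc_SetProps(enc, &props);
    if (res == SZ_OK)
        res = LzmaEnc_Prepare(enc, outStream, inStream, &g_Alloc, &g_BigAlloc);
    /* (0, 0): no per-block pack/unpack size limit */
    while (res == SZ_OK && !LzmaEnc_IsFinished(enc))
        res = LzmaEnc_CodeOneBlock(enc, 0, 0);
    LzmaEnc_Finish(enc);
    LzmaEnc_Destroy(enc, &g_Alloc, &g_BigAlloc);
    return res;
}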
4 KB < dictSize <= 128 MB */ @@ -32,7 +34,7 @@ MY_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char } -MY_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t *srcLen, +Z7_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t *srcLen, const unsigned char *props, size_t propsSize) { ELzmaStatus status; diff --git a/src/sdk/C/LzmaLib.h b/src/sdk/C/LzmaLib.h index 88fa87d..d7c0724 100644 --- a/src/sdk/C/LzmaLib.h +++ b/src/sdk/C/LzmaLib.h @@ -1,14 +1,14 @@ /* LzmaLib.h -- LZMA library interface -2013-01-18 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ -#ifndef __LZMA_LIB_H -#define __LZMA_LIB_H +#ifndef ZIP7_INC_LZMA_LIB_H +#define ZIP7_INC_LZMA_LIB_H #include "7zTypes.h" EXTERN_C_BEGIN -#define MY_STDAPI int MY_STD_CALL +#define Z7_STDAPI int Z7_STDCALL #define LZMA_PROPS_SIZE 5 @@ -40,14 +40,16 @@ outPropsSize - level - compression level: 0 <= level <= 9; level dictSize algo fb - 0: 16 KB 0 32 - 1: 64 KB 0 32 - 2: 256 KB 0 32 - 3: 1 MB 0 32 - 4: 4 MB 0 32 + 0: 64 KB 0 32 + 1: 256 KB 0 32 + 2: 1 MB 0 32 + 3: 4 MB 0 32 + 4: 16 MB 0 32 5: 16 MB 1 32 6: 32 MB 1 32 - 7+: 64 MB 1 64 + 7: 32 MB 1 64 + 8: 64 MB 1 64 + 9: 64 MB 1 64 The default value for "level" is 5. @@ -83,6 +85,11 @@ fb - Word size (the number of fast bytes). numThreads - The number of thereads. 1 or 2. The default value is 2. Fast mode (algo = 0) can use only 1 thread. +In: + dest - output data buffer + destLen - output data buffer size + src - input data + srcLen - input data size Out: destLen - processed output size Returns: @@ -93,7 +100,7 @@ numThreads - The number of thereads. 1 or 2. The default value is 2. SZ_ERROR_THREAD - errors in multithreading functions (only for Mt version) */ -MY_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen, +Z7_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen, unsigned char *outProps, size_t *outPropsSize, /* *outPropsSize must be = 5 */ int level, /* 0 <= level <= 9, default = 5 */ unsigned dictSize, /* default = (1 << 24) */ @@ -108,8 +115,8 @@ MY_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char LzmaUncompress -------------- In: - dest - output data - destLen - output data size + dest - output data buffer + destLen - output data buffer size src - input data srcLen - input data size Out: @@ -123,7 +130,7 @@ LzmaUncompress SZ_ERROR_INPUT_EOF - it needs more bytes in input buffer (src) */ -MY_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, SizeT *srcLen, +Z7_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, SizeT *srcLen, const unsigned char *props, size_t propsSize); EXTERN_C_END diff --git a/src/sdk/C/MtCoder.c b/src/sdk/C/MtCoder.c index 9535985..923b19a 100644 --- a/src/sdk/C/MtCoder.c +++ b/src/sdk/C/MtCoder.c @@ -1,28 +1,28 @@ /* MtCoder.c -- Multi-thread Coder -2018-07-04 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include "MtCoder.h" -#ifndef _7ZIP_ST +#ifndef Z7_ST -SRes MtProgressThunk_Progress(const ICompressProgress *pp, UInt64 inSize, UInt64 outSize) +static SRes MtProgressThunk_Progress(ICompressProgressPtr pp, UInt64 inSize, UInt64 outSize) { - CMtProgressThunk *thunk = CONTAINER_FROM_VTBL(pp, CMtProgressThunk, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CMtProgressThunk) UInt64 inSize2 = 0; 
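A hedged round-trip sketch for the one-call LzmaCompress/LzmaUncompress wrappers documented in the LzmaLib.h hunk above; the output-buffer sizing and the parameter values (level 5, 16 MB dictionary, default lc/lp/pb/fb, 2 threads) are illustrative assumptions:

#include <stdlib.h>
#include <string.h>
#include "LzmaLib.h"

static int RoundTripSketch(const unsigned char *src, size_t srcLen)
{
    unsigned char props[LZMA_PROPS_SIZE];
    size_t propsSize = LZMA_PROPS_SIZE;
    size_t packedLen = srcLen + srcLen / 3 + 128;      /* rough upper bound */
    unsigned char *packed = (unsigned char *)malloc(packedLen);
    unsigned char *unpacked = (unsigned char *)malloc(srcLen);
    int res = SZ_ERROR_MEM;
    if (packed && unpacked)
    {
        /* level 5, 16 MB dictionary, default lc/lp/pb/fb, 2 threads */
        res = LzmaCompress(packed, &packedLen, src, srcLen,
                           props, &propsSize,
                           5, 1 << 24, 3, 0, 2, 32, 2);
        if (res == SZ_OK)
        {
            size_t unpackedLen = srcLen;
            SizeT srcRead = (SizeT)packedLen;
            res = LzmaUncompress(unpacked, &unpackedLen, packed, &srcRead,
                                 props, propsSize);
            if (res == SZ_OK && (unpackedLen != srcLen || memcmp(unpacked, src, srcLen) != 0))
                res = SZ_ERROR_FAIL;
        }
    }
    free(packed);
    free(unpacked);
    return res;
}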
UInt64 outSize2 = 0; if (inSize != (UInt64)(Int64)-1) { - inSize2 = inSize - thunk->inSize; - thunk->inSize = inSize; + inSize2 = inSize - p->inSize; + p->inSize = inSize; } if (outSize != (UInt64)(Int64)-1) { - outSize2 = outSize - thunk->outSize; - thunk->outSize = outSize; + outSize2 = outSize - p->outSize; + p->outSize = outSize; } - return MtProgress_ProgressAdd(thunk->mtProgress, inSize2, outSize2); + return MtProgress_ProgressAdd(p->mtProgress, inSize2, outSize2); } @@ -36,25 +36,31 @@ void MtProgressThunk_CreateVTable(CMtProgressThunk *p) #define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; } -static WRes ArEvent_OptCreate_And_Reset(CEvent *p) -{ - if (Event_IsCreated(p)) - return Event_Reset(p); - return AutoResetEvent_CreateNotSignaled(p); -} - +static THREAD_FUNC_DECL ThreadFunc(void *pp); -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp); - -static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) +static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t +#ifdef _WIN32 + , CMtCoder * const mtc +#endif + ) { - WRes wres = ArEvent_OptCreate_And_Reset(&t->startEvent); + WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent); + // printf("\n====== MtCoderThread_CreateAndStart : \n"); if (wres == 0) { t->stop = False; if (!Thread_WasCreated(&t->thread)) - wres = Thread_Create(&t->thread, ThreadFunc, t); + { +#ifdef _WIN32 + if (mtc->numThreadGroups) + wres = Thread_Create_With_Group(&t->thread, ThreadFunc, t, + ThreadNextGroup_GetNext(&mtc->nextGroup), // group + 0); // affinityMask + else +#endif + wres = Thread_Create(&t->thread, ThreadFunc, t); + } if (wres == 0) wres = Event_Set(&t->startEvent); } @@ -64,14 +70,14 @@ static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) } +Z7_FORCE_INLINE static void MtCoderThread_Destruct(CMtCoderThread *t) { if (Thread_WasCreated(&t->thread)) { t->stop = 1; Event_Set(&t->startEvent); - Thread_Wait(&t->thread); - Thread_Close(&t->thread); + Thread_Wait_Close(&t->thread); } Event_Close(&t->startEvent); @@ -85,24 +91,6 @@ static void MtCoderThread_Destruct(CMtCoderThread *t) -static SRes FullRead(ISeqInStream *stream, Byte *data, size_t *processedSize) -{ - size_t size = *processedSize; - *processedSize = 0; - while (size != 0) - { - size_t cur = size; - SRes res = ISeqInStream_Read(stream, data, &cur); - *processedSize += cur; - data += cur; - size -= cur; - RINOK(res); - if (cur == 0) - return SZ_OK; - } - return SZ_OK; -} - /* ThreadFunc2() returns: @@ -112,7 +100,7 @@ static SRes FullRead(ISeqInStream *stream, Byte *data, size_t *processedSize) static SRes ThreadFunc2(CMtCoderThread *t) { - CMtCoder *mtc = t->mtCoder; + CMtCoder * const mtc = t->mtCoder; for (;;) { @@ -153,7 +141,7 @@ static SRes ThreadFunc2(CMtCoderThread *t) } if (res == SZ_OK) { - res = FullRead(mtc->inStream, t->inBuf, &size); + res = SeqInStream_ReadMax(mtc->inStream, t->inBuf, &size); readProcessed = mtc->readProcessed + size; mtc->readProcessed = readProcessed; } @@ -212,7 +200,11 @@ static SRes ThreadFunc2(CMtCoderThread *t) if (mtc->numStartedThreads < mtc->numStartedThreadsLimit && mtc->expectedDataSize != readProcessed) { - res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]); + res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads] +#ifdef _WIN32 + , mtc +#endif + ); if (res == SZ_OK) mtc->numStartedThreads++; else @@ -248,13 +240,13 @@ static SRes ThreadFunc2(CMtCoderThread *t) } { - CMtCoderBlock *block = &mtc->blocks[bi]; + CMtCoderBlock * const block = &mtc->blocks[bi]; 
block->res = res; block->bufIndex = bufIndex; block->finished = finished; } - #ifdef MTCODER__USE_WRITE_THREAD + #ifdef MTCODER_USE_WRITE_THREAD RINOK_THREAD(Event_Set(&mtc->writeEvents[bi])) #else { @@ -336,29 +328,29 @@ static SRes ThreadFunc2(CMtCoderThread *t) } -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp) +static THREAD_FUNC_DECL ThreadFunc(void *pp) { - CMtCoderThread *t = (CMtCoderThread *)pp; + CMtCoderThread * const t = (CMtCoderThread *)pp; for (;;) { if (Event_Wait(&t->startEvent) != 0) - return SZ_ERROR_THREAD; + return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; if (t->stop) return 0; { - SRes res = ThreadFunc2(t); + const SRes res = ThreadFunc2(t); CMtCoder *mtc = t->mtCoder; if (res != SZ_OK) { MtProgress_SetError(&mtc->mtProgress, res); } - #ifndef MTCODER__USE_WRITE_THREAD + #ifndef MTCODER_USE_WRITE_THREAD { - unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); + const unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); if (numFinished == mtc->numStartedThreads) if (Event_Set(&mtc->finishedEvent) != 0) - return SZ_ERROR_THREAD; + return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; } #endif } @@ -373,6 +365,7 @@ void MtCoder_Construct(CMtCoder *p) p->blockSize = 0; p->numThreadsMax = 0; + p->numThreadGroups = 0; p->expectedDataSize = (UInt64)(Int64)-1; p->inStream = NULL; @@ -390,7 +383,7 @@ void MtCoder_Construct(CMtCoder *p) Event_Construct(&p->readEvent); Semaphore_Construct(&p->blocksSemaphore); - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) { CMtCoderThread *t = &p->threads[i]; t->mtCoder = p; @@ -398,11 +391,11 @@ void MtCoder_Construct(CMtCoder *p) t->inBuf = NULL; t->stop = False; Event_Construct(&t->startEvent); - Thread_Construct(&t->thread); + Thread_CONSTRUCT(&t->thread) } - #ifdef MTCODER__USE_WRITE_THREAD - for (i = 0; i < MTCODER__BLOCKS_MAX; i++) + #ifdef MTCODER_USE_WRITE_THREAD + for (i = 0; i < MTCODER_BLOCKS_MAX; i++) Event_Construct(&p->writeEvents[i]); #else Event_Construct(&p->finishedEvent); @@ -425,14 +418,14 @@ static void MtCoder_Free(CMtCoder *p) Event_Set(&p->readEvent); */ - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) MtCoderThread_Destruct(&p->threads[i]); Event_Close(&p->readEvent); Semaphore_Close(&p->blocksSemaphore); - #ifdef MTCODER__USE_WRITE_THREAD - for (i = 0; i < MTCODER__BLOCKS_MAX; i++) + #ifdef MTCODER_USE_WRITE_THREAD + for (i = 0; i < MTCODER_BLOCKS_MAX; i++) Event_Close(&p->writeEvents[i]); #else Event_Close(&p->finishedEvent); @@ -456,20 +449,22 @@ SRes MtCoder_Code(CMtCoder *p) unsigned i; SRes res = SZ_OK; - if (numThreads > MTCODER__THREADS_MAX) - numThreads = MTCODER__THREADS_MAX; - numBlocksMax = MTCODER__GET_NUM_BLOCKS_FROM_THREADS(numThreads); + // printf("\n====== MtCoder_Code : \n"); + + if (numThreads > MTCODER_THREADS_MAX) + numThreads = MTCODER_THREADS_MAX; + numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); if (p->blockSize < ((UInt32)1 << 26)) numBlocksMax++; if (p->blockSize < ((UInt32)1 << 24)) numBlocksMax++; if (p->blockSize < ((UInt32)1 << 22)) numBlocksMax++; - if (numBlocksMax > MTCODER__BLOCKS_MAX) - numBlocksMax = MTCODER__BLOCKS_MAX; + if (numBlocksMax > MTCODER_BLOCKS_MAX) + numBlocksMax = MTCODER_BLOCKS_MAX; if (p->blockSize != p->allocatedBufsSize) { - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) { CMtCoderThread *t = &p->threads[i]; if (t->inBuf) @@ -485,28 +480,23 @@ SRes 
MtCoder_Code(CMtCoder *p) MtProgress_Init(&p->mtProgress, p->progress); - #ifdef MTCODER__USE_WRITE_THREAD + #ifdef MTCODER_USE_WRITE_THREAD for (i = 0; i < numBlocksMax; i++) { - RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->writeEvents[i])); + RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->writeEvents[i])) } #else - RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->finishedEvent)); + RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->finishedEvent)) #endif { - RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->readEvent)); - - if (Semaphore_IsCreated(&p->blocksSemaphore)) - { - RINOK_THREAD(Semaphore_Close(&p->blocksSemaphore)); - } - RINOK_THREAD(Semaphore_Create(&p->blocksSemaphore, numBlocksMax, numBlocksMax)); + RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->readEvent)) + RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, (UInt32)numBlocksMax, (UInt32)numBlocksMax)) } - for (i = 0; i < MTCODER__BLOCKS_MAX - 1; i++) + for (i = 0; i < MTCODER_BLOCKS_MAX - 1; i++) p->freeBlockList[i] = i + 1; - p->freeBlockList[MTCODER__BLOCKS_MAX - 1] = (unsigned)(int)-1; + p->freeBlockList[MTCODER_BLOCKS_MAX - 1] = (unsigned)(int)-1; p->freeBlockHead = 0; p->readProcessed = 0; @@ -514,26 +504,37 @@ SRes MtCoder_Code(CMtCoder *p) p->numBlocksMax = numBlocksMax; p->stopReading = False; - #ifndef MTCODER__USE_WRITE_THREAD + #ifndef MTCODER_USE_WRITE_THREAD p->writeIndex = 0; p->writeRes = SZ_OK; - for (i = 0; i < MTCODER__BLOCKS_MAX; i++) + for (i = 0; i < MTCODER_BLOCKS_MAX; i++) p->ReadyBlocks[i] = False; p->numFinishedThreads = 0; #endif p->numStartedThreadsLimit = numThreads; p->numStartedThreads = 0; + ThreadNextGroup_Init(&p->nextGroup, p->numThreadGroups, 0); // startGroup // for (i = 0; i < numThreads; i++) { + // here we create new thread for first block. + // And each new thread will create another new thread after block reading + // until numStartedThreadsLimit is reached. 
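/*
 * Sketch (not part of the upstream patch) of the index-based free list that
 * MtCoder_Code() initializes above: freeBlockList[i] holds the index of the
 * next free block, (unsigned)-1 terminates the chain, and freeBlockHead
 * points at the first free entry.  The pop/push helpers are an assumption
 * about how such a list is typically consumed; the real consumers live in
 * hunks not shown here.
 */
#include <assert.h>

#define BLOCKS 8
#define NIL ((unsigned)(int)-1)

static unsigned freeList[BLOCKS];
static unsigned freeHead;

static void init_free_list(void)
{
  unsigned i;
  for (i = 0; i < BLOCKS - 1; i++)
    freeList[i] = i + 1;
  freeList[BLOCKS - 1] = NIL;
  freeHead = 0;
}

static unsigned pop_block(void)        /* take a free block index, or NIL */
{
  unsigned bi = freeHead;
  if (bi != NIL)
    freeHead = freeList[bi];
  return bi;
}

static void push_block(unsigned bi)    /* return a block to the list */
{
  freeList[bi] = freeHead;
  freeHead = bi;
}

int main(void)
{
  init_free_list();
  assert(pop_block() == 0);
  assert(pop_block() == 1);
  push_block(0);
  assert(pop_block() == 0);            /* LIFO reuse of released blocks */
  return 0;
}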
CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++]; - RINOK(MtCoderThread_CreateAndStart(nextThread)); + { + const SRes res2 = MtCoderThread_CreateAndStart(nextThread +#ifdef _WIN32 + , p +#endif + ); + RINOK(res2) + } } RINOK_THREAD(Event_Set(&p->readEvent)) - #ifdef MTCODER__USE_WRITE_THREAD + #ifdef MTCODER_USE_WRITE_THREAD { unsigned bi = 0; @@ -545,9 +546,9 @@ SRes MtCoder_Code(CMtCoder *p) RINOK_THREAD(Event_Wait(&p->writeEvents[bi])) { - const CMtCoderBlock *block = &p->blocks[bi]; - unsigned bufIndex = block->bufIndex; - BoolInt finished = block->finished; + const CMtCoderBlock * const block = &p->blocks[bi]; + const unsigned bufIndex = block->bufIndex; + const BoolInt finished = block->finished; if (res == SZ_OK && block->res != SZ_OK) res = block->res; @@ -577,7 +578,7 @@ SRes MtCoder_Code(CMtCoder *p) } #else { - WRes wres = Event_Wait(&p->finishedEvent); + const WRes wres = Event_Wait(&p->finishedEvent); res = MY_SRes_HRESULT_FROM_WRes(wres); } #endif @@ -588,7 +589,7 @@ SRes MtCoder_Code(CMtCoder *p) if (res == SZ_OK) res = p->mtProgress.res; - #ifndef MTCODER__USE_WRITE_THREAD + #ifndef MTCODER_USE_WRITE_THREAD if (res == SZ_OK) res = p->writeRes; #endif @@ -599,3 +600,5 @@ SRes MtCoder_Code(CMtCoder *p) } #endif + +#undef RINOK_THREAD diff --git a/src/sdk/C/MtCoder.h b/src/sdk/C/MtCoder.h index 5a5f4d1..8166cca 100644 --- a/src/sdk/C/MtCoder.h +++ b/src/sdk/C/MtCoder.h @@ -1,30 +1,30 @@ /* MtCoder.h -- Multi-thread Coder -2018-07-04 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ -#ifndef __MT_CODER_H -#define __MT_CODER_H +#ifndef ZIP7_INC_MT_CODER_H +#define ZIP7_INC_MT_CODER_H #include "MtDec.h" EXTERN_C_BEGIN /* - if ( defined MTCODER__USE_WRITE_THREAD) : main thread writes all data blocks to output stream - if (not defined MTCODER__USE_WRITE_THREAD) : any coder thread can write data blocks to output stream + if ( defined MTCODER_USE_WRITE_THREAD) : main thread writes all data blocks to output stream + if (not defined MTCODER_USE_WRITE_THREAD) : any coder thread can write data blocks to output stream */ -/* #define MTCODER__USE_WRITE_THREAD */ +/* #define MTCODER_USE_WRITE_THREAD */ -#ifndef _7ZIP_ST - #define MTCODER__GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1) - #define MTCODER__THREADS_MAX 64 - #define MTCODER__BLOCKS_MAX (MTCODER__GET_NUM_BLOCKS_FROM_THREADS(MTCODER__THREADS_MAX) + 3) +#ifndef Z7_ST + #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1) + #define MTCODER_THREADS_MAX 256 + #define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3) #else - #define MTCODER__THREADS_MAX 1 - #define MTCODER__BLOCKS_MAX 1 + #define MTCODER_THREADS_MAX 1 + #define MTCODER_BLOCKS_MAX 1 #endif -#ifndef _7ZIP_ST +#ifndef Z7_ST typedef struct @@ -37,15 +37,15 @@ typedef struct void MtProgressThunk_CreateVTable(CMtProgressThunk *p); -#define MtProgressThunk_Init(p) { (p)->inSize = 0; (p)->outSize = 0; } +#define MtProgressThunk_INIT(p) { (p)->inSize = 0; (p)->outSize = 0; } -struct _CMtCoder; +struct CMtCoder_; typedef struct { - struct _CMtCoder *mtCoder; + struct CMtCoder_ *mtCoder; unsigned index; int stop; Byte *inBuf; @@ -71,19 +71,20 @@ typedef struct } CMtCoderBlock; -typedef struct _CMtCoder +typedef struct CMtCoder_ { /* input variables */ size_t blockSize; /* size of input block */ unsigned numThreadsMax; + unsigned numThreadGroups; UInt64 expectedDataSize; - ISeqInStream *inStream; + ISeqInStreamPtr inStream; const Byte *inData; 
size_t inDataSize; - ICompressProgress *progress; + ICompressProgressPtr progress; ISzAllocPtr allocBig; IMtCoderCallback2 *mtCallback; @@ -100,13 +101,13 @@ typedef struct _CMtCoder BoolInt stopReading; SRes readRes; - #ifdef MTCODER__USE_WRITE_THREAD - CAutoResetEvent writeEvents[MTCODER__BLOCKS_MAX]; + #ifdef MTCODER_USE_WRITE_THREAD + CAutoResetEvent writeEvents[MTCODER_BLOCKS_MAX]; #else CAutoResetEvent finishedEvent; SRes writeRes; unsigned writeIndex; - Byte ReadyBlocks[MTCODER__BLOCKS_MAX]; + Byte ReadyBlocks[MTCODER_BLOCKS_MAX]; LONG numFinishedThreads; #endif @@ -120,11 +121,13 @@ typedef struct _CMtCoder CCriticalSection cs; unsigned freeBlockHead; - unsigned freeBlockList[MTCODER__BLOCKS_MAX]; + unsigned freeBlockList[MTCODER_BLOCKS_MAX]; CMtProgress mtProgress; - CMtCoderBlock blocks[MTCODER__BLOCKS_MAX]; - CMtCoderThread threads[MTCODER__THREADS_MAX]; + CMtCoderBlock blocks[MTCODER_BLOCKS_MAX]; + CMtCoderThread threads[MTCODER_THREADS_MAX]; + + CThreadNextGroup nextGroup; } CMtCoder; diff --git a/src/sdk/C/MtDec.c b/src/sdk/C/MtDec.c index 7803bf2..96274b6 100644 --- a/src/sdk/C/MtDec.c +++ b/src/sdk/C/MtDec.c @@ -1,16 +1,21 @@ /* MtDec.c -- Multi-thread Decoder -2019-02-02 : Igor Pavlov : Public domain */ +2024-02-20 : Igor Pavlov : Public domain */ #include "Precomp.h" // #define SHOW_DEBUG_INFO // #include +#include #ifdef SHOW_DEBUG_INFO #include #endif +#include "MtDec.h" + +#ifndef Z7_ST + #ifdef SHOW_DEBUG_INFO #define PRF(x) x #else @@ -19,11 +24,7 @@ #define PRF_STR_INT(s, d) PRF(printf("\n" s " %d\n", (unsigned)d)) -#include "MtDec.h" - -#ifndef _7ZIP_ST - -void MtProgress_Init(CMtProgress *p, ICompressProgress *progress) +void MtProgress_Init(CMtProgress *p, ICompressProgressPtr progress) { p->progress = progress; p->res = SZ_OK; @@ -77,39 +78,31 @@ void MtProgress_SetError(CMtProgress *p, SRes res) } -#define RINOK_THREAD(x) RINOK(x) - - -static WRes ArEvent_OptCreate_And_Reset(CEvent *p) -{ - if (Event_IsCreated(p)) - return Event_Reset(p); - return AutoResetEvent_CreateNotSignaled(p); -} +#define RINOK_THREAD(x) RINOK_WRes(x) -struct __CMtDecBufLink +struct CMtDecBufLink_ { - struct __CMtDecBufLink *next; + struct CMtDecBufLink_ *next; void *pad[3]; }; -typedef struct __CMtDecBufLink CMtDecBufLink; +typedef struct CMtDecBufLink_ CMtDecBufLink; #define MTDEC__LINK_DATA_OFFSET sizeof(CMtDecBufLink) #define MTDEC__DATA_PTR_FROM_LINK(link) ((Byte *)(link) + MTDEC__LINK_DATA_OFFSET) -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp); +static THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp); static WRes MtDecThread_CreateEvents(CMtDecThread *t) { - WRes wres = ArEvent_OptCreate_And_Reset(&t->canWrite); + WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->canWrite); if (wres == 0) { - wres = ArEvent_OptCreate_And_Reset(&t->canRead); + wres = AutoResetEvent_OptCreate_And_Reset(&t->canRead); if (wres == 0) return SZ_OK; } @@ -125,7 +118,7 @@ static SRes MtDecThread_CreateAndStart(CMtDecThread *t) { if (Thread_WasCreated(&t->thread)) return SZ_OK; - wres = Thread_Create(&t->thread, ThreadFunc, t); + wres = Thread_Create(&t->thread, MtDec_ThreadFunc, t); if (wres == 0) return SZ_OK; } @@ -156,8 +149,7 @@ static void MtDecThread_CloseThread(CMtDecThread *t) { Event_Set(&t->canWrite); /* we can disable it. 
There are no threads waiting canWrite in normal cases */ Event_Set(&t->canRead); - Thread_Wait(&t->thread); - Thread_Close(&t->thread); + Thread_Wait_Close(&t->thread); } Event_Close(&t->canRead); @@ -167,7 +159,7 @@ static void MtDecThread_CloseThread(CMtDecThread *t) static void MtDec_CloseThreads(CMtDec *p) { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) MtDecThread_CloseThread(&p->threads[i]); } @@ -179,25 +171,6 @@ static void MtDecThread_Destruct(CMtDecThread *t) -static SRes FullRead(ISeqInStream *stream, Byte *data, size_t *processedSize) -{ - size_t size = *processedSize; - *processedSize = 0; - while (size != 0) - { - size_t cur = size; - SRes res = ISeqInStream_Read(stream, data, &cur); - *processedSize += cur; - data += cur; - size -= cur; - RINOK(res); - if (cur == 0) - return SZ_OK; - } - return SZ_OK; -} - - static SRes MtDec_GetError_Spec(CMtDec *p, UInt64 interruptIndex, BoolInt *wasInterrupted) { SRes res; @@ -253,7 +226,7 @@ Byte *MtDec_GetCrossBuff(CMtDec *p) /* - ThreadFunc2() returns: + MtDec_ThreadFunc2() returns: 0 - in all normal cases (even for stream error or memory allocation error) (!= 0) - WRes error return by system threading function */ @@ -261,11 +234,11 @@ Byte *MtDec_GetCrossBuff(CMtDec *p) // #define MTDEC_ProgessStep (1 << 22) #define MTDEC_ProgessStep (1 << 0) -static WRes ThreadFunc2(CMtDecThread *t) +static WRes MtDec_ThreadFunc2(CMtDecThread *t) { CMtDec *p = t->mtDec; - PRF_STR_INT("ThreadFunc2", t->index); + PRF_STR_INT("MtDec_ThreadFunc2", t->index) // SetThreadAffinityMask(GetCurrentThread(), 1 << t->index); @@ -289,18 +262,19 @@ static WRes ThreadFunc2(CMtDecThread *t) Byte *afterEndData = NULL; size_t afterEndData_Size = 0; + BoolInt afterEndData_IsCross = False; BoolInt canCreateNewThread = False; // CMtDecCallbackInfo parse; CMtDecThread *nextThread; - PRF_STR_INT("Event_Wait(&t->canRead)", t->index); + PRF_STR_INT("=============== Event_Wait(&t->canRead)", t->index) - RINOK_THREAD(Event_Wait(&t->canRead)); + RINOK_THREAD(Event_Wait(&t->canRead)) if (p->exitThread) return 0; - PRF_STR_INT("after Event_Wait(&t->canRead)", t->index); + PRF_STR_INT("after Event_Wait(&t->canRead)", t->index) // if (t->index == 3) return 19; // for test @@ -372,7 +346,7 @@ static WRes ThreadFunc2(CMtDecThread *t) { size = p->inBufSize; - res = FullRead(p->inStream, data, &size); + res = SeqInStream_ReadMax(p->inStream, data, &size); // size = 10; // test @@ -418,10 +392,12 @@ static WRes ThreadFunc2(CMtDecThread *t) parse.srcFinished = finish; parse.canCreateNewThread = True; - // PRF(printf("\nParse size = %d\n", (unsigned)size)) + PRF(printf("\nParse size = %d\n", (unsigned)size)); p->mtCallback->Parse(p->mtCallbackObject, t->index, &parse); + PRF(printf(" Parse processed = %d, state = %d \n", (unsigned)parse.srcSize, (unsigned)parse.state)); + needWrite = True; canCreateNewThread = parse.canCreateNewThread; @@ -478,16 +454,12 @@ static WRes ThreadFunc2(CMtDecThread *t) if (parse.state == MTDEC_PARSE_END) { - p->crossStart = 0; - p->crossEnd = 0; - - if (crossSize != 0) - memcpy(data + parse.srcSize, parseData + parse.srcSize, size - parse.srcSize); // we need all data - afterEndData_Size = size - parse.srcSize; afterEndData = parseData + parse.srcSize; - + afterEndData_Size = size - parse.srcSize; + if (crossSize != 0) + afterEndData_IsCross = True; // we reduce data size to required bytes (parsed only) - inDataSize -= (size - parse.srcSize); + inDataSize -= afterEndData_Size; if (!prev) inDataSize_Start = 
parse.srcSize; break; @@ -616,7 +588,7 @@ static WRes ThreadFunc2(CMtDecThread *t) // if ( !finish ) we must call Event_Set(&nextThread->canWrite) in any case // if ( finish ) we switch to single-thread mode and there are 2 ways at the end of current iteration (current block): // - if (needContinue) after Write(&needContinue), we restore decoding with new iteration - // - otherwise we stop decoding and exit from ThreadFunc2() + // - otherwise we stop decoding and exit from MtDec_ThreadFunc2() // Don't change (finish) variable in the further code @@ -689,7 +661,7 @@ static WRes ThreadFunc2(CMtDecThread *t) // ---------- WRITE ---------- - RINOK_THREAD(Event_Wait(&t->canWrite)); + RINOK_THREAD(Event_Wait(&t->canWrite)) { BoolInt isErrorMode = False; @@ -752,13 +724,15 @@ static WRes ThreadFunc2(CMtDecThread *t) { // p->inProcessed += inCodePos; + PRF(printf("\n--Write afterSize = %d\n", (unsigned)afterEndData_Size)); + res = p->mtCallback->Write(p->mtCallbackObject, t->index, res == SZ_OK && needWriteToStream && !wasInterrupted, // needWrite - afterEndData, afterEndData_Size, + afterEndData, afterEndData_Size, afterEndData_IsCross, &needContinue, &canRecode); - - // res= E_INVALIDARG; // for test + + // res = SZ_ERROR_FAIL; // for test PRF(printf("\nAfter Write needContinue = %d\n", (unsigned)needContinue)); PRF(printf("\nprocessed = %d\n", (unsigned)p->inProcessed)); @@ -800,14 +774,14 @@ static WRes ThreadFunc2(CMtDecThread *t) if (!finish) { - RINOK_THREAD(Event_Set(&nextThread->canWrite)); + RINOK_THREAD(Event_Set(&nextThread->canWrite)) } else { if (needContinue) { // we restore decoding with new iteration - RINOK_THREAD(Event_Set(&p->threads[0].canWrite)); + RINOK_THREAD(Event_Set(&p->threads[0].canWrite)) } else { @@ -816,7 +790,7 @@ static WRes ThreadFunc2(CMtDecThread *t) return SZ_OK; p->exitThread = True; } - RINOK_THREAD(Event_Set(&p->threads[0].canRead)); + RINOK_THREAD(Event_Set(&p->threads[0].canRead)) } } } @@ -835,7 +809,17 @@ static WRes ThreadFunc2(CMtDecThread *t) #endif -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc1(void *pp) +typedef + #ifdef _WIN32 + UINT_PTR + #elif 1 + uintptr_t + #else + ptrdiff_t + #endif + MY_uintptr_t; + +static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp) { WRes res; @@ -844,10 +828,10 @@ static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc1(void *pp) // fprintf(stdout, "\n%d = %p\n", t->index, &t); - res = ThreadFunc2(t); + res = MtDec_ThreadFunc2(t); p = t->mtDec; if (res == 0) - return p->exitThreadWRes; + return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)p->exitThreadWRes; { // it's unexpected situation for some threading function error if (p->exitThreadWRes == 0) @@ -858,18 +842,17 @@ static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc1(void *pp) Event_Set(&p->threads[0].canWrite); MtProgress_SetError(&p->mtProgress, MY_SRes_HRESULT_FROM_WRes(res)); } - return res; + return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)res; } -static MY_NO_INLINE THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp) +static Z7_NO_INLINE THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp) { + #ifdef USE_ALLOCA CMtDecThread *t = (CMtDecThread *)pp; - // fprintf(stderr, "\n%d = %p - before", t->index, &t); - #ifdef USE_ALLOCA t->allocaPtr = alloca(t->index * 128); #endif - return ThreadFunc1(pp); + return MtDec_ThreadFunc1(pp); } @@ -883,7 +866,7 @@ int MtDec_PrepareRead(CMtDec *p) { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) if (i > p->numStartedThreads || p->numFilledThreads <= (i >= 
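/*
 * Sketch (not part of the upstream patch) of why the MtDec code above routes
 * thread results through an intermediate MY_uintptr_t cast: a small error
 * code has to travel through a pointer-sized thread return value, and casting
 * an int straight to or from a pointer type draws truncation/size warnings.
 * A pthread-style void* return is used here purely as an illustration; the
 * SDK's THREAD_FUNC_RET_TYPE differs per platform.
 */
#include <assert.h>
#include <stdint.h>

typedef void *thread_ret_t;            /* stand-in for THREAD_FUNC_RET_TYPE */

static thread_ret_t thread_func(int wres)
{
  /* widen to uintptr_t first, then to the pointer-sized return type */
  return (thread_ret_t)(uintptr_t)wres;
}

int main(void)
{
  thread_ret_t r = thread_func(12);    /* e.g. a small WRes/SRes error code */
  int wres = (int)(uintptr_t)r;        /* narrow back through the same type */
  assert(wres == 12);
  return 0;
}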
p->filledThreadStart ? @@ -987,7 +970,7 @@ void MtDec_Construct(CMtDec *p) p->allocatedBufsSize = 0; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CMtDecThread *t = &p->threads[i]; t->mtDec = p; @@ -995,7 +978,7 @@ void MtDec_Construct(CMtDec *p) t->inBuf = NULL; Event_Construct(&t->canRead); Event_Construct(&t->canWrite); - Thread_Construct(&t->thread); + Thread_CONSTRUCT(&t->thread) } // Event_Construct(&p->finishedEvent); @@ -1010,7 +993,7 @@ static void MtDec_Free(CMtDec *p) p->exitThread = True; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) MtDecThread_Destruct(&p->threads[i]); // Event_Close(&p->finishedEvent); @@ -1061,15 +1044,15 @@ SRes MtDec_Code(CMtDec *p) { unsigned numThreads = p->numThreadsMax; - if (numThreads > MTDEC__THREADS_MAX) - numThreads = MTDEC__THREADS_MAX; + if (numThreads > MTDEC_THREADS_MAX) + numThreads = MTDEC_THREADS_MAX; p->numStartedThreads_Limit = numThreads; p->numStartedThreads = 0; } if (p->inBufSize != p->allocatedBufsSize) { - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CMtDecThread *t = &p->threads[i]; if (t->inBuf) @@ -1086,19 +1069,20 @@ SRes MtDec_Code(CMtDec *p) MtProgress_Init(&p->mtProgress, p->progress); - // RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->finishedEvent)); + // RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->finishedEvent)) p->exitThread = False; p->exitThreadWRes = 0; { WRes wres; - WRes sres; + SRes sres; CMtDecThread *nextThread = &p->threads[p->numStartedThreads++]; // wres = MtDecThread_CreateAndStart(nextThread); wres = MtDecThread_CreateEvents(nextThread); if (wres == 0) { wres = Event_Set(&nextThread->canWrite); if (wres == 0) { wres = Event_Set(&nextThread->canRead); - if (wres == 0) { wres = ThreadFunc(nextThread); + if (wres == 0) { THREAD_FUNC_RET_TYPE res = MtDec_ThreadFunc(nextThread); + wres = (WRes)(MY_uintptr_t)res; if (wres != 0) { p->needContinue = False; @@ -1130,9 +1114,11 @@ SRes MtDec_Code(CMtDec *p) return SZ_OK; // if (sres != SZ_OK) - return sres; - // return E_FAIL; + return sres; + // return SZ_ERROR_FAIL; } } #endif + +#undef PRF diff --git a/src/sdk/C/MtDec.h b/src/sdk/C/MtDec.h index 9b57766..c28e8d9 100644 --- a/src/sdk/C/MtDec.h +++ b/src/sdk/C/MtDec.h @@ -1,46 +1,46 @@ /* MtDec.h -- Multi-thread Decoder -2018-07-04 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ -#ifndef __MT_DEC_H -#define __MT_DEC_H +#ifndef ZIP7_INC_MT_DEC_H +#define ZIP7_INC_MT_DEC_H #include "7zTypes.h" -#ifndef _7ZIP_ST +#ifndef Z7_ST #include "Threads.h" #endif EXTERN_C_BEGIN -#ifndef _7ZIP_ST +#ifndef Z7_ST -#ifndef _7ZIP_ST - #define MTDEC__THREADS_MAX 32 +#ifndef Z7_ST + #define MTDEC_THREADS_MAX 32 #else - #define MTDEC__THREADS_MAX 1 + #define MTDEC_THREADS_MAX 1 #endif typedef struct { - ICompressProgress *progress; + ICompressProgressPtr progress; SRes res; UInt64 totalInSize; UInt64 totalOutSize; CCriticalSection cs; } CMtProgress; -void MtProgress_Init(CMtProgress *p, ICompressProgress *progress); +void MtProgress_Init(CMtProgress *p, ICompressProgressPtr progress); SRes MtProgress_Progress_ST(CMtProgress *p); SRes MtProgress_ProgressAdd(CMtProgress *p, UInt64 inSize, UInt64 outSize); SRes MtProgress_GetError(CMtProgress *p); void MtProgress_SetError(CMtProgress *p, SRes res); -struct _CMtDec; +struct CMtDec; typedef struct { - struct _CMtDec *mtDec; + struct CMtDec_ *mtDec; unsigned index; void *inBuf; @@ -108,15 +108,16 @@ typedef struct */ SRes (*Write)(void 
*p, unsigned coderIndex, BoolInt needWriteToStream, - const Byte *src, size_t srcSize, + const Byte *src, size_t srcSize, BoolInt isCross, // int srcFinished, BoolInt *needContinue, BoolInt *canRecode); -} IMtDecCallback; +} IMtDecCallback2; -typedef struct _CMtDec + +typedef struct CMtDec_ { /* input variables */ @@ -125,14 +126,14 @@ typedef struct _CMtDec // size_t inBlockMax; unsigned numThreadsMax_2; - ISeqInStream *inStream; + ISeqInStreamPtr inStream; // const Byte *inData; // size_t inDataSize; - ICompressProgress *progress; + ICompressProgressPtr progress; ISzAllocPtr alloc; - IMtDecCallback *mtCallback; + IMtDecCallback2 *mtCallback; void *mtCallbackObject; @@ -170,11 +171,11 @@ typedef struct _CMtDec unsigned filledThreadStart; unsigned numFilledThreads; - #ifndef _7ZIP_ST + #ifndef Z7_ST BoolInt needInterrupt; UInt64 interruptIndex; CMtProgress mtProgress; - CMtDecThread threads[MTDEC__THREADS_MAX]; + CMtDecThread threads[MTDEC_THREADS_MAX]; #endif } CMtDec; diff --git a/src/sdk/C/Ppmd.h b/src/sdk/C/Ppmd.h index a5c1e3e..66b2626 100644 --- a/src/sdk/C/Ppmd.h +++ b/src/sdk/C/Ppmd.h @@ -1,15 +1,24 @@ /* Ppmd.h -- PPMD codec common code -2017-04-03 : Igor Pavlov : Public domain +2023-03-05 : Igor Pavlov : Public domain This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ -#ifndef __PPMD_H -#define __PPMD_H +#ifndef ZIP7_INC_PPMD_H +#define ZIP7_INC_PPMD_H #include "CpuArch.h" EXTERN_C_BEGIN -#ifdef MY_CPU_32BIT +#if defined(MY_CPU_SIZEOF_POINTER) && (MY_CPU_SIZEOF_POINTER == 4) +/* + PPMD code always uses 32-bit internal fields in PPMD structures to store internal references in main block. + if (PPMD_32BIT is defined), the PPMD code stores internal pointers to 32-bit reference fields. + if (PPMD_32BIT is NOT defined), the PPMD code stores internal UInt32 offsets to reference fields. + if (pointer size is 64-bit), then (PPMD_32BIT) mode is not allowed, + if (pointer size is 32-bit), then (PPMD_32BIT) mode is optional, + and it's allowed to disable PPMD_32BIT mode even if pointer is 32-bit. + PPMD code works slightly faster in (PPMD_32BIT) mode. +*/ #define PPMD_32BIT #endif @@ -28,7 +37,7 @@ EXTERN_C_BEGIN #define PPMD_N4 ((128 + 3 - 1 * PPMD_N1 - 2 * PPMD_N2 - 3 * PPMD_N3) / 4) #define PPMD_NUM_INDEXES (PPMD_N1 + PPMD_N2 + PPMD_N3 + PPMD_N4) -#pragma pack(push, 1) +MY_CPU_pragma_pack_push_1 /* Most compilers works OK here even without #pragma pack(push, 1), but some GCC compilers need it. 
*/ /* SEE-contexts for PPM-contexts with masked symbols */ @@ -39,42 +48,117 @@ typedef struct Byte Count; /* Count to next change of Shift */ } CPpmd_See; -#define Ppmd_See_Update(p) if ((p)->Shift < PPMD_PERIOD_BITS && --(p)->Count == 0) \ - { (p)->Summ <<= 1; (p)->Count = (Byte)(3 << (p)->Shift++); } +#define Ppmd_See_UPDATE(p) \ + { if ((p)->Shift < PPMD_PERIOD_BITS && --(p)->Count == 0) \ + { (p)->Summ = (UInt16)((p)->Summ << 1); \ + (p)->Count = (Byte)(3 << (p)->Shift++); }} + typedef struct { Byte Symbol; Byte Freq; - UInt16 SuccessorLow; - UInt16 SuccessorHigh; + UInt16 Successor_0; + UInt16 Successor_1; } CPpmd_State; -#pragma pack(pop) - -typedef - #ifdef PPMD_32BIT - CPpmd_State * - #else - UInt32 - #endif - CPpmd_State_Ref; - -typedef - #ifdef PPMD_32BIT - void * - #else - UInt32 - #endif - CPpmd_Void_Ref; - -typedef - #ifdef PPMD_32BIT - Byte * - #else - UInt32 - #endif - CPpmd_Byte_Ref; +typedef struct CPpmd_State2_ +{ + Byte Symbol; + Byte Freq; +} CPpmd_State2; + +typedef struct CPpmd_State4_ +{ + UInt16 Successor_0; + UInt16 Successor_1; +} CPpmd_State4; + +MY_CPU_pragma_pop + +/* + PPMD code can write full CPpmd_State structure data to CPpmd*_Context + at (byte offset = 2) instead of some fields of original CPpmd*_Context structure. + + If we use pointers to different types, but that point to shared + memory space, we can have aliasing problem (strict aliasing). + + XLC compiler in -O2 mode can change the order of memory write instructions + in relation to read instructions, if we have use pointers to different types. + + To solve that aliasing problem we use combined CPpmd*_Context structure + with unions that contain the fields from both structures: + the original CPpmd*_Context and CPpmd_State. + So we can access the fields from both structures via one pointer, + and the compiler doesn't change the order of write instructions + in relation to read instructions. + + If we don't use memory write instructions to shared memory in + some local code, and we use only reading instructions (read only), + then probably it's safe to use pointers to different types for reading. +*/ + + + +#ifdef PPMD_32BIT + + #define Ppmd_Ref_Type(type) type * + #define Ppmd_GetRef(p, ptr) (ptr) + #define Ppmd_GetPtr(p, ptr) (ptr) + #define Ppmd_GetPtr_Type(p, ptr, note_type) (ptr) + +#else + + #define Ppmd_Ref_Type(type) UInt32 + #define Ppmd_GetRef(p, ptr) ((UInt32)((Byte *)(ptr) - (p)->Base)) + #define Ppmd_GetPtr(p, offs) ((void *)((p)->Base + (offs))) + #define Ppmd_GetPtr_Type(p, offs, type) ((type *)Ppmd_GetPtr(p, offs)) + +#endif // PPMD_32BIT + + +typedef Ppmd_Ref_Type(CPpmd_State) CPpmd_State_Ref; +typedef Ppmd_Ref_Type(void) CPpmd_Void_Ref; +typedef Ppmd_Ref_Type(Byte) CPpmd_Byte_Ref; + + +/* +#ifdef MY_CPU_LE_UNALIGN +// the unaligned 32-bit access latency can be too large, if the data is not in L1 cache. +#define Ppmd_GET_SUCCESSOR(p) ((CPpmd_Void_Ref)*(const UInt32 *)(const void *)&(p)->Successor_0) +#define Ppmd_SET_SUCCESSOR(p, v) *(UInt32 *)(void *)(void *)&(p)->Successor_0 = (UInt32)(v) + +#else +*/ + +/* + We can write 16-bit halves to 32-bit (Successor) field in any selected order. + But the native order is more consistent way. + So we use the native order, if LE/BE order can be detected here at compile time. 
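/*
 * Sketch (not part of the upstream patch) of the reference scheme behind the
 * Ppmd_GetRef()/Ppmd_GetPtr() macros above: when PPMD_32BIT is not defined,
 * PPMd stores 32-bit byte offsets from a single Base pointer instead of raw
 * pointers, so every internal reference fits in 4 bytes even on 64-bit
 * targets.  Names here are illustrative only.
 */
#include <assert.h>
#include <stdlib.h>

typedef unsigned char Byte;
typedef unsigned int  UInt32;

typedef struct { Byte *Base; } Pool;

static UInt32 pool_ref(const Pool *p, const void *ptr)   /* pointer -> offset */
{
  return (UInt32)((const Byte *)ptr - p->Base);
}

static void *pool_ptr(const Pool *p, UInt32 offs)        /* offset -> pointer */
{
  return p->Base + offs;
}

int main(void)
{
  Pool p;
  p.Base = (Byte *)malloc(1024);
  if (!p.Base)
    return 1;
  {
    void *obj = p.Base + 120;          /* some object inside the pool */
    UInt32 r = pool_ref(&p, obj);      /* 4-byte reference, r == 120 */
    assert(pool_ptr(&p, r) == obj);    /* round-trips back to the pointer */
  }
  free(p.Base);
  return 0;
}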
+*/ + +#ifdef MY_CPU_BE + + #define Ppmd_GET_SUCCESSOR(p) \ + ( (CPpmd_Void_Ref) (((UInt32)(p)->Successor_0 << 16) | (p)->Successor_1) ) + + #define Ppmd_SET_SUCCESSOR(p, v) { \ + (p)->Successor_0 = (UInt16)(((UInt32)(v) >> 16) /* & 0xFFFF */); \ + (p)->Successor_1 = (UInt16)((UInt32)(v) /* & 0xFFFF */); } + +#else + + #define Ppmd_GET_SUCCESSOR(p) \ + ( (CPpmd_Void_Ref) ((p)->Successor_0 | ((UInt32)(p)->Successor_1 << 16)) ) + + #define Ppmd_SET_SUCCESSOR(p, v) { \ + (p)->Successor_0 = (UInt16)((UInt32)(v) /* & 0xFFFF */); \ + (p)->Successor_1 = (UInt16)(((UInt32)(v) >> 16) /* & 0xFFFF */); } + +#endif + +// #endif + #define PPMD_SetAllBitsIn256Bytes(p) \ { size_t z; for (z = 0; z < 256 / sizeof(p[0]); z += 8) { \ diff --git a/src/sdk/C/Ppmd7.c b/src/sdk/C/Ppmd7.c index 470aadc..efcc5d8 100644 --- a/src/sdk/C/Ppmd7.c +++ b/src/sdk/C/Ppmd7.c @@ -1,5 +1,5 @@ /* Ppmd7.c -- PPMdH codec -2018-07-04 : Igor Pavlov : Public domain +2023-09-07 : Igor Pavlov : Public domain This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ #include "Precomp.h" @@ -8,21 +8,23 @@ This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ #include "Ppmd7.h" -const Byte PPMD7_kExpEscape[16] = { 25, 14, 9, 7, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2 }; -static const UInt16 kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x5ABC, 0x6632, 0x6051}; +/* define PPMD7_ORDER_0_SUPPPORT to suport order-0 mode, unsupported by orignal PPMd var.H. code */ +// #define PPMD7_ORDER_0_SUPPPORT + +MY_ALIGN(16) +static const Byte PPMD7_kExpEscape[16] = { 25, 14, 9, 7, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2 }; +MY_ALIGN(16) +static const UInt16 PPMD7_kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x5ABC, 0x6632, 0x6051}; #define MAX_FREQ 124 #define UNIT_SIZE 12 #define U2B(nu) ((UInt32)(nu) * UNIT_SIZE) #define U2I(nu) (p->Units2Indx[(size_t)(nu) - 1]) -#define I2U(indx) (p->Indx2Units[indx]) +#define I2U(indx) ((unsigned)p->Indx2Units[indx]) +#define I2U_UInt16(indx) ((UInt16)p->Indx2Units[indx]) -#ifdef PPMD_32BIT - #define REF(ptr) (ptr) -#else - #define REF(ptr) ((UInt32)((Byte *)(ptr) - (p)->Base)) -#endif +#define REF(ptr) Ppmd_GetRef(p, ptr) #define STATS_REF(ptr) ((CPpmd_State_Ref)REF(ptr)) @@ -31,17 +33,11 @@ static const UInt16 kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x #define ONE_STATE(ctx) Ppmd7Context_OneState(ctx) #define SUFFIX(ctx) CTX((ctx)->Suffix) -typedef CPpmd7_Context * CTX_PTR; +typedef CPpmd7_Context * PPMD7_CTX_PTR; struct CPpmd7_Node_; -typedef - #ifdef PPMD_32BIT - struct CPpmd7_Node_ * - #else - UInt32 - #endif - CPpmd7_Node_Ref; +typedef Ppmd_Ref_Type(struct CPpmd7_Node_) CPpmd7_Node_Ref; typedef struct CPpmd7_Node_ { @@ -51,17 +47,13 @@ typedef struct CPpmd7_Node_ CPpmd7_Node_Ref Prev; } CPpmd7_Node; -#ifdef PPMD_32BIT - #define NODE(ptr) (ptr) -#else - #define NODE(offs) ((CPpmd7_Node *)(p->Base + (offs))) -#endif +#define NODE(r) Ppmd_GetPtr_Type(p, r, CPpmd7_Node) void Ppmd7_Construct(CPpmd7 *p) { unsigned i, k, m; - p->Base = 0; + p->Base = NULL; for (i = 0, k = 0; i < PPMD_NUM_INDEXES; i++) { @@ -77,6 +69,7 @@ void Ppmd7_Construct(CPpmd7 *p) for (i = 0; i < 3; i++) p->NS2Indx[i] = (Byte)i; + for (m = i, k = 1; i < 256; i++) { p->NS2Indx[i] = (Byte)m; @@ -84,183 +77,245 @@ void Ppmd7_Construct(CPpmd7 *p) k = (++m) - 2; } - memset(p->HB2Flag, 0, 0x40); - memset(p->HB2Flag + 0x40, 8, 0x100 - 0x40); + memcpy(p->ExpEscape, PPMD7_kExpEscape, 16); } + void Ppmd7_Free(CPpmd7 *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->Base); p->Size = 
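/*
 * Sketch (not part of the upstream patch) of the little-endian variant of the
 * Ppmd_GET_SUCCESSOR / Ppmd_SET_SUCCESSOR macros above: the 32-bit successor
 * reference is stored as two UInt16 halves so the packed CPpmd_State layout
 * needs no unaligned 32-bit access.  Struct and field names mirror the header
 * for readability only.
 */
#include <assert.h>

typedef unsigned short UInt16;
typedef unsigned int   UInt32;

typedef struct { UInt16 Successor_0, Successor_1; } State;

static void set_successor(State *s, UInt32 v)
{
  s->Successor_0 = (UInt16)(v & 0xFFFF);          /* low half  */
  s->Successor_1 = (UInt16)(v >> 16);             /* high half */
}

static UInt32 get_successor(const State *s)
{
  return (UInt32)s->Successor_0 | ((UInt32)s->Successor_1 << 16);
}

int main(void)
{
  State s;
  set_successor(&s, 0x12345678u);
  assert(get_successor(&s) == 0x12345678u);       /* lossless round trip */
  return 0;
}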
0; - p->Base = 0; + p->Base = NULL; } + BoolInt Ppmd7_Alloc(CPpmd7 *p, UInt32 size, ISzAllocPtr alloc) { if (!p->Base || p->Size != size) { - size_t size2; Ppmd7_Free(p, alloc); - size2 = 0 - #ifndef PPMD_32BIT - + UNIT_SIZE - #endif - ; - p->AlignOffset = - #ifdef PPMD_32BIT - (4 - size) & 3; - #else - 4 - (size & 3); - #endif - if ((p->Base = (Byte *)ISzAlloc_Alloc(alloc, p->AlignOffset + size + size2)) == 0) + p->AlignOffset = (4 - size) & 3; + if ((p->Base = (Byte *)ISzAlloc_Alloc(alloc, p->AlignOffset + size)) == NULL) return False; p->Size = size; } return True; } -static void InsertNode(CPpmd7 *p, void *node, unsigned indx) + + +// ---------- Internal Memory Allocator ---------- + +/* We can use CPpmd7_Node in list of free units (as in Ppmd8) + But we still need one additional list walk pass in Ppmd7_GlueFreeBlocks(). + So we use simple CPpmd_Void_Ref instead of CPpmd7_Node in Ppmd7_InsertNode() / Ppmd7_RemoveNode() +*/ + +#define EMPTY_NODE 0 + + +static void Ppmd7_InsertNode(CPpmd7 *p, void *node, unsigned indx) { *((CPpmd_Void_Ref *)node) = p->FreeList[indx]; + // ((CPpmd7_Node *)node)->Next = (CPpmd7_Node_Ref)p->FreeList[indx]; + p->FreeList[indx] = REF(node); + } -static void *RemoveNode(CPpmd7 *p, unsigned indx) + +static void *Ppmd7_RemoveNode(CPpmd7 *p, unsigned indx) { CPpmd_Void_Ref *node = (CPpmd_Void_Ref *)Ppmd7_GetPtr(p, p->FreeList[indx]); p->FreeList[indx] = *node; + // CPpmd7_Node *node = NODE((CPpmd7_Node_Ref)p->FreeList[indx]); + // p->FreeList[indx] = node->Next; return node; } -static void SplitBlock(CPpmd7 *p, void *ptr, unsigned oldIndx, unsigned newIndx) + +static void Ppmd7_SplitBlock(CPpmd7 *p, void *ptr, unsigned oldIndx, unsigned newIndx) { unsigned i, nu = I2U(oldIndx) - I2U(newIndx); ptr = (Byte *)ptr + U2B(I2U(newIndx)); if (I2U(i = U2I(nu)) != nu) { unsigned k = I2U(--i); - InsertNode(p, ((Byte *)ptr) + U2B(k), nu - k - 1); + Ppmd7_InsertNode(p, ((Byte *)ptr) + U2B(k), nu - k - 1); } - InsertNode(p, ptr, i); + Ppmd7_InsertNode(p, ptr, i); } -static void GlueFreeBlocks(CPpmd7 *p) + +/* we use CPpmd7_Node_Union union to solve XLC -O2 strict pointer aliasing problem */ + +typedef union { - #ifdef PPMD_32BIT - CPpmd7_Node headItem; - CPpmd7_Node_Ref head = &headItem; - #else - CPpmd7_Node_Ref head = p->AlignOffset + p->Size; - #endif - - CPpmd7_Node_Ref n = head; - unsigned i; + CPpmd7_Node Node; + CPpmd7_Node_Ref NextRef; +} CPpmd7_Node_Union; + +/* Original PPmdH (Ppmd7) code uses doubly linked list in Ppmd7_GlueFreeBlocks() + we use single linked list similar to Ppmd8 code */ + +static void Ppmd7_GlueFreeBlocks(CPpmd7 *p) +{ + /* + we use first UInt16 field of 12-bytes UNITs as record type stamp + CPpmd_State { Byte Symbol; Byte Freq; : Freq != 0 + CPpmd7_Context { UInt16 NumStats; : NumStats != 0 + CPpmd7_Node { UInt16 Stamp : Stamp == 0 for free record + : Stamp == 1 for head record and guard + Last 12-bytes UNIT in array is always contains 12-bytes order-0 CPpmd7_Context record. + */ + CPpmd7_Node_Ref head, n = 0; + p->GlueCount = 255; - /* create doubly-linked list of free blocks */ - for (i = 0; i < PPMD_NUM_INDEXES; i++) + + /* we set guard NODE at LoUnit */ + if (p->LoUnit != p->HiUnit) + ((CPpmd7_Node *)(void *)p->LoUnit)->Stamp = 1; + { - UInt16 nu = I2U(i); - CPpmd7_Node_Ref next = (CPpmd7_Node_Ref)p->FreeList[i]; - p->FreeList[i] = 0; - while (next != 0) + /* Create list of free blocks. + We still need one additional list walk pass before Glue. 
*/ + unsigned i; + for (i = 0; i < PPMD_NUM_INDEXES; i++) { - CPpmd7_Node *node = NODE(next); - node->Next = n; - n = NODE(n)->Prev = next; - next = *(const CPpmd7_Node_Ref *)node; - node->Stamp = 0; - node->NU = (UInt16)nu; + const UInt16 nu = I2U_UInt16(i); + CPpmd7_Node_Ref next = (CPpmd7_Node_Ref)p->FreeList[i]; + p->FreeList[i] = 0; + while (next != 0) + { + /* Don't change the order of the following commands: */ + CPpmd7_Node_Union *un = (CPpmd7_Node_Union *)NODE(next); + const CPpmd7_Node_Ref tmp = next; + next = un->NextRef; + un->Node.Stamp = EMPTY_NODE; + un->Node.NU = nu; + un->Node.Next = n; + n = tmp; + } } } - NODE(head)->Stamp = 1; - NODE(head)->Next = n; - NODE(n)->Prev = head; - if (p->LoUnit != p->HiUnit) - ((CPpmd7_Node *)p->LoUnit)->Stamp = 1; - - /* Glue free blocks */ - while (n != head) + + head = n; + /* Glue and Fill must walk the list in same direction */ { - CPpmd7_Node *node = NODE(n); - UInt32 nu = (UInt32)node->NU; - for (;;) + /* Glue free blocks */ + CPpmd7_Node_Ref *prev = &head; + while (n) { - CPpmd7_Node *node2 = NODE(n) + nu; - nu += node2->NU; - if (node2->Stamp != 0 || nu >= 0x10000) - break; - NODE(node2->Prev)->Next = node2->Next; - NODE(node2->Next)->Prev = node2->Prev; - node->NU = (UInt16)nu; + CPpmd7_Node *node = NODE(n); + UInt32 nu = node->NU; + n = node->Next; + if (nu == 0) + { + *prev = n; + continue; + } + prev = &node->Next; + for (;;) + { + CPpmd7_Node *node2 = node + nu; + nu += node2->NU; + if (node2->Stamp != EMPTY_NODE || nu >= 0x10000) + break; + node->NU = (UInt16)nu; + node2->NU = 0; + } } - n = node->Next; } - + /* Fill lists of free blocks */ - for (n = NODE(head)->Next; n != head;) + for (n = head; n != 0;) { CPpmd7_Node *node = NODE(n); - unsigned nu; - CPpmd7_Node_Ref next = node->Next; - for (nu = node->NU; nu > 128; nu -= 128, node += 128) - InsertNode(p, node, PPMD_NUM_INDEXES - 1); + UInt32 nu = node->NU; + unsigned i; + n = node->Next; + if (nu == 0) + continue; + for (; nu > 128; nu -= 128, node += 128) + Ppmd7_InsertNode(p, node, PPMD_NUM_INDEXES - 1); if (I2U(i = U2I(nu)) != nu) { unsigned k = I2U(--i); - InsertNode(p, node + k, nu - k - 1); + Ppmd7_InsertNode(p, node + k, (unsigned)nu - k - 1); } - InsertNode(p, node, i); - n = next; + Ppmd7_InsertNode(p, node, i); } } -static void *AllocUnitsRare(CPpmd7 *p, unsigned indx) + +Z7_NO_INLINE +static void *Ppmd7_AllocUnitsRare(CPpmd7 *p, unsigned indx) { unsigned i; - void *retVal; + if (p->GlueCount == 0) { - GlueFreeBlocks(p); + Ppmd7_GlueFreeBlocks(p); if (p->FreeList[indx] != 0) - return RemoveNode(p, indx); + return Ppmd7_RemoveNode(p, indx); } + i = indx; + do { if (++i == PPMD_NUM_INDEXES) { UInt32 numBytes = U2B(I2U(indx)); + Byte *us = p->UnitsStart; p->GlueCount--; - return ((UInt32)(p->UnitsStart - p->Text) > numBytes) ? (p->UnitsStart -= numBytes) : (NULL); + return ((UInt32)(us - p->Text) > numBytes) ? 
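/*
 * Sketch (not part of the upstream patch) showing what the MEM_12_CPY macro
 * above does: PPMd allocates memory in 12-byte UNITs, and the macro copies
 * "num" such units as three 32-bit words per unit.  The comparison against a
 * plain memcpy of num * 12 bytes only illustrates the equivalence.
 */
#include <assert.h>
#include <string.h>

typedef unsigned int UInt32;
#define UNIT_SIZE 12

static void mem_12_cpy(void *dest, const void *src, unsigned num)
{
  UInt32 *d = (UInt32 *)dest;
  const UInt32 *z = (const UInt32 *)src;
  do
  {
    d[0] = z[0];
    d[1] = z[1];
    d[2] = z[2];                       /* one 12-byte unit = three 32-bit words */
    z += 3;
    d += 3;
  }
  while (--num);
}

int main(void)
{
  /* buffers sized and aligned for whole units */
  static UInt32 src[3 * 4], a[3 * 4], b[3 * 4];
  unsigned i;
  for (i = 0; i < 3 * 4; i++)
    src[i] = i * 0x01010101u;
  mem_12_cpy(a, src, 4);
  memcpy(b, src, 4 * UNIT_SIZE);
  assert(memcmp(a, b, sizeof(a)) == 0);
  return 0;
}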
(p->UnitsStart = us - numBytes) : NULL; } } while (p->FreeList[i] == 0); - retVal = RemoveNode(p, i); - SplitBlock(p, retVal, i, indx); - return retVal; + + { + void *block = Ppmd7_RemoveNode(p, i); + Ppmd7_SplitBlock(p, block, i, indx); + return block; + } } -static void *AllocUnits(CPpmd7 *p, unsigned indx) + +static void *Ppmd7_AllocUnits(CPpmd7 *p, unsigned indx) { - UInt32 numBytes; if (p->FreeList[indx] != 0) - return RemoveNode(p, indx); - numBytes = U2B(I2U(indx)); - if (numBytes <= (UInt32)(p->HiUnit - p->LoUnit)) + return Ppmd7_RemoveNode(p, indx); { - void *retVal = p->LoUnit; - p->LoUnit += numBytes; - return retVal; + UInt32 numBytes = U2B(I2U(indx)); + Byte *lo = p->LoUnit; + if ((UInt32)(p->HiUnit - lo) >= numBytes) + { + p->LoUnit = lo + numBytes; + return lo; + } } - return AllocUnitsRare(p, indx); + return Ppmd7_AllocUnitsRare(p, indx); } -#define MyMem12Cpy(dest, src, num) \ - { UInt32 *d = (UInt32 *)dest; const UInt32 *s = (const UInt32 *)src; UInt32 n = num; \ - do { d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; s += 3; d += 3; } while (--n); } +#define MEM_12_CPY(dest, src, num) \ + { UInt32 *d = (UInt32 *)(dest); \ + const UInt32 *z = (const UInt32 *)(src); \ + unsigned n = (num); \ + do { \ + d[0] = z[0]; \ + d[1] = z[1]; \ + d[2] = z[2]; \ + z += 3; \ + d += 3; \ + } while (--n); \ + } + + +/* static void *ShrinkUnits(CPpmd7 *p, void *oldPtr, unsigned oldNU, unsigned newNU) { unsigned i0 = U2I(oldNU); @@ -269,28 +324,33 @@ static void *ShrinkUnits(CPpmd7 *p, void *oldPtr, unsigned oldNU, unsigned newNU return oldPtr; if (p->FreeList[i1] != 0) { - void *ptr = RemoveNode(p, i1); - MyMem12Cpy(ptr, oldPtr, newNU); - InsertNode(p, oldPtr, i0); + void *ptr = Ppmd7_RemoveNode(p, i1); + MEM_12_CPY(ptr, oldPtr, newNU) + Ppmd7_InsertNode(p, oldPtr, i0); return ptr; } - SplitBlock(p, oldPtr, i0, i1); + Ppmd7_SplitBlock(p, oldPtr, i0, i1); return oldPtr; } +*/ -#define SUCCESSOR(p) ((CPpmd_Void_Ref)((p)->SuccessorLow | ((UInt32)(p)->SuccessorHigh << 16))) +#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) static void SetSuccessor(CPpmd_State *p, CPpmd_Void_Ref v) { - (p)->SuccessorLow = (UInt16)((UInt32)(v) & 0xFFFF); - (p)->SuccessorHigh = (UInt16)(((UInt32)(v) >> 16) & 0xFFFF); + Ppmd_SET_SUCCESSOR(p, v) } -static void RestartModel(CPpmd7 *p) + + +Z7_NO_INLINE +static +void Ppmd7_RestartModel(CPpmd7 *p) { - unsigned i, k, m; + unsigned i, k; memset(p->FreeList, 0, sizeof(p->FreeList)); + p->Text = p->Base + p->AlignOffset; p->HiUnit = p->Text + p->Size; p->LoUnit = p->UnitsStart = p->HiUnit - p->Size / 8 / UNIT_SIZE * 7 * UNIT_SIZE; @@ -300,57 +360,110 @@ static void RestartModel(CPpmd7 *p) p->RunLength = p->InitRL = -(Int32)((p->MaxOrder < 12) ? 
p->MaxOrder : 12) - 1; p->PrevSuccess = 0; - p->MinContext = p->MaxContext = (CTX_PTR)(p->HiUnit -= UNIT_SIZE); /* AllocContext(p); */ - p->MinContext->Suffix = 0; - p->MinContext->NumStats = 256; - p->MinContext->SummFreq = 256 + 1; - p->FoundState = (CPpmd_State *)p->LoUnit; /* AllocUnits(p, PPMD_NUM_INDEXES - 1); */ - p->LoUnit += U2B(256 / 2); - p->MinContext->Stats = REF(p->FoundState); - for (i = 0; i < 256; i++) { - CPpmd_State *s = &p->FoundState[i]; - s->Symbol = (Byte)i; - s->Freq = 1; - SetSuccessor(s, 0); + CPpmd7_Context *mc = (PPMD7_CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); /* AllocContext(p); */ + CPpmd_State *s = (CPpmd_State *)p->LoUnit; /* Ppmd7_AllocUnits(p, PPMD_NUM_INDEXES - 1); */ + + p->LoUnit += U2B(256 / 2); + p->MaxContext = p->MinContext = mc; + p->FoundState = s; + + mc->NumStats = 256; + mc->Union2.SummFreq = 256 + 1; + mc->Union4.Stats = REF(s); + mc->Suffix = 0; + + for (i = 0; i < 256; i++, s++) + { + s->Symbol = (Byte)i; + s->Freq = 1; + SetSuccessor(s, 0); + } + + #ifdef PPMD7_ORDER_0_SUPPPORT + if (p->MaxOrder == 0) + { + CPpmd_Void_Ref r = REF(mc); + s = p->FoundState; + for (i = 0; i < 256; i++, s++) + SetSuccessor(s, r); + return; + } + #endif } for (i = 0; i < 128; i++) + + + for (k = 0; k < 8; k++) { + unsigned m; UInt16 *dest = p->BinSumm[i] + k; - UInt16 val = (UInt16)(PPMD_BIN_SCALE - kInitBinEsc[k] / (i + 2)); + const UInt16 val = (UInt16)(PPMD_BIN_SCALE - PPMD7_kInitBinEsc[k] / (i + 2)); for (m = 0; m < 64; m += 8) dest[m] = val; } - + + for (i = 0; i < 25; i++) - for (k = 0; k < 16; k++) + { + + CPpmd_See *s = p->See[i]; + + + + unsigned summ = ((5 * i + 10) << (PPMD_PERIOD_BITS - 4)); + for (k = 0; k < 16; k++, s++) { - CPpmd_See *s = &p->See[i][k]; - s->Summ = (UInt16)((5 * i + 10) << (s->Shift = PPMD_PERIOD_BITS - 4)); + s->Summ = (UInt16)summ; + s->Shift = (PPMD_PERIOD_BITS - 4); s->Count = 4; } + } + + p->DummySee.Summ = 0; /* unused */ + p->DummySee.Shift = PPMD_PERIOD_BITS; + p->DummySee.Count = 64; /* unused */ } + void Ppmd7_Init(CPpmd7 *p, unsigned maxOrder) { p->MaxOrder = maxOrder; - RestartModel(p); - p->DummySee.Shift = PPMD_PERIOD_BITS; - p->DummySee.Summ = 0; /* unused */ - p->DummySee.Count = 64; /* unused */ + + Ppmd7_RestartModel(p); } -static CTX_PTR CreateSuccessors(CPpmd7 *p, BoolInt skip) + + +/* + Ppmd7_CreateSuccessors() + It's called when (FoundState->Successor) is RAW-Successor, + that is the link to position in Raw text. + So we create Context records and write the links to + FoundState->Successor and to identical RAW-Successors in suffix + contexts of MinContex. 
+ + The function returns: + if (OrderFall == 0) then MinContext is already at MAX order, + { return pointer to new or existing context of same MAX order } + else + { return pointer to new real context that will be (Order+1) in comparison with MinContext + + also it can return pointer to real context of same order, +*/ + +Z7_NO_INLINE +static PPMD7_CTX_PTR Ppmd7_CreateSuccessors(CPpmd7 *p) { - CPpmd_State upState; - CTX_PTR c = p->MinContext; + PPMD7_CTX_PTR c = p->MinContext; CPpmd_Byte_Ref upBranch = (CPpmd_Byte_Ref)SUCCESSOR(p->FoundState); - CPpmd_State *ps[PPMD7_MAX_ORDER]; + Byte newSym, newFreq; unsigned numPs = 0; - - if (!skip) + CPpmd_State *ps[PPMD7_MAX_ORDER]; + + if (p->OrderFall != 0) ps[numPs++] = p->FoundState; while (c->Suffix) @@ -358,54 +471,83 @@ static CTX_PTR CreateSuccessors(CPpmd7 *p, BoolInt skip) CPpmd_Void_Ref successor; CPpmd_State *s; c = SUFFIX(c); + + if (c->NumStats != 1) { - for (s = STATS(c); s->Symbol != p->FoundState->Symbol; s++); + Byte sym = p->FoundState->Symbol; + for (s = STATS(c); s->Symbol != sym; s++); + } else + { s = ONE_STATE(c); + + } successor = SUCCESSOR(s); if (successor != upBranch) { + // (c) is real record Context here, c = CTX(successor); if (numPs == 0) + { + // (c) is real record MAX Order Context here, + // So we don't need to create any new contexts. return c; + } break; } ps[numPs++] = s; } - upState.Symbol = *(const Byte *)Ppmd7_GetPtr(p, upBranch); - SetSuccessor(&upState, upBranch + 1); + // All created contexts will have single-symbol with new RAW-Successor + // All new RAW-Successors will point to next position in RAW text + // after FoundState->Successor + + newSym = *(const Byte *)Ppmd7_GetPtr(p, upBranch); + upBranch++; + if (c->NumStats == 1) - upState.Freq = ONE_STATE(c)->Freq; + newFreq = ONE_STATE(c)->Freq; else { UInt32 cf, s0; CPpmd_State *s; - for (s = STATS(c); s->Symbol != upState.Symbol; s++); - cf = s->Freq - 1; - s0 = c->SummFreq - c->NumStats - cf; - upState.Freq = (Byte)(1 + ((2 * cf <= s0) ? (5 * cf > s0) : ((2 * cf + 3 * s0 - 1) / (2 * s0)))); + for (s = STATS(c); s->Symbol != newSym; s++); + cf = (UInt32)s->Freq - 1; + s0 = (UInt32)c->Union2.SummFreq - c->NumStats - cf; + /* + cf - is frequency of symbol that will be Successor in new context records. + s0 - is commulative frequency sum of another symbols from parent context. + max(newFreq)= (s->Freq + 1), when (s0 == 1) + we have requirement (Ppmd7Context_OneState()->Freq <= 128) in BinSumm[] + so (s->Freq < 128) - is requirement for multi-symbol contexts + */ + newFreq = (Byte)(1 + ((2 * cf <= s0) ? 
(5 * cf > s0) : (2 * cf + s0 - 1) / (2 * s0) + 1)); } + // Create new single-symbol contexts from low order to high order in loop + do { - /* Create Child */ - CTX_PTR c1; /* = AllocContext(p); */ + PPMD7_CTX_PTR c1; + /* = AllocContext(p); */ if (p->HiUnit != p->LoUnit) - c1 = (CTX_PTR)(p->HiUnit -= UNIT_SIZE); + c1 = (PPMD7_CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); else if (p->FreeList[0] != 0) - c1 = (CTX_PTR)RemoveNode(p, 0); + c1 = (PPMD7_CTX_PTR)Ppmd7_RemoveNode(p, 0); else { - c1 = (CTX_PTR)AllocUnitsRare(p, 0); + c1 = (PPMD7_CTX_PTR)Ppmd7_AllocUnitsRare(p, 0); if (!c1) return NULL; } + c1->NumStats = 1; - *ONE_STATE(c1) = upState; + ONE_STATE(c1)->Symbol = newSym; + ONE_STATE(c1)->Freq = newFreq; + SetSuccessor(ONE_STATE(c1), upBranch); c1->Suffix = REF(c); SetSuccessor(ps[--numPs], REF(c1)); c = c1; @@ -415,21 +557,26 @@ static CTX_PTR CreateSuccessors(CPpmd7 *p, BoolInt skip) return c; } -static void SwapStates(CPpmd_State *t1, CPpmd_State *t2) -{ - CPpmd_State tmp = *t1; - *t1 = *t2; - *t2 = tmp; -} -static void UpdateModel(CPpmd7 *p) + +#define SWAP_STATES(s) \ + { CPpmd_State tmp = s[0]; s[0] = s[-1]; s[-1] = tmp; } + + +void Ppmd7_UpdateModel(CPpmd7 *p); +Z7_NO_INLINE +void Ppmd7_UpdateModel(CPpmd7 *p) { - CPpmd_Void_Ref successor, fSuccessor = SUCCESSOR(p->FoundState); - CTX_PTR c; + CPpmd_Void_Ref maxSuccessor, minSuccessor; + PPMD7_CTX_PTR c, mc; unsigned s0, ns; - + + + if (p->FoundState->Freq < MAX_FREQ / 4 && p->MinContext->Suffix != 0) { + /* Update Freqs in Suffix Context */ + c = SUFFIX(p->MinContext); if (c->NumStats == 1) @@ -441,166 +588,273 @@ static void UpdateModel(CPpmd7 *p) else { CPpmd_State *s = STATS(c); - if (s->Symbol != p->FoundState->Symbol) + Byte sym = p->FoundState->Symbol; + + if (s->Symbol != sym) { - do { s++; } while (s->Symbol != p->FoundState->Symbol); + do + { + // s++; if (s->Symbol == sym) break; + s++; + } + while (s->Symbol != sym); + if (s[0].Freq >= s[-1].Freq) { - SwapStates(&s[0], &s[-1]); + SWAP_STATES(s) s--; } } + if (s->Freq < MAX_FREQ - 9) { - s->Freq += 2; - c->SummFreq += 2; + s->Freq = (Byte)(s->Freq + 2); + c->Union2.SummFreq = (UInt16)(c->Union2.SummFreq + 2); } } } + if (p->OrderFall == 0) { - p->MinContext = p->MaxContext = CreateSuccessors(p, True); - if (p->MinContext == 0) + /* MAX ORDER context */ + /* (FoundState->Successor) is RAW-Successor. */ + p->MaxContext = p->MinContext = Ppmd7_CreateSuccessors(p); + if (!p->MinContext) { - RestartModel(p); + Ppmd7_RestartModel(p); return; } SetSuccessor(p->FoundState, REF(p->MinContext)); return; } + + + /* NON-MAX ORDER context */ - *p->Text++ = p->FoundState->Symbol; - successor = REF(p->Text); - if (p->Text >= p->UnitsStart) { - RestartModel(p); - return; + Byte *text = p->Text; + *text++ = p->FoundState->Symbol; + p->Text = text; + if (text >= p->UnitsStart) + { + Ppmd7_RestartModel(p); + return; + } + maxSuccessor = REF(text); } - if (fSuccessor) + minSuccessor = SUCCESSOR(p->FoundState); + + if (minSuccessor) { - if (fSuccessor <= successor) + // there is Successor for FoundState in MinContext. + // So the next context will be one order higher than MinContext. + + if (minSuccessor <= maxSuccessor) { - CTX_PTR cs = CreateSuccessors(p, False); - if (cs == NULL) + // minSuccessor is RAW-Successor. 
So we will create real contexts records: + PPMD7_CTX_PTR cs = Ppmd7_CreateSuccessors(p); + if (!cs) { - RestartModel(p); + Ppmd7_RestartModel(p); return; } - fSuccessor = REF(cs); + minSuccessor = REF(cs); } + + // minSuccessor now is real Context pointer that points to existing (Order+1) context + if (--p->OrderFall == 0) { - successor = fSuccessor; + /* + if we move to MaxOrder context, then minSuccessor will be common Succesor for both: + MinContext that is (MaxOrder - 1) + MaxContext that is (MaxOrder) + so we don't need new RAW-Successor, and we can use real minSuccessor + as succssors for both MinContext and MaxContext. + */ + maxSuccessor = minSuccessor; + + /* + if (MaxContext != MinContext) + { + there was order fall from MaxOrder and we don't need current symbol + to transfer some RAW-Succesors to real contexts. + So we roll back pointer in raw data for one position. + } + */ p->Text -= (p->MaxContext != p->MinContext); } } else { - SetSuccessor(p->FoundState, successor); - fSuccessor = REF(p->MinContext); + /* + FoundState has NULL-Successor here. + And only root 0-order context can contain NULL-Successors. + We change Successor in FoundState to RAW-Successor, + And next context will be same 0-order root Context. + */ + SetSuccessor(p->FoundState, maxSuccessor); + minSuccessor = REF(p->MinContext); } - - s0 = p->MinContext->SummFreq - (ns = p->MinContext->NumStats) - (p->FoundState->Freq - 1); - - for (c = p->MaxContext; c != p->MinContext; c = SUFFIX(c)) + + mc = p->MinContext; + c = p->MaxContext; + + p->MaxContext = p->MinContext = CTX(minSuccessor); + + if (c == mc) + return; + + // s0 : is pure Escape Freq + s0 = mc->Union2.SummFreq - (ns = mc->NumStats) - ((unsigned)p->FoundState->Freq - 1); + + do { unsigned ns1; - UInt32 cf, sf; + UInt32 sum; + if ((ns1 = c->NumStats) != 1) { if ((ns1 & 1) == 0) { /* Expand for one UNIT */ - unsigned oldNU = ns1 >> 1; - unsigned i = U2I(oldNU); + const unsigned oldNU = ns1 >> 1; + const unsigned i = U2I(oldNU); if (i != U2I((size_t)oldNU + 1)) { - void *ptr = AllocUnits(p, i + 1); + void *ptr = Ppmd7_AllocUnits(p, i + 1); void *oldPtr; if (!ptr) { - RestartModel(p); + Ppmd7_RestartModel(p); return; } oldPtr = STATS(c); - MyMem12Cpy(ptr, oldPtr, oldNU); - InsertNode(p, oldPtr, i); - c->Stats = STATS_REF(ptr); + MEM_12_CPY(ptr, oldPtr, oldNU) + Ppmd7_InsertNode(p, oldPtr, i); + c->Union4.Stats = STATS_REF(ptr); } } - c->SummFreq = (UInt16)(c->SummFreq + (2 * ns1 < ns) + 2 * ((4 * ns1 <= ns) & (c->SummFreq <= 8 * ns1))); + sum = c->Union2.SummFreq; + /* max increase of Escape_Freq is 3 here. + total increase of Union2.SummFreq for all symbols is less than 256 here */ + sum += (UInt32)(unsigned)((2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1))); + /* original PPMdH uses 16-bit variable for (sum) here. + But (sum < 0x9000). 
So we don't truncate (sum) to 16-bit */ + // sum = (UInt16)sum; } else { - CPpmd_State *s = (CPpmd_State*)AllocUnits(p, 0); + // instead of One-symbol context we create 2-symbol context + CPpmd_State *s = (CPpmd_State*)Ppmd7_AllocUnits(p, 0); if (!s) { - RestartModel(p); + Ppmd7_RestartModel(p); return; } - *s = *ONE_STATE(c); - c->Stats = REF(s); - if (s->Freq < MAX_FREQ / 4 - 1) - s->Freq <<= 1; - else - s->Freq = MAX_FREQ - 4; - c->SummFreq = (UInt16)(s->Freq + p->InitEsc + (ns > 3)); - } - cf = 2 * (UInt32)p->FoundState->Freq * (c->SummFreq + 6); - sf = (UInt32)s0 + c->SummFreq; - if (cf < 6 * sf) - { - cf = 1 + (cf > sf) + (cf >= 4 * sf); - c->SummFreq += 3; - } - else - { - cf = 4 + (cf >= 9 * sf) + (cf >= 12 * sf) + (cf >= 15 * sf); - c->SummFreq = (UInt16)(c->SummFreq + cf); + { + unsigned freq = c->Union2.State2.Freq; + // s = *ONE_STATE(c); + s->Symbol = c->Union2.State2.Symbol; + s->Successor_0 = c->Union4.State4.Successor_0; + s->Successor_1 = c->Union4.State4.Successor_1; + // SetSuccessor(s, c->Union4.Stats); // call it only for debug purposes to check the order of + // (Successor_0 and Successor_1) in LE/BE. + c->Union4.Stats = REF(s); + if (freq < MAX_FREQ / 4 - 1) + freq <<= 1; + else + freq = MAX_FREQ - 4; + // (max(s->freq) == 120), when we convert from 1-symbol into 2-symbol context + s->Freq = (Byte)freq; + // max(InitEsc = PPMD7_kExpEscape[*]) is 25. So the max(escapeFreq) is 26 here + sum = (UInt32)(freq + p->InitEsc + (ns > 3)); + } } + { CPpmd_State *s = STATS(c) + ns1; - SetSuccessor(s, successor); + UInt32 cf = 2 * (sum + 6) * (UInt32)p->FoundState->Freq; + UInt32 sf = (UInt32)s0 + sum; s->Symbol = p->FoundState->Symbol; - s->Freq = (Byte)cf; c->NumStats = (UInt16)(ns1 + 1); + SetSuccessor(s, maxSuccessor); + + if (cf < 6 * sf) + { + cf = (UInt32)1 + (cf > sf) + (cf >= 4 * sf); + sum += 3; + /* It can add (0, 1, 2) to Escape_Freq */ + } + else + { + cf = (UInt32)4 + (cf >= 9 * sf) + (cf >= 12 * sf) + (cf >= 15 * sf); + sum += cf; + } + + c->Union2.SummFreq = (UInt16)sum; + s->Freq = (Byte)cf; } + c = SUFFIX(c); } - p->MaxContext = p->MinContext = CTX(fSuccessor); + while (c != mc); } -static void Rescale(CPpmd7 *p) + + +Z7_NO_INLINE +static void Ppmd7_Rescale(CPpmd7 *p) { unsigned i, adder, sumFreq, escFreq; CPpmd_State *stats = STATS(p->MinContext); CPpmd_State *s = p->FoundState; + + /* Sort the list by Freq */ + if (s != stats) { CPpmd_State tmp = *s; - for (; s != stats; s--) + do s[0] = s[-1]; + while (--s != stats); *s = tmp; } - escFreq = p->MinContext->SummFreq - s->Freq; - s->Freq += 4; - adder = (p->OrderFall != 0); - s->Freq = (Byte)((s->Freq + adder) >> 1); + sumFreq = s->Freq; + escFreq = p->MinContext->Union2.SummFreq - sumFreq; + + /* + if (p->OrderFall == 0), adder = 0 : it's allowed to remove symbol from MAX Order context + if (p->OrderFall != 0), adder = 1 : it's NOT allowed to remove symbol from NON-MAX Order context + */ + + adder = (p->OrderFall != 0); + + #ifdef PPMD7_ORDER_0_SUPPPORT + adder |= (p->MaxOrder == 0); // we don't remove symbols from order-0 context + #endif + + sumFreq = (sumFreq + 4 + adder) >> 1; + i = (unsigned)p->MinContext->NumStats - 1; + s->Freq = (Byte)sumFreq; - i = p->MinContext->NumStats - 1; do { - escFreq -= (++s)->Freq; - s->Freq = (Byte)((s->Freq + adder) >> 1); - sumFreq += s->Freq; - if (s[0].Freq > s[-1].Freq) + unsigned freq = (++s)->Freq; + escFreq -= freq; + freq = (freq + adder) >> 1; + sumFreq += freq; + s->Freq = (Byte)freq; + if (freq > s[-1].Freq) { + CPpmd_State tmp = *s; CPpmd_State *s1 = s; - 
CPpmd_State tmp = *s1; do + { s1[0] = s1[-1]; - while (--s1 != stats && tmp.Freq > s1[-1].Freq); + } + while (--s1 != stats && freq > s1[-1].Freq); *s1 = tmp; } } @@ -608,48 +862,90 @@ static void Rescale(CPpmd7 *p) if (s->Freq == 0) { - unsigned numStats = p->MinContext->NumStats; - unsigned n0, n1; - do { i++; } while ((--s)->Freq == 0); + /* Remove all items with Freq == 0 */ + CPpmd7_Context *mc; + unsigned numStats, numStatsNew, n0, n1; + + i = 0; do { i++; } while ((--s)->Freq == 0); + + /* We increase (escFreq) for the number of removed symbols. + So we will have (0.5) increase for Escape_Freq in avarage per + removed symbol after Escape_Freq halving */ escFreq += i; - p->MinContext->NumStats = (UInt16)(p->MinContext->NumStats - i); - if (p->MinContext->NumStats == 1) + mc = p->MinContext; + numStats = mc->NumStats; + numStatsNew = numStats - i; + mc->NumStats = (UInt16)(numStatsNew); + n0 = (numStats + 1) >> 1; + + if (numStatsNew == 1) { - CPpmd_State tmp = *stats; + /* Create Single-Symbol context */ + unsigned freq = stats->Freq; + do { - tmp.Freq = (Byte)(tmp.Freq - (tmp.Freq >> 1)); escFreq >>= 1; + freq = (freq + 1) >> 1; } while (escFreq > 1); - InsertNode(p, stats, U2I(((numStats + 1) >> 1))); - *(p->FoundState = ONE_STATE(p->MinContext)) = tmp; + + s = ONE_STATE(mc); + *s = *stats; + s->Freq = (Byte)freq; // (freq <= 260 / 4) + p->FoundState = s; + Ppmd7_InsertNode(p, stats, U2I(n0)); return; } - n0 = (numStats + 1) >> 1; - n1 = (p->MinContext->NumStats + 1) >> 1; + + n1 = (numStatsNew + 1) >> 1; if (n0 != n1) - p->MinContext->Stats = STATS_REF(ShrinkUnits(p, stats, n0, n1)); + { + // p->MinContext->Union4.Stats = STATS_REF(ShrinkUnits(p, stats, n0, n1)); + unsigned i0 = U2I(n0); + unsigned i1 = U2I(n1); + if (i0 != i1) + { + if (p->FreeList[i1] != 0) + { + void *ptr = Ppmd7_RemoveNode(p, i1); + p->MinContext->Union4.Stats = STATS_REF(ptr); + MEM_12_CPY(ptr, (const void *)stats, n1) + Ppmd7_InsertNode(p, stats, i0); + } + else + Ppmd7_SplitBlock(p, stats, i0, i1); + } + } + } + { + CPpmd7_Context *mc = p->MinContext; + mc->Union2.SummFreq = (UInt16)(sumFreq + escFreq - (escFreq >> 1)); + // Escape_Freq halving here + p->FoundState = STATS(mc); } - p->MinContext->SummFreq = (UInt16)(sumFreq + escFreq - (escFreq >> 1)); - p->FoundState = STATS(p->MinContext); } + CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *escFreq) { CPpmd_See *see; - unsigned nonMasked = p->MinContext->NumStats - numMasked; - if (p->MinContext->NumStats != 256) + const CPpmd7_Context *mc = p->MinContext; + unsigned numStats = mc->NumStats; + if (numStats != 256) { - see = p->See[(unsigned)p->NS2Indx[(size_t)nonMasked - 1]] + - (nonMasked < (unsigned)SUFFIX(p->MinContext)->NumStats - p->MinContext->NumStats) + - 2 * (unsigned)(p->MinContext->SummFreq < 11 * p->MinContext->NumStats) + - 4 * (unsigned)(numMasked > nonMasked) + + unsigned nonMasked = numStats - numMasked; + see = p->See[(unsigned)p->NS2Indx[(size_t)nonMasked - 1]] + + (nonMasked < (unsigned)SUFFIX(mc)->NumStats - numStats) + + 2 * (unsigned)(mc->Union2.SummFreq < 11 * numStats) + + 4 * (unsigned)(numMasked > nonMasked) + p->HiBitsFlag; { - unsigned r = (see->Summ >> see->Shift); - see->Summ = (UInt16)(see->Summ - r); - *escFreq = r + (r == 0); + // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ + const unsigned summ = (UInt16)see->Summ; // & 0xFFFF + const unsigned r = (summ >> see->Shift); + see->Summ = (UInt16)(summ - r); + *escFreq = (UInt32)(r + (r == 0)); } } else @@ -660,53 +956,176 @@ 
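/*
 * Sketch (not part of the upstream patch) of the SEE escape estimator used in
 * Ppmd7_MakeEscFreq() above: the counter keeps a scaled sum (Summ) and a
 * shift; each query returns roughly the mean value (Summ >> Shift), subtracts
 * that estimate back out of the sum, and never reports zero.  Initial values
 * below are illustrative only.
 */
#include <stdio.h>

typedef struct
{
  unsigned short Summ;   /* scaled running sum of recent escape frequencies */
  unsigned char  Shift;  /* log2 of the approximate number of samples */
} See;

static unsigned see_query(See *s)
{
  unsigned summ = s->Summ;
  unsigned r = summ >> s->Shift;       /* current mean estimate */
  s->Summ = (unsigned short)(summ - r);
  return r + (r == 0);                 /* escape frequency is at least 1 */
}

int main(void)
{
  See s = { 10 << 3, 3 };              /* sum of 80 with Shift == 3 */
  printf("%u %u %u\n", see_query(&s), see_query(&s), see_query(&s));
  return 0;
}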
CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *escFreq) return see; } -static void NextContext(CPpmd7 *p) + +static void Ppmd7_NextContext(CPpmd7 *p) { - CTX_PTR c = CTX(SUCCESSOR(p->FoundState)); - if (p->OrderFall == 0 && (Byte *)c > p->Text) - p->MinContext = p->MaxContext = c; + PPMD7_CTX_PTR c = CTX(SUCCESSOR(p->FoundState)); + if (p->OrderFall == 0 && (const Byte *)c > p->Text) + p->MaxContext = p->MinContext = c; else - UpdateModel(p); + Ppmd7_UpdateModel(p); } + void Ppmd7_Update1(CPpmd7 *p) { CPpmd_State *s = p->FoundState; - s->Freq += 4; - p->MinContext->SummFreq += 4; - if (s[0].Freq > s[-1].Freq) + unsigned freq = s->Freq; + freq += 4; + p->MinContext->Union2.SummFreq = (UInt16)(p->MinContext->Union2.SummFreq + 4); + s->Freq = (Byte)freq; + if (freq > s[-1].Freq) { - SwapStates(&s[0], &s[-1]); + SWAP_STATES(s) p->FoundState = --s; - if (s->Freq > MAX_FREQ) - Rescale(p); + if (freq > MAX_FREQ) + Ppmd7_Rescale(p); } - NextContext(p); + Ppmd7_NextContext(p); } + void Ppmd7_Update1_0(CPpmd7 *p) { - p->PrevSuccess = (2 * p->FoundState->Freq > p->MinContext->SummFreq); - p->RunLength += p->PrevSuccess; - p->MinContext->SummFreq += 4; - if ((p->FoundState->Freq += 4) > MAX_FREQ) - Rescale(p); - NextContext(p); + CPpmd_State *s = p->FoundState; + CPpmd7_Context *mc = p->MinContext; + unsigned freq = s->Freq; + const unsigned summFreq = mc->Union2.SummFreq; + p->PrevSuccess = (2 * freq > summFreq); + p->RunLength += (Int32)p->PrevSuccess; + mc->Union2.SummFreq = (UInt16)(summFreq + 4); + freq += 4; + s->Freq = (Byte)freq; + if (freq > MAX_FREQ) + Ppmd7_Rescale(p); + Ppmd7_NextContext(p); } + +/* void Ppmd7_UpdateBin(CPpmd7 *p) { - p->FoundState->Freq = (Byte)(p->FoundState->Freq + (p->FoundState->Freq < 128 ? 1: 0)); + unsigned freq = p->FoundState->Freq; + p->FoundState->Freq = (Byte)(freq + (freq < 128)); p->PrevSuccess = 1; p->RunLength++; - NextContext(p); + Ppmd7_NextContext(p); } +*/ void Ppmd7_Update2(CPpmd7 *p) { - p->MinContext->SummFreq += 4; - if ((p->FoundState->Freq += 4) > MAX_FREQ) - Rescale(p); + CPpmd_State *s = p->FoundState; + unsigned freq = s->Freq; + freq += 4; p->RunLength = p->InitRL; - UpdateModel(p); + p->MinContext->Union2.SummFreq = (UInt16)(p->MinContext->Union2.SummFreq + 4); + s->Freq = (Byte)freq; + if (freq > MAX_FREQ) + Ppmd7_Rescale(p); + Ppmd7_UpdateModel(p); +} + + + +/* +PPMd Memory Map: +{ + [ 0 ] contains subset of original raw text, that is required to create context + records, Some symbols are not written, when max order context was reached + [ Text ] free area + [ UnitsStart ] CPpmd_State vectors and CPpmd7_Context records + [ LoUnit ] free area for CPpmd_State and CPpmd7_Context items +[ HiUnit ] CPpmd7_Context records + [ Size ] end of array } + +These addresses don't cross at any time. +And the following condtions is true for addresses: + (0 <= Text < UnitsStart <= LoUnit <= HiUnit <= Size) + +Raw text is BYTE--aligned. +the data in block [ UnitsStart ... Size ] contains 12-bytes aligned UNITs. + +Last UNIT of array at offset (Size - 12) is root order-0 CPpmd7_Context record. +The code can free UNITs memory blocks that were allocated to store CPpmd_State vectors. +The code doesn't free UNITs allocated for CPpmd7_Context records. + +The code calls Ppmd7_RestartModel(), when there is no free memory for allocation. +And Ppmd7_RestartModel() changes the state to orignal start state, with full free block. 
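/*
 * Sketch (not part of the upstream patch) of the self-organizing frequency
 * update in Ppmd7_Update1() above: the found symbol's frequency grows by 4
 * and its entry bubbles one slot toward the front of the stats array whenever
 * it overtakes its left neighbour, keeping the array roughly sorted by Freq.
 * The SummFreq bookkeeping and the rescale at MAX_FREQ are omitted here.
 */
#include <assert.h>

typedef struct { unsigned char Symbol; unsigned char Freq; } State;

/* Update the entry at index i; returns its (possibly new) index. */
static unsigned update1(State *stats, unsigned i)
{
  stats[i].Freq = (unsigned char)(stats[i].Freq + 4);
  if (i > 0 && stats[i].Freq > stats[i - 1].Freq)
  {
    State tmp = stats[i];              /* swap with the left neighbour */
    stats[i] = stats[i - 1];
    stats[i - 1] = tmp;
    return i - 1;
  }
  return i;
}

int main(void)
{
  State stats[3] = { { 'a', 10 }, { 'b', 8 }, { 'c', 2 } };
  unsigned i = update1(stats, 1);      /* 'b' becomes 12 > 10, moves forward */
  assert(i == 0 && stats[0].Symbol == 'b' && stats[0].Freq == 12);
  return 0;
}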
+ + +The code allocates UNITs with the following order: + +Allocation of 1 UNIT for Context record + - from free space (HiUnit) down to (LoUnit) + - from FreeList[0] + - Ppmd7_AllocUnitsRare() + +Ppmd7_AllocUnits() for CPpmd_State vectors: + - from FreeList[i] + - from free space (LoUnit) up to (HiUnit) + - Ppmd7_AllocUnitsRare() + +Ppmd7_AllocUnitsRare() + - if (GlueCount == 0) + { Glue lists, GlueCount = 255, allocate from FreeList[i]] } + - loop for all higher sized FreeList[...] lists + - from (UnitsStart - Text), GlueCount-- + - ERROR + + +Each Record with Context contains the CPpmd_State vector, where each +CPpmd_State contains the link to Successor. +There are 3 types of Successor: + 1) NULL-Successor - NULL pointer. NULL-Successor links can be stored + only in 0-order Root Context Record. + We use 0 value as NULL-Successor + 2) RAW-Successor - the link to position in raw text, + that "RAW-Successor" is being created after first + occurrence of new symbol for some existing context record. + (RAW-Successor > 0). + 3) RECORD-Successor - the link to CPpmd7_Context record of (Order+1), + that record is being created when we go via RAW-Successor again. + +For any successors at any time: the following condtions are true for Successor links: +(NULL-Successor < RAW-Successor < UnitsStart <= RECORD-Successor) + + +---------- Symbol Frequency, SummFreq and Range in Range_Coder ---------- + +CPpmd7_Context::SummFreq = Sum(Stats[].Freq) + Escape_Freq + +The PPMd code tries to fulfill the condition: + (SummFreq <= (256 * 128 = RC::kBot)) + +We have (Sum(Stats[].Freq) <= 256 * 124), because of (MAX_FREQ = 124) +So (4 = 128 - 124) is average reserve for Escape_Freq for each symbol. +If (CPpmd_State::Freq) is not aligned for 4, the reserve can be 5, 6 or 7. +SummFreq and Escape_Freq can be changed in Ppmd7_Rescale() and *Update*() functions. +Ppmd7_Rescale() can remove symbols only from max-order contexts. So Escape_Freq can increase after multiple calls of Ppmd7_Rescale() for +max-order context. + +When the PPMd code still break (Total <= RC::Range) condition in range coder, +we have two ways to resolve that problem: + 1) we can report error, if we want to keep compatibility with original PPMd code that has no fix for such cases. + 2) we can reduce (Total) value to (RC::Range) by reducing (Escape_Freq) part of (Total) value. +*/ + +#undef MAX_FREQ +#undef UNIT_SIZE +#undef U2B +#undef U2I +#undef I2U +#undef I2U_UInt16 +#undef REF +#undef STATS_REF +#undef CTX +#undef STATS +#undef ONE_STATE +#undef SUFFIX +#undef NODE +#undef EMPTY_NODE +#undef MEM_12_CPY +#undef SUCCESSOR +#undef SWAP_STATES diff --git a/src/sdk/C/Ppmd7.h b/src/sdk/C/Ppmd7.h index 610539a..d9eb326 100644 --- a/src/sdk/C/Ppmd7.h +++ b/src/sdk/C/Ppmd7.h @@ -1,13 +1,11 @@ -/* Ppmd7.h -- PPMdH compression codec -2018-07-04 : Igor Pavlov : Public domain -This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ - -/* This code supports virtual RangeDecoder and includes the implementation -of RangeCoder from 7z, instead of RangeCoder from original PPMd var.H. 
-If you need the compatibility with original PPMd var.H, you can use external RangeDecoder */ +/* Ppmd7.h -- Ppmd7 (PPMdH) compression codec +2023-04-02 : Igor Pavlov : Public domain +This code is based on: + PPMd var.H (2001): Dmitry Shkarin : Public domain */ + -#ifndef __PPMD7_H -#define __PPMD7_H +#ifndef ZIP7_INC_PPMD7_H +#define ZIP7_INC_PPMD7_H #include "Ppmd.h" @@ -21,23 +19,56 @@ EXTERN_C_BEGIN struct CPpmd7_Context_; -typedef - #ifdef PPMD_32BIT - struct CPpmd7_Context_ * - #else - UInt32 - #endif - CPpmd7_Context_Ref; +typedef Ppmd_Ref_Type(struct CPpmd7_Context_) CPpmd7_Context_Ref; + +// MY_CPU_pragma_pack_push_1 typedef struct CPpmd7_Context_ { UInt16 NumStats; - UInt16 SummFreq; - CPpmd_State_Ref Stats; + + + union + { + UInt16 SummFreq; + CPpmd_State2 State2; + } Union2; + + union + { + CPpmd_State_Ref Stats; + CPpmd_State4 State4; + } Union4; + CPpmd7_Context_Ref Suffix; } CPpmd7_Context; -#define Ppmd7Context_OneState(p) ((CPpmd_State *)&(p)->SummFreq) +// MY_CPU_pragma_pop + +#define Ppmd7Context_OneState(p) ((CPpmd_State *)&(p)->Union2) + + + + +typedef struct +{ + UInt32 Range; + UInt32 Code; + UInt32 Low; + IByteInPtr Stream; +} CPpmd7_RangeDec; + + +typedef struct +{ + UInt32 Range; + Byte Cache; + // Byte _dummy_[3]; + UInt64 Low; + UInt64 CacheSize; + IByteOutPtr Stream; +} CPpmd7z_RangeEnc; + typedef struct { @@ -48,17 +79,30 @@ typedef struct UInt32 Size; UInt32 GlueCount; - Byte *Base, *LoUnit, *HiUnit, *Text, *UnitsStart; UInt32 AlignOffset; + Byte *Base, *LoUnit, *HiUnit, *Text, *UnitsStart; - Byte Indx2Units[PPMD_NUM_INDEXES]; + + + + union + { + CPpmd7_RangeDec dec; + CPpmd7z_RangeEnc enc; + } rc; + + Byte Indx2Units[PPMD_NUM_INDEXES + 2]; // +2 for alignment Byte Units2Indx[128]; CPpmd_Void_Ref FreeList[PPMD_NUM_INDEXES]; - Byte NS2Indx[256], NS2BSIndx[256], HB2Flag[256]; + + Byte NS2BSIndx[256], NS2Indx[256]; + Byte ExpEscape[16]; CPpmd_See DummySee, See[25][16]; UInt16 BinSumm[128][64]; + // int LastSymbol; } CPpmd7; + void Ppmd7_Construct(CPpmd7 *p); BoolInt Ppmd7_Alloc(CPpmd7 *p, UInt32 size, ISzAllocPtr alloc); void Ppmd7_Free(CPpmd7 *p, ISzAllocPtr alloc); @@ -68,74 +112,69 @@ void Ppmd7_Init(CPpmd7 *p, unsigned maxOrder); /* ---------- Internal Functions ---------- */ -extern const Byte PPMD7_kExpEscape[16]; - -#ifdef PPMD_32BIT - #define Ppmd7_GetPtr(p, ptr) (ptr) - #define Ppmd7_GetContext(p, ptr) (ptr) - #define Ppmd7_GetStats(p, ctx) ((ctx)->Stats) -#else - #define Ppmd7_GetPtr(p, offs) ((void *)((p)->Base + (offs))) - #define Ppmd7_GetContext(p, offs) ((CPpmd7_Context *)Ppmd7_GetPtr((p), (offs))) - #define Ppmd7_GetStats(p, ctx) ((CPpmd_State *)Ppmd7_GetPtr((p), ((ctx)->Stats))) -#endif +#define Ppmd7_GetPtr(p, ptr) Ppmd_GetPtr(p, ptr) +#define Ppmd7_GetContext(p, ptr) Ppmd_GetPtr_Type(p, ptr, CPpmd7_Context) +#define Ppmd7_GetStats(p, ctx) Ppmd_GetPtr_Type(p, (ctx)->Union4.Stats, CPpmd_State) void Ppmd7_Update1(CPpmd7 *p); void Ppmd7_Update1_0(CPpmd7 *p); void Ppmd7_Update2(CPpmd7 *p); -void Ppmd7_UpdateBin(CPpmd7 *p); + +#define PPMD7_HiBitsFlag_3(sym) ((((unsigned)sym + 0xC0) >> (8 - 3)) & (1 << 3)) +#define PPMD7_HiBitsFlag_4(sym) ((((unsigned)sym + 0xC0) >> (8 - 4)) & (1 << 4)) +// #define PPMD7_HiBitsFlag_3(sym) ((sym) < 0x40 ? 0 : (1 << 3)) +// #define PPMD7_HiBitsFlag_4(sym) ((sym) < 0x40 ? 
0 : (1 << 4)) #define Ppmd7_GetBinSumm(p) \ - &p->BinSumm[(size_t)(unsigned)Ppmd7Context_OneState(p->MinContext)->Freq - 1][p->PrevSuccess + \ - p->NS2BSIndx[(size_t)Ppmd7_GetContext(p, p->MinContext->Suffix)->NumStats - 1] + \ - (p->HiBitsFlag = p->HB2Flag[p->FoundState->Symbol]) + \ - 2 * p->HB2Flag[(unsigned)Ppmd7Context_OneState(p->MinContext)->Symbol] + \ - ((p->RunLength >> 26) & 0x20)] + &p->BinSumm[(size_t)(unsigned)Ppmd7Context_OneState(p->MinContext)->Freq - 1] \ + [ p->PrevSuccess + ((p->RunLength >> 26) & 0x20) \ + + p->NS2BSIndx[(size_t)Ppmd7_GetContext(p, p->MinContext->Suffix)->NumStats - 1] \ + + PPMD7_HiBitsFlag_4(Ppmd7Context_OneState(p->MinContext)->Symbol) \ + + (p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol)) ] CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *scale); +/* +We support two versions of Ppmd7 (PPMdH) methods that use same CPpmd7 structure: + 1) Ppmd7a_*: original PPMdH + 2) Ppmd7z_*: modified PPMdH with 7z Range Coder +Ppmd7_*: the structures and functions that are common for both versions of PPMd7 (PPMdH) +*/ + /* ---------- Decode ---------- */ -typedef struct IPpmd7_RangeDec IPpmd7_RangeDec; +#define PPMD7_SYM_END (-1) +#define PPMD7_SYM_ERROR (-2) -struct IPpmd7_RangeDec -{ - UInt32 (*GetThreshold)(const IPpmd7_RangeDec *p, UInt32 total); - void (*Decode)(const IPpmd7_RangeDec *p, UInt32 start, UInt32 size); - UInt32 (*DecodeBit)(const IPpmd7_RangeDec *p, UInt32 size0); -}; +/* +You must set (CPpmd7::rc.dec.Stream) before Ppmd7*_RangeDec_Init() -typedef struct -{ - IPpmd7_RangeDec vt; - UInt32 Range; - UInt32 Code; - IByteIn *Stream; -} CPpmd7z_RangeDec; +Ppmd7*_DecodeSymbol() +out: + >= 0 : decoded byte + -1 : PPMD7_SYM_END : End of payload marker + -2 : PPMD7_SYM_ERROR : Data error +*/ -void Ppmd7z_RangeDec_CreateVTable(CPpmd7z_RangeDec *p); -BoolInt Ppmd7z_RangeDec_Init(CPpmd7z_RangeDec *p); -#define Ppmd7z_RangeDec_IsFinishedOK(p) ((p)->Code == 0) +/* Ppmd7a_* : original PPMdH */ +BoolInt Ppmd7a_RangeDec_Init(CPpmd7_RangeDec *p); +#define Ppmd7a_RangeDec_IsFinishedOK(p) ((p)->Code == 0) +int Ppmd7a_DecodeSymbol(CPpmd7 *p); -int Ppmd7_DecodeSymbol(CPpmd7 *p, const IPpmd7_RangeDec *rc); +/* Ppmd7z_* : modified PPMdH with 7z Range Coder */ +BoolInt Ppmd7z_RangeDec_Init(CPpmd7_RangeDec *p); +#define Ppmd7z_RangeDec_IsFinishedOK(p) ((p)->Code == 0) +int Ppmd7z_DecodeSymbol(CPpmd7 *p); +// Byte *Ppmd7z_DecodeSymbols(CPpmd7 *p, Byte *buf, const Byte *lim); /* ---------- Encode ---------- */ -typedef struct -{ - UInt64 Low; - UInt32 Range; - Byte Cache; - UInt64 CacheSize; - IByteOut *Stream; -} CPpmd7z_RangeEnc; - -void Ppmd7z_RangeEnc_Init(CPpmd7z_RangeEnc *p); -void Ppmd7z_RangeEnc_FlushData(CPpmd7z_RangeEnc *p); - -void Ppmd7_EncodeSymbol(CPpmd7 *p, CPpmd7z_RangeEnc *rc, int symbol); +void Ppmd7z_Init_RangeEnc(CPpmd7 *p); +void Ppmd7z_Flush_RangeEnc(CPpmd7 *p); +// void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol); +void Ppmd7z_EncodeSymbols(CPpmd7 *p, const Byte *buf, const Byte *lim); EXTERN_C_END diff --git a/src/sdk/C/Ppmd7Dec.c b/src/sdk/C/Ppmd7Dec.c index 311e9f9..081ab89 100644 --- a/src/sdk/C/Ppmd7Dec.c +++ b/src/sdk/C/Ppmd7Dec.c @@ -1,191 +1,312 @@ -/* Ppmd7Dec.c -- PPMdH Decoder -2018-07-04 : Igor Pavlov : Public domain -This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ +/* Ppmd7Dec.c -- Ppmd7z (PPMdH with 7z Range Coder) Decoder +2023-09-07 : Igor Pavlov : Public domain +This code is based on: + PPMd var.H (2001): Dmitry Shkarin : Public domain */ + #include "Precomp.h" #include "Ppmd7.h" 
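For orientation, a decode loop over the reworked Ppmd7.h interface above looks roughly like the sketch below. It only illustrates the declarations shown here: byteIn is assumed to be a caller-supplied IByteInPtr positioned at the start of the PPMd stream, memSize and maxOrder are placeholder parameters, and g_Alloc from Alloc.h is assumed to be available in the build.

#include "Ppmd7.h"
#include "Alloc.h"   /* for g_Alloc (assumed to be part of the build) */

static SRes DecodePpmd7z(IByteInPtr byteIn, Byte *out, size_t outSize,
    UInt32 memSize, unsigned maxOrder)
{
  SRes res = SZ_OK;
  size_t i;
  CPpmd7 ppmd;
  Ppmd7_Construct(&ppmd);
  if (!Ppmd7_Alloc(&ppmd, memSize, &g_Alloc))
    return SZ_ERROR_MEM;
  Ppmd7_Init(&ppmd, maxOrder);
  ppmd.rc.dec.Stream = byteIn;   /* must be set before the range decoder is initialized */
  if (!Ppmd7z_RangeDec_Init(&ppmd.rc.dec))
    res = SZ_ERROR_DATA;
  else
  {
    for (i = 0; i < outSize; i++)
    {
      const int sym = Ppmd7z_DecodeSymbol(&ppmd);
      if (sym < 0)
      {
        /* -1 : PPMD7_SYM_END, -2 : PPMD7_SYM_ERROR */
        if (sym == PPMD7_SYM_ERROR)
          res = SZ_ERROR_DATA;
        break;
      }
      out[i] = (Byte)sym;
    }
    /* after an end marker, Ppmd7z_RangeDec_IsFinishedOK(&ppmd.rc.dec) can be checked as well */
  }
  Ppmd7_Free(&ppmd, &g_Alloc);
  return res;
}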
-#define kTopValue (1 << 24) +#define kTopValue ((UInt32)1 << 24) + + +#define READ_BYTE(p) IByteIn_Read((p)->Stream) -BoolInt Ppmd7z_RangeDec_Init(CPpmd7z_RangeDec *p) +BoolInt Ppmd7z_RangeDec_Init(CPpmd7_RangeDec *p) { unsigned i; p->Code = 0; p->Range = 0xFFFFFFFF; - if (IByteIn_Read(p->Stream) != 0) + if (READ_BYTE(p) != 0) return False; for (i = 0; i < 4; i++) - p->Code = (p->Code << 8) | IByteIn_Read(p->Stream); + p->Code = (p->Code << 8) | READ_BYTE(p); return (p->Code < 0xFFFFFFFF); } -#define GET_Ppmd7z_RangeDec CPpmd7z_RangeDec *p = CONTAINER_FROM_VTBL(pp, CPpmd7z_RangeDec, vt); - -static UInt32 Range_GetThreshold(const IPpmd7_RangeDec *pp, UInt32 total) -{ - GET_Ppmd7z_RangeDec - return p->Code / (p->Range /= total); -} +#define RC_NORM_BASE(p) if ((p)->Range < kTopValue) \ + { (p)->Code = ((p)->Code << 8) | READ_BYTE(p); (p)->Range <<= 8; -static void Range_Normalize(CPpmd7z_RangeDec *p) -{ - if (p->Range < kTopValue) - { - p->Code = (p->Code << 8) | IByteIn_Read(p->Stream); - p->Range <<= 8; - if (p->Range < kTopValue) - { - p->Code = (p->Code << 8) | IByteIn_Read(p->Stream); - p->Range <<= 8; - } - } -} +#define RC_NORM_1(p) RC_NORM_BASE(p) } +#define RC_NORM(p) RC_NORM_BASE(p) RC_NORM_BASE(p) }} -static void Range_Decode(const IPpmd7_RangeDec *pp, UInt32 start, UInt32 size) -{ - GET_Ppmd7z_RangeDec - p->Code -= start * p->Range; - p->Range *= size; - Range_Normalize(p); -} +// we must use only one type of Normalization from two: LOCAL or REMOTE +#define RC_NORM_LOCAL(p) // RC_NORM(p) +#define RC_NORM_REMOTE(p) RC_NORM(p) -static UInt32 Range_DecodeBit(const IPpmd7_RangeDec *pp, UInt32 size0) -{ - GET_Ppmd7z_RangeDec - UInt32 newBound = (p->Range >> 14) * size0; - UInt32 symbol; - if (p->Code < newBound) - { - symbol = 0; - p->Range = newBound; - } - else - { - symbol = 1; - p->Code -= newBound; - p->Range -= newBound; - } - Range_Normalize(p); - return symbol; -} +#define R (&p->rc.dec) -void Ppmd7z_RangeDec_CreateVTable(CPpmd7z_RangeDec *p) +Z7_FORCE_INLINE +// Z7_NO_INLINE +static void Ppmd7z_RD_Decode(CPpmd7 *p, UInt32 start, UInt32 size) { - p->vt.GetThreshold = Range_GetThreshold; - p->vt.Decode = Range_Decode; - p->vt.DecodeBit = Range_DecodeBit; + + + R->Code -= start * R->Range; + R->Range *= size; + RC_NORM_LOCAL(R) } +#define RC_Decode(start, size) Ppmd7z_RD_Decode(p, start, size); +#define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R) +#define RC_GetThreshold(total) (R->Code / (R->Range /= (total))) + -#define MASK(sym) ((signed char *)charMask)[sym] +#define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref)) +// typedef CPpmd7_Context * CTX_PTR; +#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) +void Ppmd7_UpdateModel(CPpmd7 *p); -int Ppmd7_DecodeSymbol(CPpmd7 *p, const IPpmd7_RangeDec *rc) +#define MASK(sym) ((Byte *)charMask)[sym] +// Z7_FORCE_INLINE +// static +int Ppmd7z_DecodeSymbol(CPpmd7 *p) { size_t charMask[256 / sizeof(size_t)]; + if (p->MinContext->NumStats != 1) { CPpmd_State *s = Ppmd7_GetStats(p, p->MinContext); unsigned i; UInt32 count, hiCnt; - if ((count = rc->GetThreshold(rc, p->MinContext->SummFreq)) < (hiCnt = s->Freq)) + const UInt32 summFreq = p->MinContext->Union2.SummFreq; + + + + + count = RC_GetThreshold(summFreq); + hiCnt = count; + + if ((Int32)(count -= s->Freq) < 0) { - Byte symbol; - rc->Decode(rc, 0, s->Freq); + Byte sym; + RC_DecodeFinal(0, s->Freq) p->FoundState = s; - symbol = s->Symbol; + sym = s->Symbol; Ppmd7_Update1_0(p); - return symbol; + return sym; } + p->PrevSuccess = 0; - i = p->MinContext->NumStats 
- 1; + i = (unsigned)p->MinContext->NumStats - 1; + do { - if ((hiCnt += (++s)->Freq) > count) + if ((Int32)(count -= (++s)->Freq) < 0) { - Byte symbol; - rc->Decode(rc, hiCnt - s->Freq, s->Freq); + Byte sym; + RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq) p->FoundState = s; - symbol = s->Symbol; + sym = s->Symbol; Ppmd7_Update1(p); - return symbol; + return sym; } } while (--i); - if (count >= p->MinContext->SummFreq) - return -2; - p->HiBitsFlag = p->HB2Flag[p->FoundState->Symbol]; - rc->Decode(rc, hiCnt, p->MinContext->SummFreq - hiCnt); - PPMD_SetAllBitsIn256Bytes(charMask); - MASK(s->Symbol) = 0; - i = p->MinContext->NumStats - 1; - do { MASK((--s)->Symbol) = 0; } while (--i); + + if (hiCnt >= summFreq) + return PPMD7_SYM_ERROR; + + hiCnt -= count; + RC_Decode(hiCnt, summFreq - hiCnt) + + p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol); + PPMD_SetAllBitsIn256Bytes(charMask) + // i = p->MinContext->NumStats - 1; + // do { MASK((--s)->Symbol) = 0; } while (--i); + { + CPpmd_State *s2 = Ppmd7_GetStats(p, p->MinContext); + MASK(s->Symbol) = 0; + do + { + const unsigned sym0 = s2[0].Symbol; + const unsigned sym1 = s2[1].Symbol; + s2 += 2; + MASK(sym0) = 0; + MASK(sym1) = 0; + } + while (s2 < s); + } } else { + CPpmd_State *s = Ppmd7Context_OneState(p->MinContext); UInt16 *prob = Ppmd7_GetBinSumm(p); - if (rc->DecodeBit(rc, *prob) == 0) + UInt32 pr = *prob; + UInt32 size0 = (R->Range >> 14) * pr; + pr = PPMD_UPDATE_PROB_1(pr); + + if (R->Code < size0) { - Byte symbol; - *prob = (UInt16)PPMD_UPDATE_PROB_0(*prob); - symbol = (p->FoundState = Ppmd7Context_OneState(p->MinContext))->Symbol; - Ppmd7_UpdateBin(p); - return symbol; + Byte sym; + *prob = (UInt16)(pr + (1 << PPMD_INT_BITS)); + + // RangeDec_DecodeBit0(size0); + R->Range = size0; + RC_NORM_1(R) + /* we can use single byte normalization here because of + (min(BinSumm[][]) = 95) > (1 << (14 - 8)) */ + + // sym = (p->FoundState = Ppmd7Context_OneState(p->MinContext))->Symbol; + // Ppmd7_UpdateBin(p); + { + unsigned freq = s->Freq; + CPpmd7_Context *c = CTX(SUCCESSOR(s)); + sym = s->Symbol; + p->FoundState = s; + p->PrevSuccess = 1; + p->RunLength++; + s->Freq = (Byte)(freq + (freq < 128)); + // NextContext(p); + if (p->OrderFall == 0 && (const Byte *)c > p->Text) + p->MaxContext = p->MinContext = c; + else + Ppmd7_UpdateModel(p); + } + return sym; } - *prob = (UInt16)PPMD_UPDATE_PROB_1(*prob); - p->InitEsc = PPMD7_kExpEscape[*prob >> 10]; - PPMD_SetAllBitsIn256Bytes(charMask); + + *prob = (UInt16)pr; + p->InitEsc = p->ExpEscape[pr >> 10]; + + // RangeDec_DecodeBit1(size0); + + R->Code -= size0; + R->Range -= size0; + RC_NORM_LOCAL(R) + + PPMD_SetAllBitsIn256Bytes(charMask) MASK(Ppmd7Context_OneState(p->MinContext)->Symbol) = 0; p->PrevSuccess = 0; } + for (;;) { - CPpmd_State *ps[256], *s; + CPpmd_State *s, *s2; UInt32 freqSum, count, hiCnt; + CPpmd_See *see; - unsigned i, num, numMasked = p->MinContext->NumStats; + CPpmd7_Context *mc; + unsigned numMasked; + RC_NORM_REMOTE(R) + mc = p->MinContext; + numMasked = mc->NumStats; + do { p->OrderFall++; - if (!p->MinContext->Suffix) - return -1; - p->MinContext = Ppmd7_GetContext(p, p->MinContext->Suffix); + if (!mc->Suffix) + return PPMD7_SYM_END; + mc = Ppmd7_GetContext(p, mc->Suffix); } - while (p->MinContext->NumStats == numMasked); - hiCnt = 0; - s = Ppmd7_GetStats(p, p->MinContext); - i = 0; - num = p->MinContext->NumStats - numMasked; - do + while (mc->NumStats == numMasked); + + s = Ppmd7_GetStats(p, mc); + { - int k = (int)(MASK(s->Symbol)); - hiCnt += (s->Freq & k); - 
ps[i] = s++; - i -= k; + unsigned num = mc->NumStats; + unsigned num2 = num / 2; + + num &= 1; + hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num); + s += num; + p->MinContext = mc; + + do + { + const unsigned sym0 = s[0].Symbol; + const unsigned sym1 = s[1].Symbol; + s += 2; + hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0))); + hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1))); + } + while (--num2); } - while (i != num); - + see = Ppmd7_MakeEscFreq(p, numMasked, &freqSum); freqSum += hiCnt; - count = rc->GetThreshold(rc, freqSum); + + + + + count = RC_GetThreshold(freqSum); if (count < hiCnt) { - Byte symbol; - CPpmd_State **pps = ps; - for (hiCnt = 0; (hiCnt += (*pps)->Freq) <= count; pps++); - s = *pps; - rc->Decode(rc, hiCnt - s->Freq, s->Freq); - Ppmd_See_Update(see); + Byte sym; + + s = Ppmd7_GetStats(p, p->MinContext); + hiCnt = count; + // count -= s->Freq & (UInt32)(MASK(s->Symbol)); + // if ((Int32)count >= 0) + { + for (;;) + { + count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; + // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; + } + } + s--; + RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq) + + // new (see->Summ) value can overflow over 16-bits in some rare cases + Ppmd_See_UPDATE(see) p->FoundState = s; - symbol = s->Symbol; + sym = s->Symbol; Ppmd7_Update2(p); - return symbol; + return sym; } + if (count >= freqSum) - return -2; - rc->Decode(rc, hiCnt, freqSum - hiCnt); + return PPMD7_SYM_ERROR; + + RC_Decode(hiCnt, freqSum - hiCnt) + + // We increase (see->Summ) for sum of Freqs of all non_Masked symbols. + // new (see->Summ) value can overflow over 16-bits in some rare cases see->Summ = (UInt16)(see->Summ + freqSum); - do { MASK(ps[--i]->Symbol) = 0; } while (i != 0); + + s = Ppmd7_GetStats(p, p->MinContext); + s2 = s + p->MinContext->NumStats; + do + { + MASK(s->Symbol) = 0; + s++; + } + while (s != s2); } } + +/* +Byte *Ppmd7z_DecodeSymbols(CPpmd7 *p, Byte *buf, const Byte *lim) +{ + int sym = 0; + if (buf != lim) + do + { + sym = Ppmd7z_DecodeSymbol(p); + if (sym < 0) + break; + *buf = (Byte)sym; + } + while (++buf < lim); + p->LastSymbol = sym; + return buf; +} +*/ + +#undef kTopValue +#undef READ_BYTE +#undef RC_NORM_BASE +#undef RC_NORM_1 +#undef RC_NORM +#undef RC_NORM_LOCAL +#undef RC_NORM_REMOTE +#undef R +#undef RC_Decode +#undef RC_DecodeFinal +#undef RC_GetThreshold +#undef CTX +#undef SUCCESSOR +#undef MASK diff --git a/src/sdk/C/Ppmd7Enc.c b/src/sdk/C/Ppmd7Enc.c index 286b871..49cbbe6 100644 --- a/src/sdk/C/Ppmd7Enc.c +++ b/src/sdk/C/Ppmd7Enc.c @@ -1,104 +1,123 @@ -/* Ppmd7Enc.c -- PPMdH Encoder -2017-04-03 : Igor Pavlov : Public domain -This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ +/* Ppmd7Enc.c -- Ppmd7z (PPMdH with 7z Range Coder) Encoder +2023-09-07 : Igor Pavlov : Public domain +This code is based on: + PPMd var.H (2001): Dmitry Shkarin : Public domain */ + #include "Precomp.h" #include "Ppmd7.h" -#define kTopValue (1 << 24) +#define kTopValue ((UInt32)1 << 24) + +#define R (&p->rc.enc) -void Ppmd7z_RangeEnc_Init(CPpmd7z_RangeEnc *p) +void Ppmd7z_Init_RangeEnc(CPpmd7 *p) { - p->Low = 0; - p->Range = 0xFFFFFFFF; - p->Cache = 0; - p->CacheSize = 1; + R->Low = 0; + R->Range = 0xFFFFFFFF; + R->Cache = 0; + R->CacheSize = 1; } -static void RangeEnc_ShiftLow(CPpmd7z_RangeEnc *p) +Z7_NO_INLINE +static void Ppmd7z_RangeEnc_ShiftLow(CPpmd7 *p) { - if ((UInt32)p->Low < (UInt32)0xFF000000 || (unsigned)(p->Low >> 32) != 0) + if ((UInt32)R->Low < 
(UInt32)0xFF000000 || (unsigned)(R->Low >> 32) != 0) { - Byte temp = p->Cache; + Byte temp = R->Cache; do { - IByteOut_Write(p->Stream, (Byte)(temp + (Byte)(p->Low >> 32))); + IByteOut_Write(R->Stream, (Byte)(temp + (Byte)(R->Low >> 32))); temp = 0xFF; } - while (--p->CacheSize != 0); - p->Cache = (Byte)((UInt32)p->Low >> 24); + while (--R->CacheSize != 0); + R->Cache = (Byte)((UInt32)R->Low >> 24); } - p->CacheSize++; - p->Low = (UInt32)p->Low << 8; + R->CacheSize++; + R->Low = (UInt32)((UInt32)R->Low << 8); } -static void RangeEnc_Encode(CPpmd7z_RangeEnc *p, UInt32 start, UInt32 size, UInt32 total) -{ - p->Low += start * (p->Range /= total); - p->Range *= size; - while (p->Range < kTopValue) - { - p->Range <<= 8; - RangeEnc_ShiftLow(p); - } -} +#define RC_NORM_BASE(p) if (R->Range < kTopValue) { R->Range <<= 8; Ppmd7z_RangeEnc_ShiftLow(p); +#define RC_NORM_1(p) RC_NORM_BASE(p) } +#define RC_NORM(p) RC_NORM_BASE(p) RC_NORM_BASE(p) }} -static void RangeEnc_EncodeBit_0(CPpmd7z_RangeEnc *p, UInt32 size0) -{ - p->Range = (p->Range >> 14) * size0; - while (p->Range < kTopValue) - { - p->Range <<= 8; - RangeEnc_ShiftLow(p); - } -} +// we must use only one type of Normalization from two: LOCAL or REMOTE +#define RC_NORM_LOCAL(p) // RC_NORM(p) +#define RC_NORM_REMOTE(p) RC_NORM(p) -static void RangeEnc_EncodeBit_1(CPpmd7z_RangeEnc *p, UInt32 size0) +/* +#define Ppmd7z_RangeEnc_Encode(p, start, _size_) \ + { UInt32 size = _size_; \ + R->Low += start * R->Range; \ + R->Range *= size; \ + RC_NORM_LOCAL(p); } +*/ + +Z7_FORCE_INLINE +// Z7_NO_INLINE +static void Ppmd7z_RangeEnc_Encode(CPpmd7 *p, UInt32 start, UInt32 size) { - UInt32 newBound = (p->Range >> 14) * size0; - p->Low += newBound; - p->Range -= newBound; - while (p->Range < kTopValue) - { - p->Range <<= 8; - RangeEnc_ShiftLow(p); - } + R->Low += start * R->Range; + R->Range *= size; + RC_NORM_LOCAL(p) } -void Ppmd7z_RangeEnc_FlushData(CPpmd7z_RangeEnc *p) +void Ppmd7z_Flush_RangeEnc(CPpmd7 *p) { unsigned i; for (i = 0; i < 5; i++) - RangeEnc_ShiftLow(p); + Ppmd7z_RangeEnc_ShiftLow(p); } -#define MASK(sym) ((signed char *)charMask)[sym] -void Ppmd7_EncodeSymbol(CPpmd7 *p, CPpmd7z_RangeEnc *rc, int symbol) +#define RC_Encode(start, size) Ppmd7z_RangeEnc_Encode(p, start, size); +#define RC_EncodeFinal(start, size) RC_Encode(start, size) RC_NORM_REMOTE(p) + +#define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref)) +#define SUFFIX(ctx) CTX((ctx)->Suffix) +// typedef CPpmd7_Context * CTX_PTR; +#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) + +void Ppmd7_UpdateModel(CPpmd7 *p); + +#define MASK(sym) ((Byte *)charMask)[sym] + +Z7_FORCE_INLINE +static +void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol) { size_t charMask[256 / sizeof(size_t)]; + if (p->MinContext->NumStats != 1) { CPpmd_State *s = Ppmd7_GetStats(p, p->MinContext); UInt32 sum; unsigned i; + + + + + R->Range /= p->MinContext->Union2.SummFreq; + if (s->Symbol == symbol) { - RangeEnc_Encode(rc, 0, s->Freq, p->MinContext->SummFreq); + // R->Range /= p->MinContext->Union2.SummFreq; + RC_EncodeFinal(0, s->Freq) p->FoundState = s; Ppmd7_Update1_0(p); return; } p->PrevSuccess = 0; sum = s->Freq; - i = p->MinContext->NumStats - 1; + i = (unsigned)p->MinContext->NumStats - 1; do { if ((++s)->Symbol == symbol) { - RangeEnc_Encode(rc, sum, s->Freq, p->MinContext->SummFreq); + // R->Range /= p->MinContext->Union2.SummFreq; + RC_EncodeFinal(sum, s->Freq) p->FoundState = s; Ppmd7_Update1(p); return; @@ -106,82 +125,213 @@ void Ppmd7_EncodeSymbol(CPpmd7 *p, CPpmd7z_RangeEnc *rc, int symbol) sum += 
s->Freq; } while (--i); + + // R->Range /= p->MinContext->Union2.SummFreq; + RC_Encode(sum, p->MinContext->Union2.SummFreq - sum) - p->HiBitsFlag = p->HB2Flag[p->FoundState->Symbol]; - PPMD_SetAllBitsIn256Bytes(charMask); - MASK(s->Symbol) = 0; - i = p->MinContext->NumStats - 1; - do { MASK((--s)->Symbol) = 0; } while (--i); - RangeEnc_Encode(rc, sum, p->MinContext->SummFreq - sum, p->MinContext->SummFreq); + p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol); + PPMD_SetAllBitsIn256Bytes(charMask) + // MASK(s->Symbol) = 0; + // i = p->MinContext->NumStats - 1; + // do { MASK((--s)->Symbol) = 0; } while (--i); + { + CPpmd_State *s2 = Ppmd7_GetStats(p, p->MinContext); + MASK(s->Symbol) = 0; + do + { + const unsigned sym0 = s2[0].Symbol; + const unsigned sym1 = s2[1].Symbol; + s2 += 2; + MASK(sym0) = 0; + MASK(sym1) = 0; + } + while (s2 < s); + } } else { UInt16 *prob = Ppmd7_GetBinSumm(p); CPpmd_State *s = Ppmd7Context_OneState(p->MinContext); + UInt32 pr = *prob; + const UInt32 bound = (R->Range >> 14) * pr; + pr = PPMD_UPDATE_PROB_1(pr); if (s->Symbol == symbol) { - RangeEnc_EncodeBit_0(rc, *prob); - *prob = (UInt16)PPMD_UPDATE_PROB_0(*prob); - p->FoundState = s; - Ppmd7_UpdateBin(p); + *prob = (UInt16)(pr + (1 << PPMD_INT_BITS)); + // RangeEnc_EncodeBit_0(p, bound); + R->Range = bound; + RC_NORM_1(p) + + // p->FoundState = s; + // Ppmd7_UpdateBin(p); + { + const unsigned freq = s->Freq; + CPpmd7_Context *c = CTX(SUCCESSOR(s)); + p->FoundState = s; + p->PrevSuccess = 1; + p->RunLength++; + s->Freq = (Byte)(freq + (freq < 128)); + // NextContext(p); + if (p->OrderFall == 0 && (const Byte *)c > p->Text) + p->MaxContext = p->MinContext = c; + else + Ppmd7_UpdateModel(p); + } return; } - else - { - RangeEnc_EncodeBit_1(rc, *prob); - *prob = (UInt16)PPMD_UPDATE_PROB_1(*prob); - p->InitEsc = PPMD7_kExpEscape[*prob >> 10]; - PPMD_SetAllBitsIn256Bytes(charMask); - MASK(s->Symbol) = 0; - p->PrevSuccess = 0; - } + + *prob = (UInt16)pr; + p->InitEsc = p->ExpEscape[pr >> 10]; + // RangeEnc_EncodeBit_1(p, bound); + R->Low += bound; + R->Range -= bound; + RC_NORM_LOCAL(p) + + PPMD_SetAllBitsIn256Bytes(charMask) + MASK(s->Symbol) = 0; + p->PrevSuccess = 0; } + for (;;) { - UInt32 escFreq; CPpmd_See *see; CPpmd_State *s; - UInt32 sum; - unsigned i, numMasked = p->MinContext->NumStats; + UInt32 sum, escFreq; + CPpmd7_Context *mc; + unsigned i, numMasked; + + RC_NORM_REMOTE(p) + + mc = p->MinContext; + numMasked = mc->NumStats; + do { p->OrderFall++; - if (!p->MinContext->Suffix) + if (!mc->Suffix) return; /* EndMarker (symbol = -1) */ - p->MinContext = Ppmd7_GetContext(p, p->MinContext->Suffix); + mc = Ppmd7_GetContext(p, mc->Suffix); + i = mc->NumStats; } - while (p->MinContext->NumStats == numMasked); + while (i == numMasked); + + p->MinContext = mc; - see = Ppmd7_MakeEscFreq(p, numMasked, &escFreq); - s = Ppmd7_GetStats(p, p->MinContext); + // see = Ppmd7_MakeEscFreq(p, numMasked, &escFreq); + { + if (i != 256) + { + unsigned nonMasked = i - numMasked; + see = p->See[(unsigned)p->NS2Indx[(size_t)nonMasked - 1]] + + p->HiBitsFlag + + (nonMasked < (unsigned)SUFFIX(mc)->NumStats - i) + + 2 * (unsigned)(mc->Union2.SummFreq < 11 * i) + + 4 * (unsigned)(numMasked > nonMasked); + { + // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ + unsigned summ = (UInt16)see->Summ; // & 0xFFFF + unsigned r = (summ >> see->Shift); + see->Summ = (UInt16)(summ - r); + escFreq = r + (r == 0); + } + } + else + { + see = &p->DummySee; + escFreq = 1; + } + } + + s = Ppmd7_GetStats(p, mc); sum 
= 0; - i = p->MinContext->NumStats; + // i = mc->NumStats; + do { - int cur = s->Symbol; - if (cur == symbol) + const unsigned cur = s->Symbol; + if ((int)cur == symbol) { - UInt32 low = sum; - CPpmd_State *s1 = s; - do + const UInt32 low = sum; + const UInt32 freq = s->Freq; + unsigned num2; + + Ppmd_See_UPDATE(see) + p->FoundState = s; + sum += escFreq; + + num2 = i / 2; + i &= 1; + sum += freq & (0 - (UInt32)i); + if (num2 != 0) { - sum += (s->Freq & (int)(MASK(s->Symbol))); - s++; + s += i; + do + { + const unsigned sym0 = s[0].Symbol; + const unsigned sym1 = s[1].Symbol; + s += 2; + sum += (s[-2].Freq & (unsigned)(MASK(sym0))); + sum += (s[-1].Freq & (unsigned)(MASK(sym1))); + } + while (--num2); } - while (--i); - RangeEnc_Encode(rc, low, s1->Freq, sum + escFreq); - Ppmd_See_Update(see); - p->FoundState = s1; + + + R->Range /= sum; + RC_EncodeFinal(low, freq) Ppmd7_Update2(p); return; } - sum += (s->Freq & (int)(MASK(cur))); - MASK(cur) = 0; + sum += (s->Freq & (unsigned)(MASK(cur))); s++; } while (--i); - RangeEnc_Encode(rc, sum, escFreq, sum + escFreq); - see->Summ = (UInt16)(see->Summ + sum + escFreq); + { + const UInt32 total = sum + escFreq; + see->Summ = (UInt16)(see->Summ + total); + + R->Range /= total; + RC_Encode(sum, escFreq) + } + + { + const CPpmd_State *s2 = Ppmd7_GetStats(p, p->MinContext); + s--; + MASK(s->Symbol) = 0; + do + { + const unsigned sym0 = s2[0].Symbol; + const unsigned sym1 = s2[1].Symbol; + s2 += 2; + MASK(sym0) = 0; + MASK(sym1) = 0; + } + while (s2 < s); + } } } + + +void Ppmd7z_EncodeSymbols(CPpmd7 *p, const Byte *buf, const Byte *lim) +{ + for (; buf < lim; buf++) + { + Ppmd7z_EncodeSymbol(p, *buf); + } +} + +#undef kTopValue +#undef WRITE_BYTE +#undef RC_NORM_BASE +#undef RC_NORM_1 +#undef RC_NORM +#undef RC_NORM_LOCAL +#undef RC_NORM_REMOTE +#undef R +#undef RC_Encode +#undef RC_EncodeFinal +#undef SUFFIX +#undef CTX +#undef SUCCESSOR +#undef MASK diff --git a/src/sdk/C/Precomp.h b/src/sdk/C/Precomp.h index e8ff8b4..7747fdd 100644 --- a/src/sdk/C/Precomp.h +++ b/src/sdk/C/Precomp.h @@ -1,10 +1,127 @@ -/* Precomp.h -- StdAfx -2013-11-12 : Igor Pavlov : Public domain */ +/* Precomp.h -- precompilation file +2024-01-25 : Igor Pavlov : Public domain */ -#ifndef __7Z_PRECOMP_H -#define __7Z_PRECOMP_H +#ifndef ZIP7_INC_PRECOMP_H +#define ZIP7_INC_PRECOMP_H + +/* + this file must be included before another *.h files and before . + this file is included from the following files: + C\*.c + C\Util\*\Precomp.h <- C\Util\*\*.c + CPP\Common\Common.h <- *\StdAfx.h <- *\*.cpp + + this file can set the following macros: + Z7_LARGE_PAGES 1 + Z7_LONG_PATH 1 + Z7_WIN32_WINNT_MIN 0x0500 (or higher) : we require at least win2000+ for 7-Zip + _WIN32_WINNT 0x0500 (or higher) + WINVER _WIN32_WINNT + UNICODE 1 + _UNICODE 1 +*/ #include "Compiler.h" -/* #include "7zTypes.h" */ + +#ifdef _MSC_VER +// #pragma warning(disable : 4206) // nonstandard extension used : translation unit is empty +#if _MSC_VER >= 1912 +// #pragma warning(disable : 5039) // pointer or reference to potentially throwing function passed to 'extern "C"' function under - EHc.Undefined behavior may occur if this function throws an exception. +#endif +#endif + +/* +// for debug: +#define UNICODE 1 +#define _UNICODE 1 +#define _WIN32_WINNT 0x0500 // win2000 +#ifndef WINVER + #define WINVER _WIN32_WINNT +#endif +*/ + +#ifdef _WIN32 +/* + this "Precomp.h" file must be included before , + if we want to define _WIN32_WINNT before . 
+*/ + +#ifndef Z7_LARGE_PAGES +#ifndef Z7_NO_LARGE_PAGES +#define Z7_LARGE_PAGES 1 +#endif +#endif + +#ifndef Z7_LONG_PATH +#ifndef Z7_NO_LONG_PATH +#define Z7_LONG_PATH 1 +#endif +#endif + +#ifndef Z7_DEVICE_FILE +#ifndef Z7_NO_DEVICE_FILE +// #define Z7_DEVICE_FILE 1 +#endif +#endif + +// we don't change macros if included after +#ifndef _WINDOWS_ + +#ifndef Z7_WIN32_WINNT_MIN + #if defined(_M_ARM64) || defined(__aarch64__) + // #define Z7_WIN32_WINNT_MIN 0x0a00 // win10 + #define Z7_WIN32_WINNT_MIN 0x0600 // vista + #elif defined(_M_ARM) && defined(_M_ARMT) && defined(_M_ARM_NT) + // #define Z7_WIN32_WINNT_MIN 0x0602 // win8 + #define Z7_WIN32_WINNT_MIN 0x0600 // vista + #elif defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) || defined(_M_IA64) + #define Z7_WIN32_WINNT_MIN 0x0503 // win2003 + // #elif defined(_M_IX86) || defined(__i386__) + // #define Z7_WIN32_WINNT_MIN 0x0500 // win2000 + #else // x86 and another(old) systems + #define Z7_WIN32_WINNT_MIN 0x0500 // win2000 + // #define Z7_WIN32_WINNT_MIN 0x0502 // win2003 // for debug + #endif +#endif // Z7_WIN32_WINNT_MIN + + +#ifndef Z7_DO_NOT_DEFINE_WIN32_WINNT +#ifdef _WIN32_WINNT + // #error Stop_Compiling_Bad_WIN32_WINNT +#else + #ifndef Z7_NO_DEFINE_WIN32_WINNT +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #define _WIN32_WINNT Z7_WIN32_WINNT_MIN +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER + #endif +#endif // _WIN32_WINNT + +#ifndef WINVER + #define WINVER _WIN32_WINNT +#endif +#endif // Z7_DO_NOT_DEFINE_WIN32_WINNT + + +#ifndef _MBCS +#ifndef Z7_NO_UNICODE +// UNICODE and _UNICODE are used by and by 7-zip code. + +#ifndef UNICODE +#define UNICODE 1 +#endif + +#ifndef _UNICODE +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +#define _UNICODE 1 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif + +#endif // Z7_NO_UNICODE +#endif // _MBCS +#endif // _WINDOWS_ + +// #include "7zWindows.h" + +#endif // _WIN32 #endif diff --git a/src/sdk/C/RotateDefs.h b/src/sdk/C/RotateDefs.h index 8f01d1a..c16b4f8 100644 --- a/src/sdk/C/RotateDefs.h +++ b/src/sdk/C/RotateDefs.h @@ -1,14 +1,14 @@ /* RotateDefs.h -- Rotate functions -2015-03-25 : Igor Pavlov : Public domain */ +2023-06-18 : Igor Pavlov : Public domain */ -#ifndef __ROTATE_DEFS_H -#define __ROTATE_DEFS_H +#ifndef ZIP7_INC_ROTATE_DEFS_H +#define ZIP7_INC_ROTATE_DEFS_H #ifdef _MSC_VER #include -/* don't use _rotl with MINGW. It can insert slow call to function. */ +/* don't use _rotl with old MINGW. It can insert slow call to function. */ /* #if (_MSC_VER >= 1200) */ #pragma intrinsic(_rotl) @@ -18,12 +18,32 @@ #define rotlFixed(x, n) _rotl((x), (n)) #define rotrFixed(x, n) _rotr((x), (n)) +#if (_MSC_VER >= 1300) +#define Z7_ROTL64(x, n) _rotl64((x), (n)) +#define Z7_ROTR64(x, n) _rotr64((x), (n)) +#else +#define Z7_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#define Z7_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#endif + #else /* new compilers can translate these macros to fast commands. 
*/ +#if defined(__clang__) && (__clang_major__ >= 4) \ + || defined(__GNUC__) && (__GNUC__ >= 5) +/* GCC 4.9.0 and clang 3.5 can recognize more correct version: */ +#define rotlFixed(x, n) (((x) << (n)) | ((x) >> (-(n) & 31))) +#define rotrFixed(x, n) (((x) >> (n)) | ((x) << (-(n) & 31))) +#define Z7_ROTL64(x, n) (((x) << (n)) | ((x) >> (-(n) & 63))) +#define Z7_ROTR64(x, n) (((x) >> (n)) | ((x) << (-(n) & 63))) +#else +/* for old GCC / clang: */ #define rotlFixed(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) #define rotrFixed(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#define Z7_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#define Z7_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#endif #endif diff --git a/src/sdk/C/Sha256.c b/src/sdk/C/Sha256.c index 04b688c..ea7ed8e 100644 --- a/src/sdk/C/Sha256.c +++ b/src/sdk/C/Sha256.c @@ -1,25 +1,113 @@ -/* Crypto/Sha256.c -- SHA-256 Hash -2017-04-03 : Igor Pavlov : Public domain +/* Sha256.c -- SHA-256 Hash +: Igor Pavlov : Public domain This code is based on public domain code from Wei Dai's Crypto++ library. */ #include "Precomp.h" #include -#include "CpuArch.h" -#include "RotateDefs.h" #include "Sha256.h" +#include "RotateDefs.h" +#include "CpuArch.h" + +#ifdef MY_CPU_X86_OR_AMD64 + #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ + || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ + || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) \ + || defined(_MSC_VER) && (_MSC_VER >= 1200) + #define Z7_COMPILER_SHA256_SUPPORTED + #endif +#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) + + #if defined(__ARM_FEATURE_SHA2) \ + || defined(__ARM_FEATURE_CRYPTO) + #define Z7_COMPILER_SHA256_SUPPORTED + #else + #if defined(MY_CPU_ARM64) \ + || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ + || defined(Z7_MSC_VER_ORIGINAL) + #if defined(__ARM_FP) && \ + ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) \ + ) \ + || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) + #if defined(MY_CPU_ARM64) \ + || !defined(Z7_CLANG_VERSION) \ + || defined(__ARM_NEON) && \ + (Z7_CLANG_VERSION < 170000 || \ + Z7_CLANG_VERSION > 170001) + #define Z7_COMPILER_SHA256_SUPPORTED + #endif + #endif + #endif + #endif +#endif + +void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); + +#ifdef Z7_COMPILER_SHA256_SUPPORTED + void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); + + static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks; + static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW; + + #define SHA256_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks +#else + #define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks +#endif + + +BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo) +{ + SHA256_FUNC_UPDATE_BLOCKS func = Sha256_UpdateBlocks; + + #ifdef Z7_COMPILER_SHA256_SUPPORTED + if (algo != SHA256_ALGO_SW) + { + if (algo == SHA256_ALGO_DEFAULT) + func = g_SHA256_FUNC_UPDATE_BLOCKS; + else + { + if (algo != SHA256_ALGO_HW) + return False; + func = g_SHA256_FUNC_UPDATE_BLOCKS_HW; + if (!func) + return False; + } + } + #else + if (algo > 1) + return False; + #endif + + p->v.vars.func_UpdateBlocks = func; + return True; +} + /* define it for speed optimization */ -#ifndef _SFX -#define _SHA256_UNROLL -#define _SHA256_UNROLL2 + +#ifdef Z7_SFX + #define STEP_PRE 1 + #define STEP_MAIN 1 +#else + #define 
STEP_PRE 2 + #define STEP_MAIN 4 + // #define Z7_SHA256_UNROLL #endif -/* #define _SHA256_UNROLL2 */ +#undef Z7_SHA256_BIG_W +#if STEP_MAIN != 16 + #define Z7_SHA256_BIG_W +#endif -void Sha256_Init(CSha256 *p) + + + +void Sha256_InitState(CSha256 *p) { + p->v.vars.count = 0; p->state[0] = 0x6a09e667; p->state[1] = 0xbb67ae85; p->state[2] = 0x3c6ef372; @@ -28,69 +116,121 @@ void Sha256_Init(CSha256 *p) p->state[5] = 0x9b05688c; p->state[6] = 0x1f83d9ab; p->state[7] = 0x5be0cd19; - p->count = 0; } -#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22)) -#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25)) -#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) -#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10)) -#define blk0(i) (W[i]) -#define blk2(i) (W[i] += s1(W[((i)-2)&15]) + W[((i)-7)&15] + s0(W[((i)-15)&15])) -#define Ch(x,y,z) (z^(x&(y^z))) -#define Maj(x,y,z) ((x&y)|(z&(x|y))) -#ifdef _SHA256_UNROLL2 -#define R(a,b,c,d,e,f,g,h, i) \ - h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + (j ? blk2(i) : blk0(i)); \ - d += h; \ - h += S0(a) + Maj(a, b, c) -#define RX_8(i) \ - R(a,b,c,d,e,f,g,h, i); \ - R(h,a,b,c,d,e,f,g, i+1); \ - R(g,h,a,b,c,d,e,f, i+2); \ - R(f,g,h,a,b,c,d,e, i+3); \ - R(e,f,g,h,a,b,c,d, i+4); \ - R(d,e,f,g,h,a,b,c, i+5); \ - R(c,d,e,f,g,h,a,b, i+6); \ - R(b,c,d,e,f,g,h,a, i+7) -#define RX_16 RX_8(0); RX_8(8); -#else +void Sha256_Init(CSha256 *p) +{ + p->v.vars.func_UpdateBlocks = + #ifdef Z7_COMPILER_SHA256_SUPPORTED + g_SHA256_FUNC_UPDATE_BLOCKS; + #else + NULL; + #endif + Sha256_InitState(p); +} -#define a(i) T[(0-(i))&7] -#define b(i) T[(1-(i))&7] -#define c(i) T[(2-(i))&7] -#define d(i) T[(3-(i))&7] -#define e(i) T[(4-(i))&7] -#define f(i) T[(5-(i))&7] -#define g(i) T[(6-(i))&7] -#define h(i) T[(7-(i))&7] +#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x,22)) +#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x,25)) +#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) +#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >>10)) + +#define Ch(x,y,z) (z^(x&(y^z))) +#define Maj(x,y,z) ((x&y)|(z&(x|y))) -#define R(i) \ - h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[(i)+(size_t)(j)] + (j ? blk2(i) : blk0(i)); \ - d(i) += h(i); \ - h(i) += S0(a(i)) + Maj(a(i), b(i), c(i)) \ -#ifdef _SHA256_UNROLL +#define W_PRE(i) (W[(i) + (size_t)(j)] = GetBe32(data + ((size_t)(j) + i) * 4)) -#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7); -#define RX_16 RX_8(0); RX_8(8); +#define blk2_main(j, i) s1(w(j, (i)-2)) + w(j, (i)-7) + s0(w(j, (i)-15)) +#ifdef Z7_SHA256_BIG_W + // we use +i instead of +(i) to change the order to solve CLANG compiler warning for signed/unsigned. 
+ #define w(j, i) W[(size_t)(j) + i] + #define blk2(j, i) (w(j, i) = w(j, (i)-16) + blk2_main(j, i)) #else + #if STEP_MAIN == 16 + #define w(j, i) W[(i) & 15] + #else + #define w(j, i) W[((size_t)(j) + (i)) & 15] + #endif + #define blk2(j, i) (w(j, i) += blk2_main(j, i)) +#endif + +#define W_MAIN(i) blk2(j, i) + + +#define T1(wx, i) \ + tmp = h + S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ + h = g; \ + g = f; \ + f = e; \ + e = d + tmp; \ + tmp += S0(a) + Maj(a, b, c); \ + d = c; \ + c = b; \ + b = a; \ + a = tmp; \ -#define RX_16 unsigned i; for (i = 0; i < 16; i++) { R(i); } +#define R1_PRE(i) T1( W_PRE, i) +#define R1_MAIN(i) T1( W_MAIN, i) + +#if (!defined(Z7_SHA256_UNROLL) || STEP_MAIN < 8) && (STEP_MAIN >= 4) +#define R2_MAIN(i) \ + R1_MAIN(i) \ + R1_MAIN(i + 1) \ #endif + + +#if defined(Z7_SHA256_UNROLL) && STEP_MAIN >= 8 + +#define T4( a,b,c,d,e,f,g,h, wx, i) \ + h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ + tmp = h; \ + h += d; \ + d = tmp + S0(a) + Maj(a, b, c); \ + +#define R4( wx, i) \ + T4 ( a,b,c,d,e,f,g,h, wx, (i )); \ + T4 ( d,a,b,c,h,e,f,g, wx, (i+1)); \ + T4 ( c,d,a,b,g,h,e,f, wx, (i+2)); \ + T4 ( b,c,d,a,f,g,h,e, wx, (i+3)); \ + +#define R4_PRE(i) R4( W_PRE, i) +#define R4_MAIN(i) R4( W_MAIN, i) + + +#define T8( a,b,c,d,e,f,g,h, wx, i) \ + h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ + d += h; \ + h += S0(a) + Maj(a, b, c); \ + +#define R8( wx, i) \ + T8 ( a,b,c,d,e,f,g,h, wx, i ); \ + T8 ( h,a,b,c,d,e,f,g, wx, i+1); \ + T8 ( g,h,a,b,c,d,e,f, wx, i+2); \ + T8 ( f,g,h,a,b,c,d,e, wx, i+3); \ + T8 ( e,f,g,h,a,b,c,d, wx, i+4); \ + T8 ( d,e,f,g,h,a,b,c, wx, i+5); \ + T8 ( c,d,e,f,g,h,a,b, wx, i+6); \ + T8 ( b,c,d,e,f,g,h,a, wx, i+7); \ + +#define R8_PRE(i) R8( W_PRE, i) +#define R8_MAIN(i) R8( W_MAIN, i) + #endif -static const UInt32 K[64] = { + +extern +MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64]; +MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, @@ -109,30 +249,29 @@ static const UInt32 K[64] = { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 }; -static void Sha256_WriteByteBlock(CSha256 *p) -{ - UInt32 W[16]; - unsigned j; - UInt32 *state; - #ifdef _SHA256_UNROLL2 - UInt32 a,b,c,d,e,f,g,h; - #else - UInt32 T[8]; - #endif - for (j = 0; j < 16; j += 4) - { - const Byte *ccc = p->buffer + j * 4; - W[j ] = GetBe32(ccc); - W[j + 1] = GetBe32(ccc + 4); - W[j + 2] = GetBe32(ccc + 8); - W[j + 3] = GetBe32(ccc + 12); - } - state = p->state; - #ifdef _SHA256_UNROLL2 +#define K SHA256_K_ARRAY + +Z7_NO_INLINE +void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks) +{ + UInt32 W +#ifdef Z7_SHA256_BIG_W + [64]; +#else + [16]; +#endif + unsigned j; + UInt32 a,b,c,d,e,f,g,h; +#if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) + UInt32 tmp; +#endif + + if (numBlocks == 0) return; + a = state[0]; b = state[1]; c = state[2]; @@ -141,108 +280,213 @@ static void Sha256_WriteByteBlock(CSha256 *p) f = state[5]; g = state[6]; h = state[7]; - #else - for (j = 0; j < 8; j++) - T[j] = state[j]; - #endif - for (j = 0; j < 64; j += 16) + do + { + + for (j = 0; j < 16; j += STEP_PRE) + { + #if STEP_PRE > 4 + + #if STEP_PRE < 8 + R4_PRE(0); + #else + R8_PRE(0); + #if STEP_PRE == 16 + R8_PRE(8); + #endif + #endif + + #else + + R1_PRE(0) + #if STEP_PRE >= 2 + R1_PRE(1) + #if STEP_PRE >= 4 + R1_PRE(2) + R1_PRE(3) + #endif + #endif + + #endif + } + + for (j = 16; j < 64; j += 
STEP_MAIN) { - RX_16 + #if defined(Z7_SHA256_UNROLL) && STEP_MAIN >= 8 + + #if STEP_MAIN < 8 + R4_MAIN(0) + #else + R8_MAIN(0) + #if STEP_MAIN == 16 + R8_MAIN(8) + #endif + #endif + + #else + + R1_MAIN(0) + #if STEP_MAIN >= 2 + R1_MAIN(1) + #if STEP_MAIN >= 4 + R2_MAIN(2) + #if STEP_MAIN >= 8 + R2_MAIN(4) + R2_MAIN(6) + #if STEP_MAIN >= 16 + R2_MAIN(8) + R2_MAIN(10) + R2_MAIN(12) + R2_MAIN(14) + #endif + #endif + #endif + #endif + #endif } - #ifdef _SHA256_UNROLL2 - state[0] += a; - state[1] += b; - state[2] += c; - state[3] += d; - state[4] += e; - state[5] += f; - state[6] += g; - state[7] += h; - #else - for (j = 0; j < 8; j++) - state[j] += T[j]; - #endif - - /* Wipe variables */ - /* memset(W, 0, sizeof(W)); */ - /* memset(T, 0, sizeof(T)); */ + a += state[0]; state[0] = a; + b += state[1]; state[1] = b; + c += state[2]; state[2] = c; + d += state[3]; state[3] = d; + e += state[4]; state[4] = e; + f += state[5]; state[5] = f; + g += state[6]; state[6] = g; + h += state[7]; state[7] = h; + + data += SHA256_BLOCK_SIZE; + } + while (--numBlocks); } -#undef S0 -#undef S1 -#undef s0 -#undef s1 + +#define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) void Sha256_Update(CSha256 *p, const Byte *data, size_t size) { if (size == 0) return; - { - unsigned pos = (unsigned)p->count & 0x3F; - unsigned num; - - p->count += size; - - num = 64 - pos; + const unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1); + const unsigned num = SHA256_BLOCK_SIZE - pos; + p->v.vars.count += size; if (num > size) { memcpy(p->buffer + pos, data, size); return; } - - size -= num; - memcpy(p->buffer + pos, data, num); - data += num; + if (pos != 0) + { + size -= num; + memcpy(p->buffer + pos, data, num); + data += num; + Sha256_UpdateBlock(p); + } } - - for (;;) { - Sha256_WriteByteBlock(p); - if (size < 64) - break; - size -= 64; - memcpy(p->buffer, data, 64); - data += 64; - } - - if (size != 0) + const size_t numBlocks = size >> 6; + // if (numBlocks) + SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks); + size &= SHA256_BLOCK_SIZE - 1; + if (size == 0) + return; + data += (numBlocks << 6); memcpy(p->buffer, data, size); + } } + void Sha256_Final(CSha256 *p, Byte *digest) { - unsigned pos = (unsigned)p->count & 0x3F; - unsigned i; - + unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1); p->buffer[pos++] = 0x80; - - while (pos != (64 - 8)) + if (pos > (SHA256_BLOCK_SIZE - 4 * 2)) { - pos &= 0x3F; - if (pos == 0) - Sha256_WriteByteBlock(p); - p->buffer[pos++] = 0; + while (pos != SHA256_BLOCK_SIZE) { p->buffer[pos++] = 0; } + // memset(&p->buf.buffer[pos], 0, SHA256_BLOCK_SIZE - pos); + Sha256_UpdateBlock(p); + pos = 0; } - + memset(&p->buffer[pos], 0, (SHA256_BLOCK_SIZE - 4 * 2) - pos); { - UInt64 numBits = (p->count << 3); - SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32)); - SetBe32(p->buffer + 64 - 4, (UInt32)(numBits)); + const UInt64 numBits = p->v.vars.count << 3; + SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32)) + SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 1, (UInt32)(numBits)) + } + Sha256_UpdateBlock(p); +#if 1 && defined(MY_CPU_BE) + memcpy(digest, p->state, SHA256_DIGEST_SIZE); +#else + { + unsigned i; + for (i = 0; i < 8; i += 2) + { + const UInt32 v0 = p->state[i]; + const UInt32 v1 = p->state[(size_t)i + 1]; + SetBe32(digest , v0) + SetBe32(digest + 4, v1) + digest += 4 * 2; + } } - - Sha256_WriteByteBlock(p); - for (i = 0; i < 8; i += 2) + + + +#endif + Sha256_InitState(p); +} + + +void Sha256Prepare(void) +{ +#ifdef 
Z7_COMPILER_SHA256_SUPPORTED + SHA256_FUNC_UPDATE_BLOCKS f, f_hw; + f = Sha256_UpdateBlocks; + f_hw = NULL; +#ifdef MY_CPU_X86_OR_AMD64 + if (CPU_IsSupported_SHA() + && CPU_IsSupported_SSSE3() + ) +#else + if (CPU_IsSupported_SHA2()) +#endif { - UInt32 v0 = p->state[i]; - UInt32 v1 = p->state[i + 1]; - SetBe32(digest , v0); - SetBe32(digest + 4, v1); - digest += 8; + // printf("\n========== HW SHA256 ======== \n"); + f = f_hw = Sha256_UpdateBlocks_HW; } - - Sha256_Init(p); + g_SHA256_FUNC_UPDATE_BLOCKS = f; + g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw; +#endif } + +#undef U64C +#undef K +#undef S0 +#undef S1 +#undef s0 +#undef s1 +#undef Ch +#undef Maj +#undef W_MAIN +#undef W_PRE +#undef w +#undef blk2_main +#undef blk2 +#undef T1 +#undef T4 +#undef T8 +#undef R1_PRE +#undef R1_MAIN +#undef R2_MAIN +#undef R4 +#undef R4_PRE +#undef R4_MAIN +#undef R8 +#undef R8_PRE +#undef R8_MAIN +#undef STEP_PRE +#undef STEP_MAIN +#undef Z7_SHA256_BIG_W +#undef Z7_SHA256_UNROLL +#undef Z7_COMPILER_SHA256_SUPPORTED diff --git a/src/sdk/C/Sha256.h b/src/sdk/C/Sha256.h index 3f455db..75329cd 100644 --- a/src/sdk/C/Sha256.h +++ b/src/sdk/C/Sha256.h @@ -1,26 +1,86 @@ /* Sha256.h -- SHA-256 Hash -2013-01-18 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ -#ifndef __CRYPTO_SHA256_H -#define __CRYPTO_SHA256_H +#ifndef ZIP7_INC_SHA256_H +#define ZIP7_INC_SHA256_H #include "7zTypes.h" EXTERN_C_BEGIN -#define SHA256_DIGEST_SIZE 32 +#define SHA256_NUM_BLOCK_WORDS 16 +#define SHA256_NUM_DIGEST_WORDS 8 + +#define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4) +#define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4) + + + + +typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks); + +/* + if (the system supports different SHA256 code implementations) + { + (CSha256::func_UpdateBlocks) will be used + (CSha256::func_UpdateBlocks) can be set by + Sha256_Init() - to default (fastest) + Sha256_SetFunction() - to any algo + } + else + { + (CSha256::func_UpdateBlocks) is ignored. + } +*/ typedef struct { - UInt32 state[8]; - UInt64 count; - Byte buffer[64]; + union + { + struct + { + SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; + UInt64 count; + } vars; + UInt64 _pad_64bit[4]; + void *_pad_align_ptr[2]; + } v; + UInt32 state[SHA256_NUM_DIGEST_WORDS]; + + Byte buffer[SHA256_BLOCK_SIZE]; } CSha256; + +#define SHA256_ALGO_DEFAULT 0 +#define SHA256_ALGO_SW 1 +#define SHA256_ALGO_HW 2 + +/* +Sha256_SetFunction() +return: + 0 - (algo) value is not supported, and func_UpdateBlocks was not changed + 1 - func_UpdateBlocks was set according (algo) value. +*/ + +BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo); + +void Sha256_InitState(CSha256 *p); void Sha256_Init(CSha256 *p); void Sha256_Update(CSha256 *p, const Byte *data, size_t size); void Sha256_Final(CSha256 *p, Byte *digest); + + + +// void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); + +/* +call Sha256Prepare() once at program start. +It prepares all supported implementations, and detects the fastest implementation. 
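A rough usage sketch against the declarations above (data and size are assumed to be a caller-supplied const Byte * buffer and its size_t length):

  Byte digest[SHA256_DIGEST_SIZE];
  CSha256 sha;
  Sha256Prepare();                             // once, at program start
  Sha256_Init(&sha);                           // picks the implementation selected by Sha256Prepare()
  // Sha256_SetFunction(&sha, SHA256_ALGO_SW); // optional: force the portable code path
  Sha256_Update(&sha, data, size);             // can be called repeatedly for streamed input
  Sha256_Final(&sha, digest);                  // writes SHA256_DIGEST_SIZE bytes and re-initializes the state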
+*/ + +void Sha256Prepare(void); + EXTERN_C_END #endif diff --git a/src/sdk/C/Sha256Opt.c b/src/sdk/C/Sha256Opt.c new file mode 100644 index 0000000..1c6b50f --- /dev/null +++ b/src/sdk/C/Sha256Opt.c @@ -0,0 +1,451 @@ +/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions +: Igor Pavlov : Public domain */ + +#include "Precomp.h" +#include "Compiler.h" +#include "CpuArch.h" + +// #define Z7_USE_HW_SHA_STUB // for debug +#ifdef MY_CPU_X86_OR_AMD64 + #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check + #define USE_HW_SHA + #elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ + || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) + #define USE_HW_SHA + #if !defined(__INTEL_COMPILER) + // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) + #if !defined(__SHA__) || !defined(__SSSE3__) + #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) + #endif + #endif + #elif defined(_MSC_VER) + #if (_MSC_VER >= 1900) + #define USE_HW_SHA + #else + #define Z7_USE_HW_SHA_STUB + #endif + #endif +// #endif // MY_CPU_X86_OR_AMD64 +#ifndef USE_HW_SHA + // #define Z7_USE_HW_SHA_STUB // for debug +#endif + +#ifdef USE_HW_SHA + +// #pragma message("Sha256 HW") + + + + +// sse/sse2/ssse3: +#include +// sha*: +#include + +#if defined (__clang__) && defined(_MSC_VER) + #if !defined(__SHA__) + #include + #endif +#else + +#endif + +/* +SHA256 uses: +SSE2: + _mm_loadu_si128 + _mm_storeu_si128 + _mm_set_epi32 + _mm_add_epi32 + _mm_shuffle_epi32 / pshufd + + + +SSSE3: + _mm_shuffle_epi8 / pshufb + _mm_alignr_epi8 +SHA: + _mm_sha256* +*/ + +// K array must be aligned for 16-bytes at least. +// The compiler can look align attribute and selects +// movdqu - for code without align attribute +// movdqa - for code with align attribute +extern +MY_ALIGN(64) +const UInt32 SHA256_K_ARRAY[64]; +#define K SHA256_K_ARRAY + + +#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); +#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); +#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); + +#define LOAD_SHUFFLE(m, k) \ + m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ + m = _mm_shuffle_epi8(m, mask); \ + +#define NNN(m0, m1, m2, m3) + +#define SM1(m1, m2, m3, m0) \ + SHA256_MSG1(m0, m1); \ + +#define SM2(m2, m3, m0, m1) \ + ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \ + SHA256_MSG2(m0, m3); \ + +#define RND2(t0, t1) \ + t0 = _mm_sha256rnds2_epu32(t0, t1, msg); + + + +#define R4(k, m0, m1, m2, m3, OP0, OP1) \ + msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \ + RND2(state0, state1); \ + msg = _mm_shuffle_epi32(msg, 0x0E); \ + OP0(m0, m1, m2, m3) \ + RND2(state1, state0); \ + OP1(m0, m1, m2, m3) \ + +#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ + R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ + R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ + R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ + R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ + +#define PREPARE_STATE \ + tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ + state0 = _mm_shuffle_epi32(state1, 0x1B); /* efgh */ \ + state1 = state0; \ + state0 = _mm_unpacklo_epi64(state0, tmp); /* cdgh */ \ + state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \ + + +void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); +#ifdef ATTRIB_SHA +ATTRIB_SHA +#endif +void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 
state[8], const Byte *data, size_t numBlocks) +{ + const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); + + + __m128i tmp, state0, state1; + + if (numBlocks == 0) + return; + + state0 = _mm_loadu_si128((const __m128i *) (const void *) &state[0]); + state1 = _mm_loadu_si128((const __m128i *) (const void *) &state[4]); + + PREPARE_STATE + + do + { + __m128i state0_save, state1_save; + __m128i m0, m1, m2, m3; + __m128i msg; + // #define msg tmp + + state0_save = state0; + state1_save = state1; + + LOAD_SHUFFLE (m0, 0) + LOAD_SHUFFLE (m1, 1) + LOAD_SHUFFLE (m2, 2) + LOAD_SHUFFLE (m3, 3) + + + + R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ) + R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) + R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) + R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ) + + ADD_EPI32(state0, state0_save) + ADD_EPI32(state1, state1_save) + + data += 64; + } + while (--numBlocks); + + PREPARE_STATE + + _mm_storeu_si128((__m128i *) (void *) &state[0], state0); + _mm_storeu_si128((__m128i *) (void *) &state[4], state1); +} + +#endif // USE_HW_SHA + +#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) + + #if defined(__ARM_FEATURE_SHA2) \ + || defined(__ARM_FEATURE_CRYPTO) + #define USE_HW_SHA + #else + #if defined(MY_CPU_ARM64) \ + || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ + || defined(Z7_MSC_VER_ORIGINAL) + #if defined(__ARM_FP) && \ + ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) \ + ) \ + || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) + #if defined(MY_CPU_ARM64) \ + || !defined(Z7_CLANG_VERSION) \ + || defined(__ARM_NEON) && \ + (Z7_CLANG_VERSION < 170000 || \ + Z7_CLANG_VERSION > 170001) + #define USE_HW_SHA + #endif + #endif + #endif + #endif + +#ifdef USE_HW_SHA + +// #pragma message("=== Sha256 HW === ") + + +#if defined(__clang__) || defined(__GNUC__) +#if !defined(__ARM_FEATURE_SHA2) && \ + !defined(__ARM_FEATURE_CRYPTO) + #ifdef MY_CPU_ARM64 +#if defined(__clang__) + #define ATTRIB_SHA __attribute__((__target__("crypto"))) +#else + #define ATTRIB_SHA __attribute__((__target__("+crypto"))) +#endif + #else +#if defined(__clang__) && (__clang_major__ >= 1) + #define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2"))) +#else + #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) +#endif + #endif +#endif +#else + // _MSC_VER + // for arm32 + #define _ARM_USE_NEW_NEON_INTRINSICS +#endif + +#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) +#include +#else + +#if defined(__clang__) && __clang_major__ < 16 +#if !defined(__ARM_FEATURE_SHA2) && \ + !defined(__ARM_FEATURE_CRYPTO) +// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ") + Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1 +// #if defined(__clang__) && __clang_major__ < 13 + #define __ARM_FEATURE_CRYPTO 1 +// #else + #define __ARM_FEATURE_SHA2 1 +// #endif + Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif +#endif // clang + +#if defined(__clang__) + +#if defined(__ARM_ARCH) && __ARM_ARCH < 8 + Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +// #pragma message("#define __ARM_ARCH 8") + #undef __ARM_ARCH + #define __ARM_ARCH 8 + Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif + +#endif // clang + +#include + +#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \ + defined(__ARM_FEATURE_CRYPTO) && \ + defined(__ARM_FEATURE_SHA2) +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #undef 
__ARM_FEATURE_CRYPTO + #undef __ARM_FEATURE_SHA2 + #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") +#endif + +#endif // Z7_MSC_VER_ORIGINAL + +typedef uint32x4_t v128; +// typedef __n128 v128; // MSVC + +#ifdef MY_CPU_BE + #define MY_rev32_for_LE(x) x +#else + #define MY_rev32_for_LE(x) vrev32q_u8(x) +#endif + +#if 1 // 0 for debug +// for arm32: it works slower by some reason than direct code +/* +for arm32 it generates: +MSVC-2022, GCC-9: + vld1.32 {d18,d19}, [r10] + vst1.32 {d4,d5}, [r3] + vld1.8 {d20-d21}, [r4] +there is no align hint (like [r10:128]). So instruction allows unaligned access +*/ +#define LOAD_128_32(_p) vld1q_u32(_p) +#define LOAD_128_8(_p) vld1q_u8 (_p) +#define STORE_128_32(_p, _v) vst1q_u32(_p, _v) +#else +/* +for arm32: +MSVC-2022: + vldm r10,{d18,d19} + vstm r3,{d4,d5} + does it require strict alignment? +GCC-9: + vld1.64 {d30-d31}, [r0:64] + vldr d28, [r0, #16] + vldr d29, [r0, #24] + vst1.64 {d30-d31}, [r0:64] + vstr d28, [r0, #16] + vstr d29, [r0, #24] +there is hint [r0:64], so does it requires 64-bit alignment. +*/ +#define LOAD_128_32(_p) (*(const v128 *)(const void *)(_p)) +#define LOAD_128_8(_p) vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p)) +#define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v) +#endif + +#define LOAD_SHUFFLE(m, k) \ + m = vreinterpretq_u32_u8( \ + MY_rev32_for_LE( \ + LOAD_128_8(data + (k) * 16))); \ + +// K array must be aligned for 16-bytes at least. +extern +MY_ALIGN(64) +const UInt32 SHA256_K_ARRAY[64]; +#define K SHA256_K_ARRAY + +#define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src); +#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); + +#define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0) +#define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1) +#define NNN(m0, m1, m2, m3) + +#define R4(k, m0, m1, m2, m3, OP0, OP1) \ + msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \ + tmp = state0; \ + state0 = vsha256hq_u32( state0, state1, msg ); \ + state1 = vsha256h2q_u32( state1, tmp, msg ); \ + OP0(m0, m1, m2, m3); \ + OP1(m0, m1, m2, m3); \ + + +#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ + R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \ + R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \ + R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \ + R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \ + + +void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); +#ifdef ATTRIB_SHA +ATTRIB_SHA +#endif +void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) +{ + v128 state0, state1; + + if (numBlocks == 0) + return; + + state0 = LOAD_128_32(&state[0]); + state1 = LOAD_128_32(&state[4]); + + do + { + v128 state0_save, state1_save; + v128 m0, m1, m2, m3; + v128 msg, tmp; + + state0_save = state0; + state1_save = state1; + + LOAD_SHUFFLE (m0, 0) + LOAD_SHUFFLE (m1, 1) + LOAD_SHUFFLE (m2, 2) + LOAD_SHUFFLE (m3, 3) + + R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ) + R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) + R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) + R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ) + + state0 = vaddq_u32(state0, state0_save); + state1 = vaddq_u32(state1, state1_save); + + data += 64; + } + while (--numBlocks); + + STORE_128_32(&state[0], state0); + STORE_128_32(&state[4], state1); +} + +#endif // USE_HW_SHA + +#endif // MY_CPU_ARM_OR_ARM64 + + +#if !defined(USE_HW_SHA) && 
defined(Z7_USE_HW_SHA_STUB) +// #error Stop_Compiling_UNSUPPORTED_SHA +// #include +// We can compile this file with another C compiler, +// or we can compile asm version. +// So we can generate real code instead of this stub function. +// #include "Sha256.h" +// #if defined(_MSC_VER) +#pragma message("Sha256 HW-SW stub was used") +// #endif +void Z7_FASTCALL Sha256_UpdateBlocks (UInt32 state[8], const Byte *data, size_t numBlocks); +void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); +void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) +{ + Sha256_UpdateBlocks(state, data, numBlocks); + /* + UNUSED_VAR(state); + UNUSED_VAR(data); + UNUSED_VAR(numBlocks); + exit(1); + return; + */ +} +#endif + + +#undef K +#undef RND2 +#undef MY_rev32_for_LE + +#undef NNN +#undef LOAD_128 +#undef STORE_128 +#undef LOAD_SHUFFLE +#undef SM1 +#undef SM2 + + +#undef R4 +#undef R16 +#undef PREPARE_STATE +#undef USE_HW_SHA +#undef ATTRIB_SHA +#undef USE_VER_MIN +#undef Z7_USE_HW_SHA_STUB diff --git a/src/sdk/C/Sort.c b/src/sdk/C/Sort.c index e1097e3..20e3e69 100644 --- a/src/sdk/C/Sort.c +++ b/src/sdk/C/Sort.c @@ -1,141 +1,268 @@ /* Sort.c -- Sort functions -2014-04-05 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include "Sort.h" +#include "CpuArch.h" -#define HeapSortDown(p, k, size, temp) \ - { for (;;) { \ - size_t s = (k << 1); \ - if (s > size) break; \ - if (s < size && p[s + 1] > p[s]) s++; \ - if (temp >= p[s]) break; \ - p[k] = p[s]; k = s; \ - } p[k] = temp; } +#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ + || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \ + ) +// the code with prefetch is slow for small arrays on x86. +// So we disable prefetch for x86. +#ifndef MY_CPU_X86 + // #pragma message("Z7_PREFETCH : __builtin_prefetch") + #define Z7_PREFETCH(a) __builtin_prefetch((a)) +#endif -void HeapSort(UInt32 *p, size_t size) -{ - if (size <= 1) - return; - p--; - { - size_t i = size / 2; - do - { - UInt32 temp = p[i]; - size_t k = i; - HeapSortDown(p, k, size, temp) - } - while (--i != 0); - } - /* - do - { - size_t k = 1; - UInt32 temp = p[size]; - p[size--] = p[1]; - HeapSortDown(p, k, size, temp) - } - while (size > 1); - */ - while (size > 3) - { - UInt32 temp = p[size]; - size_t k = (p[3] > p[2]) ? 3 : 2; - p[size--] = p[1]; - p[1] = p[k]; - HeapSortDown(p, k, size, temp) - } - { - UInt32 temp = p[size]; - p[size] = p[1]; - if (size > 2 && p[2] < temp) - { - p[1] = p[2]; - p[2] = temp; - } - else - p[1] = temp; - } +#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200) + +#include "7zWindows.h" + +// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1. +// For example, clang-cl can generate "prefetcht2" instruction for +// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call. +// But we want to generate "prefetcht0" instruction. +// So for CLANG/GCC we must use __builtin_prefetch() in code branch above +// instead of PreFetchCacheLine() / _mm_prefetch(). + +// New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call. +// But old x86 cpus don't support "prefetcht0". +// So we will use PreFetchCacheLine(), only if we are sure that +// generated instruction is supported by all cpus of that isa. 
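/* Illustrative sketch (not part of the upstream patch): the Z7_PREFETCH hint selected
   in this block is later used inside the heap "sift down" loop of the reworked
   HeapSort. A minimal version of the idea, with hypothetical names and the same
   1-based heap layout as the old HeapSortDown macro, assuming only that Z7_PREFETCH
   expands to a cache prefetch hint: */
#if 0
static void SiftDown_WithPrefetch(UInt32 *p, size_t k, size_t size, UInt32 temp)
{
  for (;;)
  {
    size_t s = k * 2;                   /* first child of node k */
    if (s > size) break;
    if (s < size && p[s + 1] > p[s]) s++;
    if (temp >= p[s]) break;
  #ifdef Z7_PREFETCH
    if (s * 4 <= size)
      Z7_PREFETCH(p + s * 4);           /* request data a couple of tree levels ahead */
  #endif
    p[k] = p[s];                        /* move the larger child up and descend */
    k = s;
  }
  p[k] = temp;
}
#endif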
+#if defined(MY_CPU_AMD64) \ + || defined(MY_CPU_ARM64) \ + || defined(MY_CPU_IA64) +// we need to use additional braces for (a) in PreFetchCacheLine call, because +// PreFetchCacheLine macro doesn't use braces: +// #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l) + // #pragma message("Z7_PREFETCH : PreFetchCacheLine") + #define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a)) +#endif + +#endif // _WIN32 + + +#define PREFETCH_NO(p,k,s,size) + +#ifndef Z7_PREFETCH + #define SORT_PREFETCH(p,k,s,size) +#else + +// #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes +#define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch) +// #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch) + +#if PREFETCH_LEVEL == 0 + + #define SORT_PREFETCH(p,k,s,size) + +#else // PREFETCH_LEVEL != 0 + +/* +if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) + we prefetch one value per cache line. + Use it if array is aligned for cache line size (64 bytes) + or if array is small (less than L1 cache size). + +if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) + we perfetch all cache lines that can be required. + it can be faster for big unaligned arrays. +*/ + #define USE_PREFETCH_FOR_ALIGNED_ARRAY + +// s == k * 2 +#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64) + // x86 supports (lea r1*8+offset) + #define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL) +#else + #define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1)) +#endif + +#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) + #define PREFETCH_ADD_OFFSET 0 +#else + // last offset that can be reqiured in PREFETCH_LEVEL step: + #define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1) + #define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2 +#endif + +#if PREFETCH_LEVEL <= 3 + +#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2)); \ + }} +#else /* for unaligned array */ + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ + Z7_PREFETCH((p + s2)); \ + }} +#endif + +#else // PREFETCH_LEVEL > 3 + +#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2 - 16)); \ + Z7_PREFETCH((p + s2)); \ + }} +#else /* for unaligned array */ + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ + Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \ + Z7_PREFETCH((p + s2)); \ + }} +#endif + +#endif // PREFETCH_LEVEL > 3 +#endif // PREFETCH_LEVEL != 0 +#endif // Z7_PREFETCH + + +#if defined(MY_CPU_ARM64) \ + /* || defined(MY_CPU_AMD64) */ \ + /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */ + // we want to use cmov, if cmov is very fast: + // - this cmov version is slower for clang-x64. + // - this cmov version is faster for gcc-arm64 for some fast arm64 cpus. + #define Z7_FAST_CMOV_SUPPORTED +#endif + +#ifdef Z7_FAST_CMOV_SUPPORTED + // we want to use cmov here, if cmov is fast: new arm64 cpus. + // we want the compiler to use conditional move for this branch + #define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1; +#else + // use this branch, if cpu doesn't support fast conditional move. 
+ // it uses slow array access reading: + #define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow; +#endif + +#define HeapSortDown(p, k, size, temp, macro_prefetch) \ +{ \ + for (;;) { \ + UInt32 n0, n1; \ + size_t s = k * 2; \ + if (s >= size) { \ + if (s == size) { \ + n0 = p[s]; \ + p[k] = n0; \ + if (temp < n0) k = s; \ + } \ + break; \ + } \ + n0 = p[k * 2]; \ + n1 = p[k * 2 + 1]; \ + s += n0 < n1; \ + GET_MAX_VAL(n0, n1, p[s]) \ + if (temp >= n0) break; \ + macro_prefetch(p, k, s, size) \ + p[k] = n0; \ + k = s; \ + } \ + p[k] = temp; \ } -void HeapSort64(UInt64 *p, size_t size) + +/* +stage-1 : O(n) : + we generate intermediate partially sorted binary tree: + p[0] : it's additional item for better alignment of tree structure in memory. + p[1] + p[2] p[3] + p[4] p[5] p[6] p[7] + ... + p[x] >= p[x * 2] + p[x] >= p[x * 2 + 1] + +stage-2 : O(n)*log2(N): + we move largest item p[0] from head of tree to the end of array + and insert last item to sorted binary tree. +*/ + +// (p) must be aligned for cache line size (64-bytes) for best performance + +void Z7_FASTCALL HeapSort(UInt32 *p, size_t size) { - if (size <= 1) + if (size < 2) return; - p--; - { - size_t i = size / 2; - do - { - UInt64 temp = p[i]; - size_t k = i; - HeapSortDown(p, k, size, temp) - } - while (--i != 0); - } - /* - do + if (size == 2) { - size_t k = 1; - UInt64 temp = p[size]; - p[size--] = p[1]; - HeapSortDown(p, k, size, temp) - } - while (size > 1); - */ - while (size > 3) - { - UInt64 temp = p[size]; - size_t k = (p[3] > p[2]) ? 3 : 2; - p[size--] = p[1]; - p[1] = p[k]; - HeapSortDown(p, k, size, temp) + const UInt32 a0 = p[0]; + const UInt32 a1 = p[1]; + const unsigned k = a1 < a0; + p[k] = a0; + p[k ^ 1] = a1; + return; } { - UInt64 temp = p[size]; - p[size] = p[1]; - if (size > 2 && p[2] < temp) + // stage-1 : O(n) + // we transform array to partially sorted binary tree. + size_t i = --size / 2; + // (size) now is the index of the last item in tree, + // if (i) { - p[1] = p[2]; - p[2] = temp; + do + { + const UInt32 temp = p[i]; + size_t k = i; + HeapSortDown(p, k, size, temp, PREFETCH_NO) + } + while (--i); + } + { + const UInt32 temp = p[0]; + const UInt32 a1 = p[1]; + if (temp < a1) + { + size_t k = 1; + p[0] = a1; + HeapSortDown(p, k, size, temp, PREFETCH_NO) + } } - else - p[1] = temp; } -} -/* -#define HeapSortRefDown(p, vals, n, size, temp) \ - { size_t k = n; UInt32 val = vals[temp]; for (;;) { \ - size_t s = (k << 1); \ - if (s > size) break; \ - if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \ - if (val >= vals[p[s]]) break; \ - p[k] = p[s]; k = s; \ - } p[k] = temp; } - -void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size) -{ - if (size <= 1) + if (size < 3) + { + // size == 2 + const UInt32 a0 = p[0]; + p[0] = p[2]; + p[2] = a0; return; - p--; + } + if (size != 3) { - size_t i = size / 2; + // stage-2 : O(size) * log2(size): + // we move largest item p[0] from head to the end of array, + // and insert last item to sorted binary tree. do { - UInt32 temp = p[i]; - HeapSortRefDown(p, vals, i, size, temp); + const UInt32 temp = p[size]; + size_t k = p[2] < p[3] ? 
3 : 2; + p[size--] = p[0]; + p[0] = p[1]; + p[1] = p[k]; + HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO } - while (--i != 0); + while (size != 3); } - do { - UInt32 temp = p[size]; - p[size--] = p[1]; - HeapSortRefDown(p, vals, 1, size, temp); + const UInt32 a2 = p[2]; + const UInt32 a3 = p[3]; + const size_t k = a2 < a3; + p[2] = p[1]; + p[3] = p[0]; + p[k] = a3; + p[k ^ 1] = a2; } - while (size > 1); } -*/ diff --git a/src/sdk/C/Sort.h b/src/sdk/C/Sort.h index 2e2963a..de5a4e8 100644 --- a/src/sdk/C/Sort.h +++ b/src/sdk/C/Sort.h @@ -1,17 +1,14 @@ /* Sort.h -- Sort functions -2014-04-05 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ -#ifndef __7Z_SORT_H -#define __7Z_SORT_H +#ifndef ZIP7_INC_SORT_H +#define ZIP7_INC_SORT_H #include "7zTypes.h" EXTERN_C_BEGIN -void HeapSort(UInt32 *p, size_t size); -void HeapSort64(UInt64 *p, size_t size); - -/* void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size); */ +void Z7_FASTCALL HeapSort(UInt32 *p, size_t size); EXTERN_C_END diff --git a/src/sdk/C/SwapBytes.c b/src/sdk/C/SwapBytes.c new file mode 100644 index 0000000..9290592 --- /dev/null +++ b/src/sdk/C/SwapBytes.c @@ -0,0 +1,835 @@ +/* SwapBytes.c -- Byte Swap conversion filter +2024-03-01 : Igor Pavlov : Public domain */ + +#include "Precomp.h" + +#include "Compiler.h" +#include "CpuArch.h" +#include "RotateDefs.h" +#include "SwapBytes.h" + +typedef UInt16 CSwapUInt16; +typedef UInt32 CSwapUInt32; + +// #define k_SwapBytes_Mode_BASE 0 + +#ifdef MY_CPU_X86_OR_AMD64 + +#define k_SwapBytes_Mode_SSE2 1 +#define k_SwapBytes_Mode_SSSE3 2 +#define k_SwapBytes_Mode_AVX2 3 + + // #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) + #if defined(__clang__) && (__clang_major__ >= 4) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701) + #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_AVX2 + #define SWAP_ATTRIB_SSE2 __attribute__((__target__("sse2"))) + #define SWAP_ATTRIB_SSSE3 __attribute__((__target__("ssse3"))) + #define SWAP_ATTRIB_AVX2 __attribute__((__target__("avx2"))) + #elif defined(_MSC_VER) + #if (_MSC_VER == 1900) + #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX + #endif + #if (_MSC_VER >= 1900) + #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_AVX2 + #elif (_MSC_VER >= 1500) // (VS2008) + #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_SSSE3 + #elif (_MSC_VER >= 1310) // (VS2003) + #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_SSE2 + #endif + #endif // _MSC_VER + +/* +// for debug +#ifdef k_SwapBytes_Mode_MAX +#undef k_SwapBytes_Mode_MAX +#endif +*/ + +#ifndef k_SwapBytes_Mode_MAX +#define k_SwapBytes_Mode_MAX 0 +#endif + +#if (k_SwapBytes_Mode_MAX != 0) && defined(MY_CPU_AMD64) + #define k_SwapBytes_Mode_MIN k_SwapBytes_Mode_SSE2 +#else + #define k_SwapBytes_Mode_MIN 0 +#endif + +#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_AVX2) + #define USE_SWAP_AVX2 +#endif +#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSSE3) + #define USE_SWAP_SSSE3 +#endif +#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSE2) + #define USE_SWAP_128 +#endif + +#if k_SwapBytes_Mode_MAX <= k_SwapBytes_Mode_MIN || !defined(USE_SWAP_128) +#define FORCE_SWAP_MODE +#endif + + +#ifdef USE_SWAP_128 +/* + MMX + SSE + SSE2 + SSE3 + SSSE3 + SSE4.1 + SSE4.2 + SSE4A + AES + AVX, AVX2, FMA +*/ + +#include // sse2 +// typedef __m128i v128; + +#define SWAP2_128(i) { \ + const __m128i v = *(const __m128i *)(const void *)(items + (i) * 8); \ + *( __m128i *)( void *)(items + (i) * 8) = \ + _mm_or_si128( \ + _mm_slli_epi16(v, 8), \ + 
_mm_srli_epi16(v, 8)); } +// _mm_or_si128() has more ports to execute than _mm_add_epi16(). + +static +#ifdef SWAP_ATTRIB_SSE2 +SWAP_ATTRIB_SSE2 +#endif +void +Z7_FASTCALL +SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP2_128(0) SWAP2_128(1) items += 2 * 8; + SWAP2_128(0) SWAP2_128(1) items += 2 * 8; + } + while (items != lim); +} + +/* +// sse2 +#define SWAP4_128_pack(i) { \ + __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \ + __m128i v0 = _mm_unpacklo_epi8(v, mask); \ + __m128i v1 = _mm_unpackhi_epi8(v, mask); \ + v0 = _mm_shufflelo_epi16(v0, 0x1b); \ + v1 = _mm_shufflelo_epi16(v1, 0x1b); \ + v0 = _mm_shufflehi_epi16(v0, 0x1b); \ + v1 = _mm_shufflehi_epi16(v1, 0x1b); \ + *(__m128i *)(void *)(items + (i) * 4) = _mm_packus_epi16(v0, v1); } + +static +#ifdef SWAP_ATTRIB_SSE2 +SWAP_ATTRIB_SSE2 +#endif +void +Z7_FASTCALL +SwapBytes4_128_pack(CSwapUInt32 *items, const CSwapUInt32 *lim) +{ + const __m128i mask = _mm_setzero_si128(); + // const __m128i mask = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP4_128_pack(0); items += 1 * 4; + // SWAP4_128_pack(0); SWAP4_128_pack(1); items += 2 * 4; + } + while (items != lim); +} + +// sse2 +#define SWAP4_128_shift(i) { \ + __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \ + __m128i v2; \ + v2 = _mm_or_si128( \ + _mm_slli_si128(_mm_and_si128(v, mask), 1), \ + _mm_and_si128(_mm_srli_si128(v, 1), mask)); \ + v = _mm_or_si128( \ + _mm_slli_epi32(v, 24), \ + _mm_srli_epi32(v, 24)); \ + *(__m128i *)(void *)(items + (i) * 4) = _mm_or_si128(v2, v); } + +static +#ifdef SWAP_ATTRIB_SSE2 +SWAP_ATTRIB_SSE2 +#endif +void +Z7_FASTCALL +SwapBytes4_128_shift(CSwapUInt32 *items, const CSwapUInt32 *lim) +{ + #define M1 0xff00 + const __m128i mask = _mm_set_epi32(M1, M1, M1, M1); + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + // SWAP4_128_shift(0) SWAP4_128_shift(1) items += 2 * 4; + // SWAP4_128_shift(0) SWAP4_128_shift(1) items += 2 * 4; + SWAP4_128_shift(0); items += 1 * 4; + } + while (items != lim); +} +*/ + + +#if defined(USE_SWAP_SSSE3) || defined(USE_SWAP_AVX2) + +#define SWAP_SHUF_REV_SEQ_2_VALS(v) (v)+1, (v) +#define SWAP_SHUF_REV_SEQ_4_VALS(v) (v)+3, (v)+2, (v)+1, (v) + +#define SWAP2_SHUF_MASK_16_BYTES \ + SWAP_SHUF_REV_SEQ_2_VALS (0 * 2), \ + SWAP_SHUF_REV_SEQ_2_VALS (1 * 2), \ + SWAP_SHUF_REV_SEQ_2_VALS (2 * 2), \ + SWAP_SHUF_REV_SEQ_2_VALS (3 * 2), \ + SWAP_SHUF_REV_SEQ_2_VALS (4 * 2), \ + SWAP_SHUF_REV_SEQ_2_VALS (5 * 2), \ + SWAP_SHUF_REV_SEQ_2_VALS (6 * 2), \ + SWAP_SHUF_REV_SEQ_2_VALS (7 * 2) + +#define SWAP4_SHUF_MASK_16_BYTES \ + SWAP_SHUF_REV_SEQ_4_VALS (0 * 4), \ + SWAP_SHUF_REV_SEQ_4_VALS (1 * 4), \ + SWAP_SHUF_REV_SEQ_4_VALS (2 * 4), \ + SWAP_SHUF_REV_SEQ_4_VALS (3 * 4) + +#if defined(USE_SWAP_AVX2) +/* if we use 256_BIT_INIT_MASK, each static array mask will be larger for 16 bytes */ +// #define SWAP_USE_256_BIT_INIT_MASK +#endif + +#if defined(SWAP_USE_256_BIT_INIT_MASK) && defined(USE_SWAP_AVX2) +#define SWAP_MASK_INIT_SIZE 32 +#else +#define SWAP_MASK_INIT_SIZE 16 +#endif + +MY_ALIGN(SWAP_MASK_INIT_SIZE) +static const Byte k_ShufMask_Swap2[] = +{ + SWAP2_SHUF_MASK_16_BYTES + #if SWAP_MASK_INIT_SIZE > 16 + , SWAP2_SHUF_MASK_16_BYTES + #endif +}; + +MY_ALIGN(SWAP_MASK_INIT_SIZE) +static const Byte k_ShufMask_Swap4[] = +{ + SWAP4_SHUF_MASK_16_BYTES + #if SWAP_MASK_INIT_SIZE > 16 + , SWAP4_SHUF_MASK_16_BYTES + #endif +}; + + +#ifdef USE_SWAP_SSSE3 + +#include // ssse3 + 
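/* Illustrative note (not part of the upstream patch): with the masks defined above,
   one pshufb performs the whole byte swap for a 16-byte vector:
     k_ShufMask_Swap2 = { 1,0, 3,2, 5,4, ... }    - reverses the 2 bytes of each UInt16
     k_ShufMask_Swap4 = { 3,2,1,0, 7,6,5,4, ... } - reverses the 4 bytes of each UInt32
   e.g. the UInt16 value 0x1122 stored as bytes {0x22,0x11} becomes {0x11,0x22} = 0x2211.
   A scalar equivalent of one such step for the Swap2 case, for reference only: */
#if 0
{
  Byte bytes[16];   /* one 16-byte block of the item buffer */
  unsigned i;
  for (i = 0; i < 16; i += 2)
  {
    const Byte t = bytes[i];
    bytes[i] = bytes[i + 1];
    bytes[i + 1] = t;
  }
}
#endif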
+#define SHUF_128(i)   *(items + (i)) = \
+    _mm_shuffle_epi8(*(items + (i)), mask); // SSSE3
+
+// Z7_NO_INLINE
+static
+#ifdef SWAP_ATTRIB_SSSE3
+SWAP_ATTRIB_SSSE3
+#endif
+Z7_ATTRIB_NO_VECTORIZE
+void
+Z7_FASTCALL
+ShufBytes_128(void *items8, const void *lim8, const void *mask128_ptr)
+{
+  __m128i *items = (__m128i *)items8;
+  const __m128i *lim = (const __m128i *)lim8;
+  // const __m128i mask = _mm_set_epi8(SHUF_SWAP2_MASK_16_VALS);
+  // const __m128i mask = _mm_set_epi8(SHUF_SWAP4_MASK_16_VALS);
+  // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
+  // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
+  // const __m128i mask = *(const __m128i *)(const void *)&(k_ShufMask_Swap4[0]);
+  const __m128i mask = *(const __m128i *)mask128_ptr;
+  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
+  do
+  {
+    SHUF_128(0)  SHUF_128(1)  items += 2;
+    SHUF_128(0)  SHUF_128(1)  items += 2;
+  }
+  while (items != lim);
+}
+
+#endif // USE_SWAP_SSSE3
+
+
+
+#ifdef USE_SWAP_AVX2
+
+#include <immintrin.h> // avx, avx2
+#if defined(__clang__)
+#include <avxintrin.h>
+#include <avx2intrin.h>
+#endif
+
+#define SHUF_256(i)   *(items + (i)) = \
+    _mm256_shuffle_epi8(*(items + (i)), mask); // AVX2
+
+// Z7_NO_INLINE
+static
+#ifdef SWAP_ATTRIB_AVX2
+SWAP_ATTRIB_AVX2
+#endif
+Z7_ATTRIB_NO_VECTORIZE
+void
+Z7_FASTCALL
+ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr)
+{
+  __m256i *items = (__m256i *)items8;
+  const __m256i *lim = (const __m256i *)lim8;
+  /*
+  UNUSED_VAR(mask128_ptr)
+  __m256i mask =
+  for Swap4: _mm256_setr_epi8(SWAP4_SHUF_MASK_16_BYTES, SWAP4_SHUF_MASK_16_BYTES);
+  for Swap2: _mm256_setr_epi8(SWAP2_SHUF_MASK_16_BYTES, SWAP2_SHUF_MASK_16_BYTES);
+  */
+  const __m256i mask =
+  #if SWAP_MASK_INIT_SIZE > 16
+      *(const __m256i *)(const void *)mask128_ptr;
+  #else
+  /* msvc: broadcastsi128() version reserves the stack for no reason
+     msvc 19.29-: _mm256_insertf128_si256() / _mm256_set_m128i)) versions use non-avx movdqu xmm0,XMMWORD PTR [r8]
+     msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m,m) to vbroadcastf128(m) as we want
+  */
+      // _mm256_broadcastsi128_si256(*mask128_ptr);
+#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000)
+  #define MY_mm256_set_m128i(hi, lo)  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
+#else
+  #define MY_mm256_set_m128i  _mm256_set_m128i
+#endif
+      MY_mm256_set_m128i(
+        *(const __m128i *)mask128_ptr,
+        *(const __m128i *)mask128_ptr);
+  #endif
+
+  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
+  do
+  {
+    SHUF_256(0)  SHUF_256(1)  items += 2;
+    SHUF_256(0)  SHUF_256(1)  items += 2;
+  }
+  while (items != lim);
+}
+
+#endif // USE_SWAP_AVX2
+#endif // USE_SWAP_SSSE3 || USE_SWAP_AVX2
+#endif // USE_SWAP_128
+
+
+
+
+
+
+// compile message "NEON intrinsics not available with the soft-float ABI"
+#elif defined(MY_CPU_ARM_OR_ARM64) \
+  && defined(MY_CPU_LE) \
+  && !defined(Z7_DISABLE_ARM_NEON)
+
+  #if defined(__clang__) && (__clang_major__ >= 8) \
+    || defined(__GNUC__) && (__GNUC__ >= 6)
+    #if defined(__ARM_FP)
+    #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 4)) \
+       || defined(MY_CPU_ARM64)
+    #if defined(MY_CPU_ARM64) \
+       || !defined(Z7_CLANG_VERSION) \
+       || defined(__ARM_NEON)
+      #define USE_SWAP_128
+      #ifdef MY_CPU_ARM64
+        // #define SWAP_ATTRIB_NEON __attribute__((__target__("")))
+      #else
+#if defined(Z7_CLANG_VERSION)
+        // #define SWAP_ATTRIB_NEON __attribute__((__target__("neon")))
+#else
+        // #pragma message("SWAP_ATTRIB_NEON __attribute__((__target__(fpu=neon))")
+        #define SWAP_ATTRIB_NEON
__attribute__((__target__("fpu=neon"))) +#endif + #endif // MY_CPU_ARM64 + #endif // __ARM_NEON + #endif // __ARM_ARCH + #endif // __ARM_FP + + #elif defined(_MSC_VER) + #if (_MSC_VER >= 1910) + #define USE_SWAP_128 + #endif + #endif + + #ifdef USE_SWAP_128 + #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) + #include + #else + +/* +#if !defined(__ARM_NEON) +#if defined(Z7_GCC_VERSION) && (__GNUC__ < 5) \ + || defined(Z7_GCC_VERSION) && (__GNUC__ == 5) && (Z7_GCC_VERSION < 90201) \ + || defined(Z7_GCC_VERSION) && (__GNUC__ == 5) && (Z7_GCC_VERSION < 100100) +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +#pragma message("#define __ARM_NEON 1") +// #define __ARM_NEON 1 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif +#endif +*/ + #include + #endif + #endif + +#ifndef USE_SWAP_128 + #define FORCE_SWAP_MODE +#else + +#ifdef MY_CPU_ARM64 + // for debug : comment it + #define FORCE_SWAP_MODE +#else + #define k_SwapBytes_Mode_NEON 1 +#endif +// typedef uint8x16_t v128; +#define SWAP2_128(i) *(uint8x16_t *) (void *)(items + (i) * 8) = \ + vrev16q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 8)); +#define SWAP4_128(i) *(uint8x16_t *) (void *)(items + (i) * 4) = \ + vrev32q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 4)); + +// Z7_NO_INLINE +static +#ifdef SWAP_ATTRIB_NEON +SWAP_ATTRIB_NEON +#endif +Z7_ATTRIB_NO_VECTORIZE +void +Z7_FASTCALL +SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP2_128(0) SWAP2_128(1) items += 2 * 8; + SWAP2_128(0) SWAP2_128(1) items += 2 * 8; + } + while (items != lim); +} + +// Z7_NO_INLINE +static +#ifdef SWAP_ATTRIB_NEON +SWAP_ATTRIB_NEON +#endif +Z7_ATTRIB_NO_VECTORIZE +void +Z7_FASTCALL +SwapBytes4_128(CSwapUInt32 *items, const CSwapUInt32 *lim) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP4_128(0) SWAP4_128(1) items += 2 * 4; + SWAP4_128(0) SWAP4_128(1) items += 2 * 4; + } + while (items != lim); +} + +#endif // USE_SWAP_128 + +#else // MY_CPU_ARM_OR_ARM64 +#define FORCE_SWAP_MODE +#endif // MY_CPU_ARM_OR_ARM64 + + + + + + +#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_X86) + /* _byteswap_ushort() in MSVC x86 32-bit works via slow { mov dh, al; mov dl, ah } + So we use own versions of byteswap function */ + #if (_MSC_VER < 1400 ) // old MSVC-X86 without _rotr16() support + #define SWAP2_16(i) { UInt32 v = items[i]; v += (v << 16); v >>= 8; items[i] = (CSwapUInt16)v; } + #else // is new MSVC-X86 with fast _rotr16() + #include + #define SWAP2_16(i) { items[i] = _rotr16(items[i], 8); } + #endif +#else // is not MSVC-X86 + #define SWAP2_16(i) { CSwapUInt16 v = items[i]; items[i] = Z7_BSWAP16(v); } +#endif // MSVC-X86 + +#if defined(Z7_CPU_FAST_BSWAP_SUPPORTED) + #define SWAP4_32(i) { CSwapUInt32 v = items[i]; items[i] = Z7_BSWAP32(v); } +#else + #define SWAP4_32(i) \ + { UInt32 v = items[i]; \ + v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff); \ + v = rotlFixed(v, 16); \ + items[i] = v; } +#endif + + + + +#if defined(FORCE_SWAP_MODE) && defined(USE_SWAP_128) + #define DEFAULT_Swap2 SwapBytes2_128 + #if !defined(MY_CPU_X86_OR_AMD64) + #define DEFAULT_Swap4 SwapBytes4_128 + #endif +#endif + +#if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4) + +#define SWAP_BASE_FUNCS_PREFIXES \ +Z7_FORCE_INLINE \ +static \ +Z7_ATTRIB_NO_VECTOR \ +void Z7_FASTCALL + + +#if defined(MY_CPU_ARM_OR_ARM64) +#if defined(__clang__) +#pragma GCC diagnostic ignored "-Wlanguage-extension-token" +#endif +#endif + + +#ifdef MY_CPU_64BIT + +#if 
defined(MY_CPU_ARM64) \ + && defined(__ARM_ARCH) && (__ARM_ARCH >= 8) \ + && ( (defined(__GNUC__) && (__GNUC__ >= 4)) \ + || (defined(__clang__) && (__clang_major__ >= 4))) + + #define SWAP2_64_VAR(v) asm ("rev16 %x0,%x0" : "+r" (v)); + #define SWAP4_64_VAR(v) asm ("rev32 %x0,%x0" : "+r" (v)); + +#else // is not ARM64-GNU + +#if !defined(MY_CPU_X86_OR_AMD64) || (k_SwapBytes_Mode_MIN == 0) || !defined(USE_SWAP_128) + #define SWAP2_64_VAR(v) \ + v = ( 0x00ff00ff00ff00ff & (v >> 8)) \ + + ((0x00ff00ff00ff00ff & v) << 8); + /* plus gives faster code in MSVC */ +#endif + +#ifdef Z7_CPU_FAST_BSWAP_SUPPORTED + #define SWAP4_64_VAR(v) \ + v = Z7_BSWAP64(v); \ + v = Z7_ROTL64(v, 32); +#else + #define SWAP4_64_VAR(v) \ + v = ( 0x000000ff000000ff & (v >> 24)) \ + + ((0x000000ff000000ff & v) << 24 ) \ + + ( 0x0000ff000000ff00 & (v >> 8)) \ + + ((0x0000ff000000ff00 & v) << 8 ) \ + ; +#endif + +#endif // ARM64-GNU + + +#ifdef SWAP2_64_VAR + +#define SWAP2_64(i) { \ + UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 4); \ + SWAP2_64_VAR(v) \ + *(UInt64 *)(void *)(items + (i) * 4) = v; } + +SWAP_BASE_FUNCS_PREFIXES +SwapBytes2_64(CSwapUInt16 *items, const CSwapUInt16 *lim) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP2_64(0) SWAP2_64(1) items += 2 * 4; + SWAP2_64(0) SWAP2_64(1) items += 2 * 4; + } + while (items != lim); +} + + #define DEFAULT_Swap2 SwapBytes2_64 + #if !defined(FORCE_SWAP_MODE) + #define SWAP2_DEFAULT_MODE 0 + #endif +#else // !defined(SWAP2_64_VAR) + #define DEFAULT_Swap2 SwapBytes2_128 + #if !defined(FORCE_SWAP_MODE) + #define SWAP2_DEFAULT_MODE 1 + #endif +#endif // SWAP2_64_VAR + + +#define SWAP4_64(i) { \ + UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 2); \ + SWAP4_64_VAR(v) \ + *(UInt64 *)(void *)(items + (i) * 2) = v; } + +SWAP_BASE_FUNCS_PREFIXES +SwapBytes4_64(CSwapUInt32 *items, const CSwapUInt32 *lim) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP4_64(0) SWAP4_64(1) items += 2 * 2; + SWAP4_64(0) SWAP4_64(1) items += 2 * 2; + } + while (items != lim); +} + +#define DEFAULT_Swap4 SwapBytes4_64 + +#else // is not 64BIT + + +#if defined(MY_CPU_ARM_OR_ARM64) \ + && defined(__ARM_ARCH) && (__ARM_ARCH >= 6) \ + && ( (defined(__GNUC__) && (__GNUC__ >= 4)) \ + || (defined(__clang__) && (__clang_major__ >= 4))) + +#ifdef MY_CPU_64BIT + #define SWAP2_32_VAR(v) asm ("rev16 %w0,%w0" : "+r" (v)); +#else + #define SWAP2_32_VAR(v) asm ("rev16 %0,%0" : "+r" (v)); // for clang/gcc + // asm ("rev16 %r0,%r0" : "+r" (a)); // for gcc +#endif + +#elif defined(_MSC_VER) && (_MSC_VER < 1300) && defined(MY_CPU_X86) \ + || !defined(Z7_CPU_FAST_BSWAP_SUPPORTED) \ + || !defined(Z7_CPU_FAST_ROTATE_SUPPORTED) + // old msvc doesn't support _byteswap_ulong() + #define SWAP2_32_VAR(v) \ + v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff); + +#else // is not ARM and is not old-MSVC-X86 and fast BSWAP/ROTATE are supported + #define SWAP2_32_VAR(v) \ + v = Z7_BSWAP32(v); \ + v = rotlFixed(v, 16); + +#endif // GNU-ARM* + +#define SWAP2_32(i) { \ + UInt32 v = *(const UInt32 *)(const void *)(items + (i) * 2); \ + SWAP2_32_VAR(v); \ + *(UInt32 *)(void *)(items + (i) * 2) = v; } + + +SWAP_BASE_FUNCS_PREFIXES +SwapBytes2_32(CSwapUInt16 *items, const CSwapUInt16 *lim) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP2_32(0) SWAP2_32(1) items += 2 * 2; + SWAP2_32(0) SWAP2_32(1) items += 2 * 2; + } + while (items != lim); +} + + +SWAP_BASE_FUNCS_PREFIXES +SwapBytes4_32(CSwapUInt32 *items, const CSwapUInt32 *lim) +{ + 
Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP4_32(0) SWAP4_32(1) items += 2; + SWAP4_32(0) SWAP4_32(1) items += 2; + } + while (items != lim); +} + +#define DEFAULT_Swap2 SwapBytes2_32 +#define DEFAULT_Swap4 SwapBytes4_32 +#if !defined(FORCE_SWAP_MODE) + #define SWAP2_DEFAULT_MODE 0 +#endif + +#endif // MY_CPU_64BIT +#endif // if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4) + + + +#if !defined(FORCE_SWAP_MODE) +static unsigned g_SwapBytes_Mode; +#endif + +/* size of largest unrolled loop iteration: 128 bytes = 4 * 32 bytes (AVX). */ +#define SWAP_ITERATION_BLOCK_SIZE_MAX (1 << 7) + +// 32 bytes for (AVX) or 2 * 16-bytes for NEON. +#define SWAP_VECTOR_ALIGN_SIZE (1 << 5) + +Z7_NO_INLINE +void z7_SwapBytes2(CSwapUInt16 *items, size_t numItems) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--) + { + SWAP2_16(0) + items++; + } + { + const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt16) - 1; + size_t numItems2 = numItems; + CSwapUInt16 *lim; + numItems &= k_Align_Mask; + numItems2 &= ~(size_t)k_Align_Mask; + lim = items + numItems2; + if (numItems2 != 0) + { + #if !defined(FORCE_SWAP_MODE) + #ifdef MY_CPU_X86_OR_AMD64 + #ifdef USE_SWAP_AVX2 + if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3) + ShufBytes_256((__m256i *)(void *)items, + (const __m256i *)(const void *)lim, + (const __m128i *)(const void *)&(k_ShufMask_Swap2[0])); + else + #endif + #ifdef USE_SWAP_SSSE3 + if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3) + ShufBytes_128((__m128i *)(void *)items, + (const __m128i *)(const void *)lim, + (const __m128i *)(const void *)&(k_ShufMask_Swap2[0])); + else + #endif + #endif // MY_CPU_X86_OR_AMD64 + #if SWAP2_DEFAULT_MODE == 0 + if (g_SwapBytes_Mode != 0) + SwapBytes2_128(items, lim); + else + #endif + #endif // FORCE_SWAP_MODE + DEFAULT_Swap2(items, lim); + } + items = lim; + } + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0; numItems--) + { + SWAP2_16(0) + items++; + } +} + + +Z7_NO_INLINE +void z7_SwapBytes4(CSwapUInt32 *items, size_t numItems) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--) + { + SWAP4_32(0) + items++; + } + { + const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt32) - 1; + size_t numItems2 = numItems; + CSwapUInt32 *lim; + numItems &= k_Align_Mask; + numItems2 &= ~(size_t)k_Align_Mask; + lim = items + numItems2; + if (numItems2 != 0) + { + #if !defined(FORCE_SWAP_MODE) + #ifdef MY_CPU_X86_OR_AMD64 + #ifdef USE_SWAP_AVX2 + if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3) + ShufBytes_256((__m256i *)(void *)items, + (const __m256i *)(const void *)lim, + (const __m128i *)(const void *)&(k_ShufMask_Swap4[0])); + else + #endif + #ifdef USE_SWAP_SSSE3 + if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3) + ShufBytes_128((__m128i *)(void *)items, + (const __m128i *)(const void *)lim, + (const __m128i *)(const void *)&(k_ShufMask_Swap4[0])); + else + #endif + #else // MY_CPU_X86_OR_AMD64 + + if (g_SwapBytes_Mode != 0) + SwapBytes4_128(items, lim); + else + #endif // MY_CPU_X86_OR_AMD64 + #endif // FORCE_SWAP_MODE + DEFAULT_Swap4(items, lim); + } + items = lim; + } + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0; numItems--) + { + SWAP4_32(0) + items++; + } +} + + +// #define SHOW_HW_STATUS + +#ifdef SHOW_HW_STATUS +#include +#define PRF(x) x +#else +#define PRF(x) +#endif + 
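/* Usage sketch (illustrative, not part of the upstream patch): the intended call
   pattern for this filter, based on SwapBytes.h and the dispatchers above.
   z7_SwapBytesPrepare() picks the best available implementation once (SSSE3 / AVX2 /
   NEON, falling back to the scalar and 64-bit paths); after that z7_SwapBytes2() and
   z7_SwapBytes4() can be called on any buffer. The function ConvertSamples and its
   arguments are hypothetical. */
#if 0
#include "SwapBytes.h"

static void ConvertSamples(UInt16 *samples, size_t numSamples)
{
  z7_SwapBytesPrepare();               /* one-time CPU feature detection */
  z7_SwapBytes2(samples, numSamples);  /* byte-swap every 16-bit item in place */
}
#endif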
+void z7_SwapBytesPrepare(void) +{ +#ifndef FORCE_SWAP_MODE + unsigned mode = 0; // k_SwapBytes_Mode_BASE; + +#ifdef MY_CPU_ARM_OR_ARM64 + { + if (CPU_IsSupported_NEON()) + { + // #pragma message ("=== SwapBytes NEON") + PRF(printf("\n=== SwapBytes NEON\n");) + mode = k_SwapBytes_Mode_NEON; + } + } +#else // MY_CPU_ARM_OR_ARM64 + { + #ifdef USE_SWAP_AVX2 + if (CPU_IsSupported_AVX2()) + { + // #pragma message ("=== SwapBytes AVX2") + PRF(printf("\n=== SwapBytes AVX2\n");) + mode = k_SwapBytes_Mode_AVX2; + } + else + #endif + #ifdef USE_SWAP_SSSE3 + if (CPU_IsSupported_SSSE3()) + { + // #pragma message ("=== SwapBytes SSSE3") + PRF(printf("\n=== SwapBytes SSSE3\n");) + mode = k_SwapBytes_Mode_SSSE3; + } + else + #endif + #if !defined(MY_CPU_AMD64) + if (CPU_IsSupported_SSE2()) + #endif + { + // #pragma message ("=== SwapBytes SSE2") + PRF(printf("\n=== SwapBytes SSE2\n");) + mode = k_SwapBytes_Mode_SSE2; + } + } +#endif // MY_CPU_ARM_OR_ARM64 + g_SwapBytes_Mode = mode; + // g_SwapBytes_Mode = 0; // for debug +#endif // FORCE_SWAP_MODE + PRF(printf("\n=== SwapBytesPrepare\n");) +} + +#undef PRF diff --git a/src/sdk/C/SwapBytes.h b/src/sdk/C/SwapBytes.h new file mode 100644 index 0000000..d442467 --- /dev/null +++ b/src/sdk/C/SwapBytes.h @@ -0,0 +1,17 @@ +/* SwapBytes.h -- Byte Swap conversion filter +2023-04-02 : Igor Pavlov : Public domain */ + +#ifndef ZIP7_INC_SWAP_BYTES_H +#define ZIP7_INC_SWAP_BYTES_H + +#include "7zTypes.h" + +EXTERN_C_BEGIN + +void z7_SwapBytes2(UInt16 *data, size_t numItems); +void z7_SwapBytes4(UInt32 *data, size_t numItems); +void z7_SwapBytesPrepare(void); + +EXTERN_C_END + +#endif diff --git a/src/sdk/C/Threads.c b/src/sdk/C/Threads.c index 930ad27..177d1d9 100644 --- a/src/sdk/C/Threads.c +++ b/src/sdk/C/Threads.c @@ -1,17 +1,19 @@ /* Threads.c -- multithreading library -2017-06-26 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" -#ifndef UNDER_CE +#ifdef _WIN32 + +#ifndef USE_THREADS_CreateThread #include #endif #include "Threads.h" -static WRes GetError() +static WRes GetError(void) { - DWORD res = GetLastError(); + const DWORD res = GetLastError(); return res ? (WRes)res : 1; } @@ -29,28 +31,310 @@ WRes HandlePtr_Close(HANDLE *p) return 0; } -WRes Handle_WaitObject(HANDLE h) { return (WRes)WaitForSingleObject(h, INFINITE); } +WRes Handle_WaitObject(HANDLE h) +{ + DWORD dw = WaitForSingleObject(h, INFINITE); + /* + (dw) result: + WAIT_OBJECT_0 // 0 + WAIT_ABANDONED // 0x00000080 : is not compatible with Win32 Error space + WAIT_TIMEOUT // 0x00000102 : is compatible with Win32 Error space + WAIT_FAILED // 0xFFFFFFFF + */ + if (dw == WAIT_FAILED) + { + dw = GetLastError(); + if (dw == 0) + return WAIT_FAILED; + } + return (WRes)dw; +} + +#define Thread_Wait(p) Handle_WaitObject(*(p)) + +WRes Thread_Wait_Close(CThread *p) +{ + WRes res = Thread_Wait(p); + WRes res2 = Thread_Close(p); + return (res != 0 ? 
res : res2); +} + +typedef struct MY_PROCESSOR_NUMBER { + WORD Group; + BYTE Number; + BYTE Reserved; +} MY_PROCESSOR_NUMBER, *MY_PPROCESSOR_NUMBER; + +typedef struct MY_GROUP_AFFINITY { +#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 100000) + // KAFFINITY is not defined in old mingw + ULONG_PTR +#else + KAFFINITY +#endif + Mask; + WORD Group; + WORD Reserved[3]; +} MY_GROUP_AFFINITY, *MY_PGROUP_AFFINITY; + +typedef BOOL (WINAPI *Func_SetThreadGroupAffinity)( + HANDLE hThread, + CONST MY_GROUP_AFFINITY *GroupAffinity, + MY_PGROUP_AFFINITY PreviousGroupAffinity); + +typedef BOOL (WINAPI *Func_GetThreadGroupAffinity)( + HANDLE hThread, + MY_PGROUP_AFFINITY GroupAffinity); + +typedef BOOL (WINAPI *Func_GetProcessGroupAffinity)( + HANDLE hProcess, + PUSHORT GroupCount, + PUSHORT GroupArray); + +Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION + +#if 0 +#include +#define PRF(x) x +/* +-- + before call of SetThreadGroupAffinity() + GetProcessGroupAffinity return one group. + after call of SetThreadGroupAffinity(): + GetProcessGroupAffinity return more than group, + if SetThreadGroupAffinity() was to another group. +-- + GetProcessAffinityMask MS DOCs: + { + If the calling process contains threads in multiple groups, + the function returns zero for both affinity masks. + } + but tests in win10 with 2 groups (less than 64 cores total): + GetProcessAffinityMask() still returns non-zero affinity masks + even after SetThreadGroupAffinity() calls. +*/ +static void PrintProcess_Info() +{ + { + const + Func_GetProcessGroupAffinity fn_GetProcessGroupAffinity = + (Func_GetProcessGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "GetProcessGroupAffinity"); + if (fn_GetProcessGroupAffinity) + { + unsigned i; + USHORT GroupCounts[64]; + USHORT GroupCount = Z7_ARRAY_SIZE(GroupCounts); + BOOL boolRes = fn_GetProcessGroupAffinity(GetCurrentProcess(), + &GroupCount, GroupCounts); + printf("\n====== GetProcessGroupAffinity : " + "boolRes=%u GroupCounts = %u :", + boolRes, (unsigned)GroupCount); + for (i = 0; i < GroupCount; i++) + printf(" %u", GroupCounts[i]); + printf("\n"); + } + } + { + DWORD_PTR processAffinityMask, systemAffinityMask; + if (GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask)) + { + PRF(printf("\n====== GetProcessAffinityMask : " + ": processAffinityMask=%x, systemAffinityMask=%x\n", + (UInt32)processAffinityMask, (UInt32)systemAffinityMask);) + } + else + printf("\n==GetProcessAffinityMask FAIL"); + } +} +#else +#ifndef USE_THREADS_CreateThread +// #define PRF(x) +#endif +#endif WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) { /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ + + #ifdef USE_THREADS_CreateThread + + DWORD threadId; + *p = CreateThread(NULL, 0, func, param, 0, &threadId); - #ifdef UNDER_CE + #else - DWORD threadId; - *p = CreateThread(0, 0, func, param, 0, &threadId); + unsigned threadId; + *p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId)); - #else +#if 0 // 1 : for debug + { + DWORD_PTR prevMask; + DWORD_PTR affinity = 1 << 0; + prevMask = SetThreadAffinityMask(*p, (DWORD_PTR)affinity); + prevMask = prevMask; + } +#endif +#if 0 // 1 : for debug + { + /* win10: new thread will be created in same group that is assigned to parent thread + but affinity mask will contain all allowed threads of that group, + even if affinity mask of parent group is not full + win11: what group it will be created, if we have set + affinity of parent 
thread with ThreadGroupAffinity? + */ + const + Func_GetThreadGroupAffinity fn = + (Func_GetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "GetThreadGroupAffinity"); + if (fn) + { + // BOOL wres2; + MY_GROUP_AFFINITY groupAffinity; + memset(&groupAffinity, 0, sizeof(groupAffinity)); + /* wres2 = */ fn(*p, &groupAffinity); + PRF(printf("\n==Thread_Create cur = %6u GetThreadGroupAffinity(): " + "wres2_BOOL = %u, group=%u mask=%x\n", + GetCurrentThreadId(), + wres2, + groupAffinity.Group, + (UInt32)groupAffinity.Mask);) + } + } +#endif - unsigned threadId; - *p = (HANDLE)_beginthreadex(NULL, 0, func, param, 0, &threadId); - #endif /* maybe we must use errno here, but probably GetLastError() is also OK. */ return HandleToWRes(*p); } + +WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) +{ + #ifdef USE_THREADS_CreateThread + + UNUSED_VAR(affinity) + return Thread_Create(p, func, param); + + #else + + /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ + HANDLE h; + WRes wres; + unsigned threadId; + h = (HANDLE)(_beginthreadex(NULL, 0, func, param, CREATE_SUSPENDED, &threadId)); + *p = h; + wres = HandleToWRes(h); + if (h) + { + { + // DWORD_PTR prevMask = + SetThreadAffinityMask(h, (DWORD_PTR)affinity); + /* + if (prevMask == 0) + { + // affinity change is non-critical error, so we can ignore it + // wres = GetError(); + } + */ + } + { + const DWORD prevSuspendCount = ResumeThread(h); + /* ResumeThread() returns: + 0 : was_not_suspended + 1 : was_resumed + -1 : error + */ + if (prevSuspendCount == (DWORD)-1) + wres = GetError(); + } + } + + /* maybe we must use errno here, but probably GetLastError() is also OK. 
*/ + return wres; + + #endif +} + + +WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask) +{ +#ifdef USE_THREADS_CreateThread + + UNUSED_VAR(group) + UNUSED_VAR(affinityMask) + return Thread_Create(p, func, param); + +#else + + /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ + HANDLE h; + WRes wres; + unsigned threadId; + h = (HANDLE)(_beginthreadex(NULL, 0, func, param, CREATE_SUSPENDED, &threadId)); + *p = h; + wres = HandleToWRes(h); + if (h) + { + // PrintProcess_Info(); + { + const + Func_SetThreadGroupAffinity fn = + (Func_SetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "SetThreadGroupAffinity"); + if (fn) + { + // WRes wres2; + MY_GROUP_AFFINITY groupAffinity, prev_groupAffinity; + memset(&groupAffinity, 0, sizeof(groupAffinity)); + // groupAffinity.Mask must use only bits that supported by current group + // (groupAffinity.Mask = 0) means all allowed bits + groupAffinity.Mask = affinityMask; + groupAffinity.Group = (WORD)group; + // wres2 = + fn(h, &groupAffinity, &prev_groupAffinity); + /* + if (groupAffinity.Group == prev_groupAffinity.Group) + wres2 = wres2; + else + wres2 = wres2; + if (wres2 == 0) + { + wres2 = GetError(); + PRF(printf("\n==SetThreadGroupAffinity error: %u\n", wres2);) + } + else + { + PRF(printf("\n==Thread_Create_With_Group::SetThreadGroupAffinity()" + " threadId = %6u" + " group=%u mask=%x\n", + threadId, + prev_groupAffinity.Group, + (UInt32)prev_groupAffinity.Mask);) + } + */ + } + } + { + const DWORD prevSuspendCount = ResumeThread(h); + /* ResumeThread() returns: + 0 : was_not_suspended + 1 : was_resumed + -1 : error + */ + if (prevSuspendCount == (DWORD)-1) + wres = GetError(); + } + } + + /* maybe we must use errno here, but probably GetLastError() is also OK. */ + return wres; + + #endif +} + + static WRes Event_Create(CEvent *p, BOOL manualReset, int signaled) { *p = CreateEvent(NULL, manualReset, (signaled ? 
TRUE : FALSE), NULL); @@ -68,10 +352,22 @@ WRes AutoResetEvent_CreateNotSignaled(CAutoResetEvent *p) { return AutoResetEven WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount) { + // negative ((LONG)maxCount) is not supported in WIN32::CreateSemaphore() *p = CreateSemaphore(NULL, (LONG)initCount, (LONG)maxCount, NULL); return HandleToWRes(*p); } +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount) +{ + // if (Semaphore_IsCreated(p)) + { + WRes wres = Semaphore_Close(p); + if (wres != 0) + return wres; + } + return Semaphore_Create(p, initCount, maxCount); +} + static WRes Semaphore_Release(CSemaphore *p, LONG releaseCount, LONG *previousCount) { return BOOLToWRes(ReleaseSemaphore(*p, releaseCount, previousCount)); } WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 num) @@ -80,8 +376,13 @@ WRes Semaphore_Release1(CSemaphore *p) { return Semaphore_ReleaseN(p, 1); } WRes CriticalSection_Init(CCriticalSection *p) { - /* InitializeCriticalSection can raise only STATUS_NO_MEMORY exception */ + /* InitializeCriticalSection() can raise exception: + Windows XP, 2003 : can raise a STATUS_NO_MEMORY exception + Windows Vista+ : no exceptions */ #ifdef _MSC_VER + #ifdef __clang__ + #pragma GCC diagnostic ignored "-Wlanguage-extension-token" + #endif __try #endif { @@ -89,7 +390,423 @@ WRes CriticalSection_Init(CCriticalSection *p) /* InitializeCriticalSectionAndSpinCount(p, 0); */ } #ifdef _MSC_VER - __except (EXCEPTION_EXECUTE_HANDLER) { return 1; } + __except (EXCEPTION_EXECUTE_HANDLER) { return ERROR_NOT_ENOUGH_MEMORY; } #endif return 0; } + + + + +#else // _WIN32 + +// ---------- POSIX ---------- + +#if defined(__linux__) && !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__) +#ifndef Z7_AFFINITY_DISABLE +// _GNU_SOURCE can be required for pthread_setaffinity_np() / CPU_ZERO / CPU_SET +// clang < 3.6 : unknown warning group '-Wreserved-id-macro' +// clang 3.6 - 12.01 : gives warning "macro name is a reserved identifier" +// clang >= 13 : do not give warning +#if !defined(_GNU_SOURCE) +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +// #define _GNU_SOURCE +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif // !defined(_GNU_SOURCE) +#endif // Z7_AFFINITY_DISABLE +#endif // __linux__ + +#include "Threads.h" + +#include +#include +#include +#ifdef Z7_AFFINITY_SUPPORTED +// #include +#endif + + +// #include +// #define PRF(p) p +#define PRF(p) +#define Print(s) PRF(printf("\n%s\n", s);) + +WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet) +{ + // new thread in Posix probably inherits affinity from parrent thread + Print("Thread_Create_With_CpuSet") + + pthread_attr_t attr; + int ret; + // int ret2; + + p->_created = 0; + + RINOK(pthread_attr_init(&attr)) + + ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + if (!ret) + { + if (cpuSet) + { + // pthread_attr_setaffinity_np() is not supported for MUSL compile. + // so we check for __GLIBC__ here +#if defined(Z7_AFFINITY_SUPPORTED) && defined( __GLIBC__) + /* + printf("\n affinity :"); + unsigned i; + for (i = 0; i < sizeof(*cpuSet) && i < 8; i++) + { + Byte b = *((const Byte *)cpuSet + i); + char temp[32]; + #define GET_HEX_CHAR(t) ((char)(((t < 10) ? 
('0' + t) : ('A' + (t - 10))))) + temp[0] = GET_HEX_CHAR((b & 0xF)); + temp[1] = GET_HEX_CHAR((b >> 4)); + // temp[0] = GET_HEX_CHAR((b >> 4)); // big-endian + // temp[1] = GET_HEX_CHAR((b & 0xF)); // big-endian + temp[2] = 0; + printf("%s", temp); + } + printf("\n"); + */ + + // ret2 = + pthread_attr_setaffinity_np(&attr, sizeof(*cpuSet), cpuSet); + // if (ret2) ret = ret2; +#endif + } + + ret = pthread_create(&p->_tid, &attr, func, param); + + if (!ret) + { + p->_created = 1; + /* + if (cpuSet) + { + // ret2 = + pthread_setaffinity_np(p->_tid, sizeof(*cpuSet), cpuSet); + // if (ret2) ret = ret2; + } + */ + } + } + // ret2 = + pthread_attr_destroy(&attr); + // if (ret2 != 0) ret = ret2; + return ret; +} + + +WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) +{ + return Thread_Create_With_CpuSet(p, func, param, NULL); +} + +/* +WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinity) +{ + UNUSED_VAR(group) + return Thread_Create_With_Affinity(p, func, param, affinity); +} +*/ + +WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) +{ + Print("Thread_Create_WithAffinity") + CCpuSet cs; + unsigned i; + CpuSet_Zero(&cs); + for (i = 0; i < sizeof(affinity) * 8; i++) + { + if (affinity == 0) + break; + if (affinity & 1) + { + CpuSet_Set(&cs, i); + } + affinity >>= 1; + } + return Thread_Create_With_CpuSet(p, func, param, &cs); +} + + +WRes Thread_Close(CThread *p) +{ + // Print("Thread_Close") + int ret; + if (!p->_created) + return 0; + + ret = pthread_detach(p->_tid); + p->_tid = 0; + p->_created = 0; + return ret; +} + + +WRes Thread_Wait_Close(CThread *p) +{ + // Print("Thread_Wait_Close") + void *thread_return; + int ret; + if (!p->_created) + return EINVAL; + + ret = pthread_join(p->_tid, &thread_return); + // probably we can't use that (_tid) after pthread_join(), so we close thread here + p->_created = 0; + p->_tid = 0; + return ret; +} + + + +static WRes Event_Create(CEvent *p, int manualReset, int signaled) +{ + RINOK(pthread_mutex_init(&p->_mutex, NULL)) + RINOK(pthread_cond_init(&p->_cond, NULL)) + p->_manual_reset = manualReset; + p->_state = (signaled ? True : False); + p->_created = 1; + return 0; +} + +WRes ManualResetEvent_Create(CManualResetEvent *p, int signaled) + { return Event_Create(p, True, signaled); } +WRes ManualResetEvent_CreateNotSignaled(CManualResetEvent *p) + { return ManualResetEvent_Create(p, 0); } +WRes AutoResetEvent_Create(CAutoResetEvent *p, int signaled) + { return Event_Create(p, False, signaled); } +WRes AutoResetEvent_CreateNotSignaled(CAutoResetEvent *p) + { return AutoResetEvent_Create(p, 0); } + + +#if defined(Z7_LLVM_CLANG_VERSION) && (__clang_major__ == 13) +// freebsd: +#pragma GCC diagnostic ignored "-Wthread-safety-analysis" +#endif + +WRes Event_Set(CEvent *p) +{ + RINOK(pthread_mutex_lock(&p->_mutex)) + p->_state = True; + { + const int res1 = pthread_cond_broadcast(&p->_cond); + const int res2 = pthread_mutex_unlock(&p->_mutex); + return (res2 ? 
res2 : res1); + } +} + +WRes Event_Reset(CEvent *p) +{ + RINOK(pthread_mutex_lock(&p->_mutex)) + p->_state = False; + return pthread_mutex_unlock(&p->_mutex); +} + +WRes Event_Wait(CEvent *p) +{ + RINOK(pthread_mutex_lock(&p->_mutex)) + while (p->_state == False) + { + // ETIMEDOUT + // ret = + pthread_cond_wait(&p->_cond, &p->_mutex); + // if (ret != 0) break; + } + if (p->_manual_reset == False) + { + p->_state = False; + } + return pthread_mutex_unlock(&p->_mutex); +} + +WRes Event_Close(CEvent *p) +{ + if (!p->_created) + return 0; + p->_created = 0; + { + const int res1 = pthread_mutex_destroy(&p->_mutex); + const int res2 = pthread_cond_destroy(&p->_cond); + return (res1 ? res1 : res2); + } +} + + +WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount) +{ + if (initCount > maxCount || maxCount < 1) + return EINVAL; + RINOK(pthread_mutex_init(&p->_mutex, NULL)) + RINOK(pthread_cond_init(&p->_cond, NULL)) + p->_count = initCount; + p->_maxCount = maxCount; + p->_created = 1; + return 0; +} + + +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount) +{ + if (Semaphore_IsCreated(p)) + { + /* + WRes wres = Semaphore_Close(p); + if (wres != 0) + return wres; + */ + if (initCount > maxCount || maxCount < 1) + return EINVAL; + // return EINVAL; // for debug + p->_count = initCount; + p->_maxCount = maxCount; + return 0; + } + return Semaphore_Create(p, initCount, maxCount); +} + + +WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 releaseCount) +{ + UInt32 newCount; + int ret; + + if (releaseCount < 1) + return EINVAL; + + RINOK(pthread_mutex_lock(&p->_mutex)) + + newCount = p->_count + releaseCount; + if (newCount > p->_maxCount) + ret = ERROR_TOO_MANY_POSTS; // EINVAL; + else + { + p->_count = newCount; + ret = pthread_cond_broadcast(&p->_cond); + } + RINOK(pthread_mutex_unlock(&p->_mutex)) + return ret; +} + +WRes Semaphore_Wait(CSemaphore *p) +{ + RINOK(pthread_mutex_lock(&p->_mutex)) + while (p->_count < 1) + { + pthread_cond_wait(&p->_cond, &p->_mutex); + } + p->_count--; + return pthread_mutex_unlock(&p->_mutex); +} + +WRes Semaphore_Close(CSemaphore *p) +{ + if (!p->_created) + return 0; + p->_created = 0; + { + const int res1 = pthread_mutex_destroy(&p->_mutex); + const int res2 = pthread_cond_destroy(&p->_cond); + return (res1 ? 
res1 : res2);
+  }
+}
+
+
+
+WRes CriticalSection_Init(CCriticalSection *p)
+{
+  // Print("CriticalSection_Init")
+  if (!p)
+    return EINTR;
+  return pthread_mutex_init(&p->_mutex, NULL);
+}
+
+void CriticalSection_Enter(CCriticalSection *p)
+{
+  // Print("CriticalSection_Enter")
+  if (p)
+  {
+    // int ret =
+    pthread_mutex_lock(&p->_mutex);
+  }
+}
+
+void CriticalSection_Leave(CCriticalSection *p)
+{
+  // Print("CriticalSection_Leave")
+  if (p)
+  {
+    // int ret =
+    pthread_mutex_unlock(&p->_mutex);
+  }
+}
+
+void CriticalSection_Delete(CCriticalSection *p)
+{
+  // Print("CriticalSection_Delete")
+  if (p)
+  {
+    // int ret =
+    pthread_mutex_destroy(&p->_mutex);
+  }
+}
+
+LONG InterlockedIncrement(LONG volatile *addend)
+{
+  // Print("InterlockedIncrement")
+  #ifdef USE_HACK_UNSAFE_ATOMIC
+  LONG val = *addend + 1;
+  *addend = val;
+  return val;
+  #else
+
+  #if defined(__clang__) && (__clang_major__ >= 8)
+  #pragma GCC diagnostic ignored "-Watomic-implicit-seq-cst"
+  #endif
+  return __sync_add_and_fetch(addend, 1);
+  #endif
+}
+
+LONG InterlockedDecrement(LONG volatile *addend)
+{
+  // Print("InterlockedDecrement")
+  #ifdef USE_HACK_UNSAFE_ATOMIC
+  LONG val = *addend - 1;
+  *addend = val;
+  return val;
+  #else
+  return __sync_sub_and_fetch(addend, 1);
+  #endif
+}
+
+#endif // _WIN32
+
+WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p)
+{
+  if (Event_IsCreated(p))
+    return Event_Reset(p);
+  return AutoResetEvent_CreateNotSignaled(p);
+}
+
+void ThreadNextGroup_Init(CThreadNextGroup *p, UInt32 numGroups, UInt32 startGroup)
+{
+  // printf("\n====== ThreadNextGroup_Init numGroups = %x: startGroup=%x\n", numGroups, startGroup);
+  if (numGroups == 0)
+    numGroups = 1;
+  p->NumGroups = numGroups;
+  p->NextGroup = startGroup % numGroups;
+}
+
+
+UInt32 ThreadNextGroup_GetNext(CThreadNextGroup *p)
+{
+  const UInt32 next = p->NextGroup;
+  p->NextGroup = (next + 1) % p->NumGroups;
+  return next;
+}
+
+#undef PRF
+#undef Print
diff --git a/src/sdk/C/Threads.h b/src/sdk/C/Threads.h
index e53ace4..be12e6e 100644
--- a/src/sdk/C/Threads.h
+++ b/src/sdk/C/Threads.h
@@ -1,38 +1,163 @@
 /* Threads.h -- multithreading library
-2017-06-18 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
-#ifndef __7Z_THREADS_H
-#define __7Z_THREADS_H
+#ifndef ZIP7_INC_THREADS_H
+#define ZIP7_INC_THREADS_H
 #ifdef _WIN32
-#include <windows.h>
+#include "7zWindows.h"
+
+#else
+
+#include "Compiler.h"
+
+// #define Z7_AFFINITY_DISABLE
+#if defined(__linux__)
+#if !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__)
+#ifndef Z7_AFFINITY_DISABLE
+#define Z7_AFFINITY_SUPPORTED
+// #pragma message(" ==== Z7_AFFINITY_SUPPORTED")
+#if !defined(_GNU_SOURCE)
+// #pragma message(" ==== _GNU_SOURCE set")
+// we need _GNU_SOURCE for cpu_set_t, if we compile for MUSL
+Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
+#define _GNU_SOURCE
+Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
+#endif
+#endif
+#endif
+#endif
+
+#include <pthread.h>
+
 #endif
 #include "7zTypes.h"
 EXTERN_C_BEGIN
+#ifdef _WIN32
+
 WRes HandlePtr_Close(HANDLE *h);
 WRes Handle_WaitObject(HANDLE h);
 typedef HANDLE CThread;
-#define Thread_Construct(p) *(p) = NULL
+
+#define Thread_CONSTRUCT(p) { *(p) = NULL; }
 #define Thread_WasCreated(p) (*(p) != NULL)
 #define Thread_Close(p) HandlePtr_Close(p)
-#define Thread_Wait(p) Handle_WaitObject(*(p))
+// #define Thread_Wait(p) Handle_WaitObject(*(p))
-typedef
 #ifdef UNDER_CE
-  DWORD
+  // if (USE_THREADS_CreateThread is defined), we use _beginthreadex()
+  // if (USE_THREADS_CreateThread is not defined), we use CreateThread()
+  #define USE_THREADS_CreateThread
+#endif
+
+typedef
+  #ifdef USE_THREADS_CreateThread
+  DWORD
+  #else
+  unsigned
+  #endif
+  THREAD_FUNC_RET_TYPE;
+
+#define THREAD_FUNC_RET_ZERO 0
+
+typedef DWORD_PTR CAffinityMask;
+typedef DWORD_PTR CCpuSet;
+
+#define CpuSet_Zero(p) *(p) = (0)
+#define CpuSet_Set(p, cpu) *(p) |= ((DWORD_PTR)1 << (cpu))
+
+#else // _WIN32
+
+typedef struct
+{
+  pthread_t _tid;
+  int _created;
+} CThread;
+
+#define Thread_CONSTRUCT(p) { (p)->_tid = 0; (p)->_created = 0; }
+#define Thread_WasCreated(p) ((p)->_created != 0)
+WRes Thread_Close(CThread *p);
+// #define Thread_Wait Thread_Wait_Close
+
+typedef void * THREAD_FUNC_RET_TYPE;
+#define THREAD_FUNC_RET_ZERO NULL
+
+
+typedef UInt64 CAffinityMask;
+
+#ifdef Z7_AFFINITY_SUPPORTED
+
+typedef cpu_set_t CCpuSet;
+#define CpuSet_Zero(p) CPU_ZERO(p)
+#define CpuSet_Set(p, cpu) CPU_SET(cpu, p)
+#define CpuSet_IsSet(p, cpu) CPU_ISSET(cpu, p)
+
+#else
+
+typedef UInt64 CCpuSet;
+#define CpuSet_Zero(p) *(p) = (0)
+#define CpuSet_Set(p, cpu) *(p) |= ((UInt64)1 << (cpu))
+#define CpuSet_IsSet(p, cpu) ((*(p) & ((UInt64)1 << (cpu))) != 0)
+
+#endif
+
+
+#endif // _WIN32
+
+
+#define THREAD_FUNC_CALL_TYPE Z7_STDCALL
+
+#if defined(_WIN32) && defined(__GNUC__)
+/* GCC compiler for x86 32-bit uses the rule:
+   the stack is 16-byte aligned before CALL instruction for function calling.
+   But only root function main() contains instructions that
+   set 16-byte alignment for stack pointer. And other functions
+   just keep alignment, if it was set in some parent function.
+
+   The problem:
+   if we create new thread in MinGW (GCC) 32-bit x86 via _beginthreadex() or CreateThread(),
+   the root function of thread doesn't set 16-byte alignment.
+   And stack frames in all child functions also will be unaligned in that case.
+
+   Here we set (force_align_arg_pointer) attribute for root function of new thread.
+   Do we need (force_align_arg_pointer) also for other systems?
*/ + + #define THREAD_FUNC_ATTRIB_ALIGN_ARG __attribute__((force_align_arg_pointer)) + // #define THREAD_FUNC_ATTRIB_ALIGN_ARG // for debug : bad alignment in SSE functions #else - unsigned + #define THREAD_FUNC_ATTRIB_ALIGN_ARG #endif - THREAD_FUNC_RET_TYPE; -#define THREAD_FUNC_CALL_TYPE MY_STD_CALL -#define THREAD_FUNC_DECL THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE +#define THREAD_FUNC_DECL THREAD_FUNC_ATTRIB_ALIGN_ARG THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE + typedef THREAD_FUNC_RET_TYPE (THREAD_FUNC_CALL_TYPE * THREAD_FUNC_TYPE)(void *); WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param); +WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity); +WRes Thread_Wait_Close(CThread *p); + +#ifdef _WIN32 +WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask); +#define Thread_Create_With_CpuSet(p, func, param, cs) \ + Thread_Create_With_Affinity(p, func, param, *cs) +#else +WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet); +#endif + +typedef struct +{ + unsigned NumGroups; + unsigned NextGroup; +} CThreadNextGroup; + +void ThreadNextGroup_Init(CThreadNextGroup *p, unsigned numGroups, unsigned startGroup); +unsigned ThreadNextGroup_GetNext(CThreadNextGroup *p); + + +#ifdef _WIN32 typedef HANDLE CEvent; typedef CEvent CAutoResetEvent; @@ -54,6 +179,7 @@ typedef HANDLE CSemaphore; #define Semaphore_Close(p) HandlePtr_Close(p) #define Semaphore_Wait(p) Handle_WaitObject(*(p)) WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount); +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount); WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 num); WRes Semaphore_Release1(CSemaphore *p); @@ -63,6 +189,72 @@ WRes CriticalSection_Init(CCriticalSection *p); #define CriticalSection_Enter(p) EnterCriticalSection(p) #define CriticalSection_Leave(p) LeaveCriticalSection(p) + +#else // _WIN32 + +typedef struct +{ + int _created; + int _manual_reset; + int _state; + pthread_mutex_t _mutex; + pthread_cond_t _cond; +} CEvent; + +typedef CEvent CAutoResetEvent; +typedef CEvent CManualResetEvent; + +#define Event_Construct(p) (p)->_created = 0 +#define Event_IsCreated(p) ((p)->_created) + +WRes ManualResetEvent_Create(CManualResetEvent *p, int signaled); +WRes ManualResetEvent_CreateNotSignaled(CManualResetEvent *p); +WRes AutoResetEvent_Create(CAutoResetEvent *p, int signaled); +WRes AutoResetEvent_CreateNotSignaled(CAutoResetEvent *p); + +WRes Event_Set(CEvent *p); +WRes Event_Reset(CEvent *p); +WRes Event_Wait(CEvent *p); +WRes Event_Close(CEvent *p); + + +typedef struct +{ + int _created; + UInt32 _count; + UInt32 _maxCount; + pthread_mutex_t _mutex; + pthread_cond_t _cond; +} CSemaphore; + +#define Semaphore_Construct(p) (p)->_created = 0 +#define Semaphore_IsCreated(p) ((p)->_created) + +WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount); +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount); +WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 num); +#define Semaphore_Release1(p) Semaphore_ReleaseN(p, 1) +WRes Semaphore_Wait(CSemaphore *p); +WRes Semaphore_Close(CSemaphore *p); + + +typedef struct +{ + pthread_mutex_t _mutex; +} CCriticalSection; + +WRes CriticalSection_Init(CCriticalSection *p); +void CriticalSection_Delete(CCriticalSection *cs); +void CriticalSection_Enter(CCriticalSection *cs); +void CriticalSection_Leave(CCriticalSection 
*cs); + +LONG InterlockedIncrement(LONG volatile *addend); +LONG InterlockedDecrement(LONG volatile *addend); + +#endif // _WIN32 + +WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p); + EXTERN_C_END #endif diff --git a/src/sdk/C/Util/7z/7z.dsp b/src/sdk/C/Util/7z/7z.dsp index be0f0a7..474c660 100644 --- a/src/sdk/C/Util/7z/7z.dsp +++ b/src/sdk/C/Util/7z/7z.dsp @@ -42,7 +42,7 @@ RSC=rc.exe # PROP Ignore_Export_Lib 0 # PROP Target_Dir "" # ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c -# ADD CPP /nologo /MD /W4 /WX /GX /O2 /D "NDEBUG" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /FAcs /Yu"Precomp.h" /FD /c +# ADD CPP /nologo /MD /W4 /WX /GX /O2 /D "NDEBUG" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /D "Z7_EXTRACT_ONLY" /FAcs /Yu"Precomp.h" /FD /c # ADD BASE RSC /l 0x419 /d "NDEBUG" # ADD RSC /l 0x419 /d "NDEBUG" BSC32=bscmake.exe @@ -67,7 +67,7 @@ LINK32=link.exe # PROP Ignore_Export_Lib 0 # PROP Target_Dir "" # ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c -# ADD CPP /nologo /W4 /WX /Gm /GX /ZI /Od /D "_DEBUG" /D "_SZ_ALLOC_DEBUG2" /D "_SZ_NO_INT_64_A" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /Yu"Precomp.h" /FD /GZ /c +# ADD CPP /nologo /W4 /WX /Gm /GX /ZI /Od /D "_DEBUG" /D "_SZ_ALLOC_DEBUG2" /D "_SZ_NO_INT_64_A" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /D "Z7_EXTRACT_ONLY" /Yu"Precomp.h" /FD /GZ /c # ADD BASE RSC /l 0x419 /d "_DEBUG" # ADD RSC /l 0x419 /d "_DEBUG" BSC32=bscmake.exe @@ -145,6 +145,10 @@ SOURCE=..\..\7zTypes.h # End Source File # Begin Source File +SOURCE=..\..\7zWindows.h +# End Source File +# Begin Source File + SOURCE=..\..\Bcj2.c # End Source File # Begin Source File @@ -230,6 +234,10 @@ SOURCE=.\Precomp.c # End Source File # Begin Source File +SOURCE=..\..\Precomp.h +# End Source File +# Begin Source File + SOURCE=.\Precomp.h # End Source File # End Group diff --git a/src/sdk/C/Util/7z/7zMain.c b/src/sdk/C/Util/7z/7zMain.c index 6ccc830..6baf979 100644 --- a/src/sdk/C/Util/7z/7zMain.c +++ b/src/sdk/C/Util/7z/7zMain.c @@ -1,34 +1,41 @@ /* 7zMain.c - Test application for 7z Decoder -2019-02-02 : Igor Pavlov : Public domain */ +2024-02-28 : Igor Pavlov : Public domain */ #include "Precomp.h" #include #include -#include "../../CpuArch.h" - -#include "../../7z.h" -#include "../../7zAlloc.h" -#include "../../7zBuf.h" -#include "../../7zCrc.h" -#include "../../7zFile.h" -#include "../../7zVersion.h" - #ifndef USE_WINDOWS_FILE /* for mkdir */ #ifdef _WIN32 #include #else +#include +#include +#ifdef __GNUC__ +#include +#endif +#include +// #include #include #include #endif #endif +#include "../../7zFile.h" +#include "../../7z.h" +#include "../../7zAlloc.h" +#include "../../7zBuf.h" +#include "../../7zCrc.h" +#include "../../7zVersion.h" + +#include "../../CpuArch.h" #define kInputBufSize ((size_t)1 << 18) static const ISzAlloc g_Alloc = { SzAlloc, SzFree }; +// static const ISzAlloc g_Alloc_temp = { SzAllocTemp, SzFreeTemp }; static void Print(const char *s) @@ -46,19 +53,19 @@ static int Buf_EnsureSize(CBuf *dest, size_t size) } #ifndef _WIN32 -#define _USE_UTF8 +#define MY_USE_UTF8 #endif -/* #define _USE_UTF8 */ +/* #define MY_USE_UTF8 */ -#ifdef _USE_UTF8 +#ifdef MY_USE_UTF8 -#define _UTF8_START(n) (0x100 - (1 << (7 - (n)))) +#define MY_UTF8_START(n) (0x100 - (1 << (7 - (n)))) -#define _UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 6)) +#define MY_UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 
6)) -#define _UTF8_HEAD(n, val) ((Byte)(_UTF8_START(n) + (val >> (6 * (n))))) -#define _UTF8_CHAR(n, val) ((Byte)(0x80 + (((val) >> (6 * (n))) & 0x3F))) +#define MY_UTF8_HEAD(n, val) ((Byte)(MY_UTF8_START(n) + (val >> (6 * (n))))) +#define MY_UTF8_CHAR(n, val) ((Byte)(0x80 + (((val) >> (6 * (n))) & 0x3F))) static size_t Utf16_To_Utf8_Calc(const UInt16 *src, const UInt16 *srcLim) { @@ -75,7 +82,7 @@ static size_t Utf16_To_Utf8_Calc(const UInt16 *src, const UInt16 *srcLim) if (val < 0x80) continue; - if (val < _UTF8_RANGE(1)) + if (val < MY_UTF8_RANGE(1)) { size++; continue; @@ -83,7 +90,7 @@ static size_t Utf16_To_Utf8_Calc(const UInt16 *src, const UInt16 *srcLim) if (val >= 0xD800 && val < 0xDC00 && src != srcLim) { - UInt32 c2 = *src; + const UInt32 c2 = *src; if (c2 >= 0xDC00 && c2 < 0xE000) { src++; @@ -108,37 +115,37 @@ static Byte *Utf16_To_Utf8(Byte *dest, const UInt16 *src, const UInt16 *srcLim) if (val < 0x80) { - *dest++ = (char)val; + *dest++ = (Byte)val; continue; } - if (val < _UTF8_RANGE(1)) + if (val < MY_UTF8_RANGE(1)) { - dest[0] = _UTF8_HEAD(1, val); - dest[1] = _UTF8_CHAR(0, val); + dest[0] = MY_UTF8_HEAD(1, val); + dest[1] = MY_UTF8_CHAR(0, val); dest += 2; continue; } if (val >= 0xD800 && val < 0xDC00 && src != srcLim) { - UInt32 c2 = *src; + const UInt32 c2 = *src; if (c2 >= 0xDC00 && c2 < 0xE000) { src++; val = (((val - 0xD800) << 10) | (c2 - 0xDC00)) + 0x10000; - dest[0] = _UTF8_HEAD(3, val); - dest[1] = _UTF8_CHAR(2, val); - dest[2] = _UTF8_CHAR(1, val); - dest[3] = _UTF8_CHAR(0, val); + dest[0] = MY_UTF8_HEAD(3, val); + dest[1] = MY_UTF8_CHAR(2, val); + dest[2] = MY_UTF8_CHAR(1, val); + dest[3] = MY_UTF8_CHAR(0, val); dest += 4; continue; } } - dest[0] = _UTF8_HEAD(2, val); - dest[1] = _UTF8_CHAR(1, val); - dest[2] = _UTF8_CHAR(0, val); + dest[0] = MY_UTF8_HEAD(2, val); + dest[1] = MY_UTF8_CHAR(1, val); + dest[2] = MY_UTF8_CHAR(0, val); dest += 3; } } @@ -156,27 +163,27 @@ static SRes Utf16_To_Utf8Buf(CBuf *dest, const UInt16 *src, size_t srcLen) #endif static SRes Utf16_To_Char(CBuf *buf, const UInt16 *s - #ifndef _USE_UTF8 + #ifndef MY_USE_UTF8 , UINT codePage #endif ) { - unsigned len = 0; - for (len = 0; s[len] != 0; len++); + size_t len = 0; + for (len = 0; s[len] != 0; len++) {} - #ifndef _USE_UTF8 + #ifndef MY_USE_UTF8 { - unsigned size = len * 3 + 100; + const size_t size = len * 3 + 100; if (!Buf_EnsureSize(buf, size)) return SZ_ERROR_MEM; { buf->data[0] = 0; if (len != 0) { - char defaultChar = '_'; + const char defaultChar = '_'; BOOL defUsed; - unsigned numChars = 0; - numChars = WideCharToMultiByte(codePage, 0, (LPCWSTR)s, len, (char *)buf->data, size, &defaultChar, &defUsed); + const unsigned numChars = (unsigned)WideCharToMultiByte( + codePage, 0, (LPCWSTR)s, (int)len, (char *)buf->data, (int)size, &defaultChar, &defUsed); if (numChars == 0 || numChars >= size) return SZ_ERROR_FAIL; buf->data[numChars] = 0; @@ -192,8 +199,8 @@ static SRes Utf16_To_Char(CBuf *buf, const UInt16 *s #ifdef _WIN32 #ifndef USE_WINDOWS_FILE static UINT g_FileCodePage = CP_ACP; + #define MY_FILE_CODE_PAGE_PARAM ,g_FileCodePage #endif - #define MY_FILE_CODE_PAGE_PARAM ,g_FileCodePage #else #define MY_FILE_CODE_PAGE_PARAM #endif @@ -209,7 +216,7 @@ static WRes MyCreateDir(const UInt16 *name) CBuf buf; WRes res; Buf_Init(&buf); - RINOK(Utf16_To_Char(&buf, name MY_FILE_CODE_PAGE_PARAM)); + RINOK(Utf16_To_Char(&buf, name MY_FILE_CODE_PAGE_PARAM)) res = #ifdef _WIN32 @@ -232,7 +239,7 @@ static WRes OutFile_OpenUtf16(CSzFile *p, const UInt16 *name) CBuf buf; WRes res; 
Buf_Init(&buf); - RINOK(Utf16_To_Char(&buf, name MY_FILE_CODE_PAGE_PARAM)); + RINOK(Utf16_To_Char(&buf, name MY_FILE_CODE_PAGE_PARAM)) res = OutFile_Open(p, (const char *)buf.data); Buf_Free(&buf, &g_Alloc); return res; @@ -246,7 +253,7 @@ static SRes PrintString(const UInt16 *s) SRes res; Buf_Init(&buf); res = Utf16_To_Char(&buf, s - #ifndef _USE_UTF8 + #ifndef MY_USE_UTF8 , CP_OEMCP #endif ); @@ -300,17 +307,143 @@ static void UIntToStr_2(char *s, unsigned value) s[1] = (char)('0' + (value % 10)); } + #define PERIOD_4 (4 * 365 + 1) #define PERIOD_100 (PERIOD_4 * 25 - 1) #define PERIOD_400 (PERIOD_100 * 4 + 1) -static void ConvertFileTimeToString(const CNtfsFileTime *nt, char *s) + + +#ifndef _WIN32 + +// MS uses long for BOOL, but long is 32-bit in MS. So we use int. +// typedef long BOOL; +typedef int BOOL; + +typedef struct +{ + DWORD dwLowDateTime; + DWORD dwHighDateTime; +} FILETIME; + +static LONG TIME_GetBias(void) +{ + const time_t utc = time(NULL); + struct tm *ptm = localtime(&utc); + const int localdaylight = ptm->tm_isdst; /* daylight for local timezone */ + ptm = gmtime(&utc); + ptm->tm_isdst = localdaylight; /* use local daylight, not that of Greenwich */ + return (int)(mktime(ptm) - utc); +} + +#define TICKS_PER_SEC 10000000 + +#define GET_TIME_64(pft) ((pft)->dwLowDateTime | ((UInt64)(pft)->dwHighDateTime << 32)) + +#define SET_FILETIME(ft, v64) \ + (ft)->dwLowDateTime = (DWORD)v64; \ + (ft)->dwHighDateTime = (DWORD)(v64 >> 32); + +#define WINAPI +#define TRUE 1 + +static BOOL WINAPI FileTimeToLocalFileTime(const FILETIME *fileTime, FILETIME *localFileTime) +{ + UInt64 v = GET_TIME_64(fileTime); + v = (UInt64)((Int64)v - (Int64)TIME_GetBias() * TICKS_PER_SEC); + SET_FILETIME(localFileTime, v) + return TRUE; +} + +static const UInt32 kNumTimeQuantumsInSecond = 10000000; +static const UInt32 kFileTimeStartYear = 1601; +static const UInt32 kUnixTimeStartYear = 1970; + +static Int64 Time_FileTimeToUnixTime64(const FILETIME *ft) +{ + const UInt64 kUnixTimeOffset = + (UInt64)60 * 60 * 24 * (89 + 365 * (kUnixTimeStartYear - kFileTimeStartYear)); + const UInt64 winTime = GET_TIME_64(ft); + return (Int64)(winTime / kNumTimeQuantumsInSecond) - (Int64)kUnixTimeOffset; +} + +#if defined(_AIX) + #define MY_ST_TIMESPEC st_timespec +#else + #define MY_ST_TIMESPEC timespec +#endif + +static void FILETIME_To_timespec(const FILETIME *ft, struct MY_ST_TIMESPEC *ts) +{ + if (ft) + { + const Int64 sec = Time_FileTimeToUnixTime64(ft); + // time_t is long + const time_t sec2 = (time_t)sec; + if (sec2 == sec) + { + ts->tv_sec = sec2; + { + const UInt64 winTime = GET_TIME_64(ft); + ts->tv_nsec = (long)((winTime % 10000000) * 100); + } + return; + } + } + // else + { + ts->tv_sec = 0; + // ts.tv_nsec = UTIME_NOW; // set to the current time + ts->tv_nsec = UTIME_OMIT; // keep old timesptamp + } +} + +static WRes Set_File_FILETIME(const UInt16 *name, const FILETIME *mTime) +{ + struct timespec times[2]; + + const int flags = 0; // follow link + // = AT_SYMLINK_NOFOLLOW; // don't follow link + + CBuf buf; + int res; + Buf_Init(&buf); + RINOK(Utf16_To_Char(&buf, name MY_FILE_CODE_PAGE_PARAM)) + FILETIME_To_timespec(NULL, ×[0]); + FILETIME_To_timespec(mTime, ×[1]); + res = utimensat(AT_FDCWD, (const char *)buf.data, times, flags); + Buf_Free(&buf, &g_Alloc); + if (res == 0) + return 0; + return errno; +} + +#endif + +static void NtfsFileTime_to_FILETIME(const CNtfsFileTime *t, FILETIME *ft) +{ + ft->dwLowDateTime = (DWORD)(t->Low); + ft->dwHighDateTime = (DWORD)(t->High); +} + +static void 
ConvertFileTimeToString(const CNtfsFileTime *nTime, char *s) { unsigned year, mon, hour, min, sec; Byte ms[] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }; - unsigned t; + UInt32 t; UInt32 v; - UInt64 v64 = nt->Low | ((UInt64)nt->High << 32); + // UInt64 v64 = nt->Low | ((UInt64)nt->High << 32); + UInt64 v64; + { + FILETIME fileTime, locTime; + NtfsFileTime_to_FILETIME(nTime, &fileTime); + if (!FileTimeToLocalFileTime(&fileTime, &locTime)) + { + locTime.dwHighDateTime = + locTime.dwLowDateTime = 0; + } + v64 = locTime.dwLowDateTime | ((UInt64)locTime.dwHighDateTime << 32); + } v64 /= 10000000; sec = (unsigned)(v64 % 60); v64 /= 60; min = (unsigned)(v64 % 60); v64 /= 60; @@ -329,7 +462,7 @@ static void ConvertFileTimeToString(const CNtfsFileTime *nt, char *s) ms[1] = 29; for (mon = 0;; mon++) { - unsigned d = ms[mon]; + const UInt32 d = ms[mon]; if (v < d) break; v -= d; @@ -342,7 +475,7 @@ static void ConvertFileTimeToString(const CNtfsFileTime *nt, char *s) UIntToStr_2(s, sec); s[2] = 0; } -static void PrintLF() +static void PrintLF(void) { Print("\n"); } @@ -354,6 +487,43 @@ static void PrintError(char *s) PrintLF(); } +static void PrintError_WRes(const char *message, WRes wres) +{ + Print("\nERROR: "); + Print(message); + PrintLF(); + { + char s[32]; + UIntToStr(s, (unsigned)wres, 1); + Print("System error code: "); + Print(s); + } + // sprintf(buffer + strlen(buffer), "\nSystem error code: %d", (unsigned)wres); + #ifdef _WIN32 + { + char *s = NULL; + if (FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, wres, 0, (LPSTR) &s, 0, NULL) != 0 && s) + { + Print(" : "); + Print(s); + LocalFree(s); + } + } + #else + { + const char *s = strerror(wres); + if (s) + { + Print(" : "); + Print(s); + } + } + #endif + PrintLF(); +} + static void GetAttribString(UInt32 wa, BoolInt isDir, char *s) { #ifdef USE_WINDOWS_FILE @@ -372,7 +542,7 @@ static void GetAttribString(UInt32 wa, BoolInt isDir, char *s) // #define NUM_PARENTS_MAX 128 -int MY_CDECL main(int numargs, char *args[]) +int Z7_CDECL main(int numargs, char *args[]) { ISzAlloc allocImp; ISzAlloc allocTempImp; @@ -412,18 +582,24 @@ int MY_CDECL main(int numargs, char *args[]) allocImp = g_Alloc; allocTempImp = g_Alloc; + // allocTempImp = g_Alloc_temp; - #ifdef UNDER_CE - if (InFile_OpenW(&archiveStream.file, L"\test.7z")) - #else - if (InFile_Open(&archiveStream.file, args[2])) - #endif { - PrintError("can not open input file"); - return 1; + WRes wres = + #ifdef UNDER_CE + InFile_OpenW(&archiveStream.file, L"\test.7z"); // change it + #else + InFile_Open(&archiveStream.file, args[2]); + #endif + if (wres != 0) + { + PrintError_WRes("cannot open input file", wres); + return 1; + } } FileInStream_CreateVTable(&archiveStream); + archiveStream.wres = 0; LookToRead2_CreateVTable(&lookStream, False); lookStream.buf = NULL; @@ -437,7 +613,7 @@ int MY_CDECL main(int numargs, char *args[]) { lookStream.bufSize = kInputBufSize; lookStream.realStream = &archiveStream.vt; - LookToRead2_Init(&lookStream); + LookToRead2_INIT(&lookStream) } } @@ -483,7 +659,7 @@ int MY_CDECL main(int numargs, char *args[]) size_t outSizeProcessed = 0; // const CSzFileItem *f = db.Files + i; size_t len; - unsigned isDir = SzArEx_IsDir(&db, i); + const BoolInt isDir = SzArEx_IsDir(&db, i); if (listCommand == 0 && isDir && !fullPaths) continue; len = SzArEx_GetFileNameUtf16(&db, i, NULL); @@ -546,8 +722,8 @@ int MY_CDECL main(int numargs, char *args[]) } Print(testCommand ? 
- "Testing ": - "Extracting "); + "T ": + "- "); res = PrintString(temp); if (res != SZ_OK) break; @@ -591,27 +767,37 @@ int MY_CDECL main(int numargs, char *args[]) PrintLF(); continue; } - else if (OutFile_OpenUtf16(&outFile, destPath)) + else { - PrintError("can not open output file"); - res = SZ_ERROR_FAIL; - break; + const WRes wres = OutFile_OpenUtf16(&outFile, destPath); + if (wres != 0) + { + PrintError_WRes("cannot open output file", wres); + res = SZ_ERROR_FAIL; + break; + } } processedSize = outSizeProcessed; - if (File_Write(&outFile, outBuffer + offset, &processedSize) != 0 || processedSize != outSizeProcessed) { - PrintError("can not write output file"); - res = SZ_ERROR_FAIL; - break; + const WRes wres = File_Write(&outFile, outBuffer + offset, &processedSize); + if (wres != 0 || processedSize != outSizeProcessed) + { + PrintError_WRes("cannot write output file", wres); + res = SZ_ERROR_FAIL; + break; + } } - #ifdef USE_WINDOWS_FILE { - FILETIME mtime, ctime; + FILETIME mtime; FILETIME *mtimePtr = NULL; + + #ifdef USE_WINDOWS_FILE + FILETIME ctime; FILETIME *ctimePtr = NULL; + #endif if (SzBitWithVals_Check(&db.MTime, i)) { @@ -620,6 +806,8 @@ int MY_CDECL main(int numargs, char *args[]) mtime.dwHighDateTime = (DWORD)(t->High); mtimePtr = &mtime; } + + #ifdef USE_WINDOWS_FILE if (SzBitWithVals_Check(&db.CTime, i)) { const CNtfsFileTime *t = &db.CTime.Vals[i]; @@ -627,16 +815,29 @@ int MY_CDECL main(int numargs, char *args[]) ctime.dwHighDateTime = (DWORD)(t->High); ctimePtr = &ctime; } + if (mtimePtr || ctimePtr) SetFileTime(outFile.handle, ctimePtr, NULL, mtimePtr); - } - #endif + #endif - if (File_Close(&outFile)) - { - PrintError("can not close output file"); - res = SZ_ERROR_FAIL; - break; + { + const WRes wres = File_Close(&outFile); + if (wres != 0) + { + PrintError_WRes("cannot close output file", wres); + res = SZ_ERROR_FAIL; + break; + } + } + + #ifndef USE_WINDOWS_FILE + #ifdef _WIN32 + mtimePtr = mtimePtr; + #else + if (mtimePtr) + Set_File_FILETIME(destPath, mtimePtr); + #endif + #endif } #ifdef USE_WINDOWS_FILE @@ -672,13 +873,15 @@ int MY_CDECL main(int numargs, char *args[]) if (res == SZ_ERROR_UNSUPPORTED) PrintError("decoder doesn't support this archive"); else if (res == SZ_ERROR_MEM) - PrintError("can not allocate memory"); + PrintError("cannot allocate memory"); else if (res == SZ_ERROR_CRC) PrintError("CRC error"); + else if (res == SZ_ERROR_READ /* || archiveStream.Res != 0 */) + PrintError_WRes("Read Error", archiveStream.wres); else { char s[32]; - UInt64ToStr(res, s, 0); + UInt64ToStr((unsigned)res, s, 0); PrintError(s); } diff --git a/src/sdk/C/Util/7z/Precomp.h b/src/sdk/C/Util/7z/Precomp.h index 588a66f..13a41ef 100644 --- a/src/sdk/C/Util/7z/Precomp.h +++ b/src/sdk/C/Util/7z/Precomp.h @@ -1,10 +1,13 @@ -/* Precomp.h -- StdAfx -2013-06-16 : Igor Pavlov : Public domain */ +/* Precomp.h -- Precomp +2024-01-23 : Igor Pavlov : Public domain */ -#ifndef __7Z_PRECOMP_H -#define __7Z_PRECOMP_H - -#include "../../Compiler.h" -#include "../../7zTypes.h" +// #ifndef ZIP7_INC_PRECOMP_LOC_H +// #define ZIP7_INC_PRECOMP_LOC_H +#if defined(_MSC_VER) && _MSC_VER >= 1800 +#pragma warning(disable : 4464) // relative include path contains '..' 
#endif + +#include "../../Precomp.h" + +// #endif diff --git a/src/sdk/C/Util/7z/makefile b/src/sdk/C/Util/7z/makefile index 9a49fd5..987f065 100644 --- a/src/sdk/C/Util/7z/makefile +++ b/src/sdk/C/Util/7z/makefile @@ -1,12 +1,10 @@ -CFLAGS = $(CFLAGS) -D_7ZIP_PPMD_SUPPPORT +CFLAGS = $(CFLAGS) -DZ7_PPMD_SUPPORT -DZ7_EXTRACT_ONLY PROG = 7zDec.exe C_OBJS = \ $O\7zAlloc.obj \ $O\7zBuf.obj \ - $O\7zCrc.obj \ - $O\7zCrcOpt.obj \ $O\7zFile.obj \ $O\7zDec.obj \ $O\7zArcIn.obj \ @@ -25,10 +23,14 @@ C_OBJS = \ 7Z_OBJS = \ $O\7zMain.obj \ +!include "../../../CPP/7zip/Crc.mak" +!include "../../../CPP/7zip/LzmaDec.mak" + OBJS = \ $O\Precomp.obj \ $(7Z_OBJS) \ $(C_OBJS) \ + $(ASM_OBJS) \ !include "../../../CPP/Build.mak" @@ -38,3 +40,5 @@ $(C_OBJS): ../../$(*B).c $(CCOMPL_USE) $O\Precomp.obj: Precomp.c $(CCOMPL_PCH) + +!include "../../Asm_c.mak" diff --git a/src/sdk/C/Util/7z/makefile.gcc b/src/sdk/C/Util/7z/makefile.gcc index 51053ba..f48d362 100644 --- a/src/sdk/C/Util/7z/makefile.gcc +++ b/src/sdk/C/Util/7z/makefile.gcc @@ -1,75 +1,32 @@ -PROG = 7zDec -CXX = gcc -LIB = -RM = rm -f -CFLAGS = -c -O2 -Wall - -OBJS = 7zMain.o 7zAlloc.o 7zArcIn.o 7zBuf.o 7zBuf2.o 7zCrc.o 7zCrcOpt.o 7zDec.o CpuArch.o Delta.o LzmaDec.o Lzma2Dec.o Bra.o Bra86.o BraIA64.o Bcj2.o Ppmd7.o Ppmd7Dec.o 7zFile.o 7zStream.o - -all: $(PROG) - -$(PROG): $(OBJS) - $(CXX) -o $(PROG) $(LDFLAGS) $(OBJS) $(LIB) - -7zMain.o: 7zMain.c - $(CXX) $(CFLAGS) 7zMain.c - -7zAlloc.o: ../../7zAlloc.c - $(CXX) $(CFLAGS) ../../7zAlloc.c - -7zArcIn.o: ../../7zArcIn.c - $(CXX) $(CFLAGS) ../../7zArcIn.c - -7zBuf.o: ../../7zBuf.c - $(CXX) $(CFLAGS) ../../7zBuf.c - -7zBuf2.o: ../../7zBuf2.c - $(CXX) $(CFLAGS) ../../7zBuf2.c - -7zCrc.o: ../../7zCrc.c - $(CXX) $(CFLAGS) ../../7zCrc.c - -7zCrcOpt.o: ../../7zCrc.c - $(CXX) $(CFLAGS) ../../7zCrcOpt.c - -7zDec.o: ../../7zDec.c - $(CXX) $(CFLAGS) -D_7ZIP_PPMD_SUPPPORT ../../7zDec.c - -CpuArch.o: ../../CpuArch.c - $(CXX) $(CFLAGS) ../../CpuArch.c - -Delta.o: ../../Delta.c - $(CXX) $(CFLAGS) ../../Delta.c - -LzmaDec.o: ../../LzmaDec.c - $(CXX) $(CFLAGS) ../../LzmaDec.c - -Lzma2Dec.o: ../../Lzma2Dec.c - $(CXX) $(CFLAGS) ../../Lzma2Dec.c - -Bra.o: ../../Bra.c - $(CXX) $(CFLAGS) ../../Bra.c - -Bra86.o: ../../Bra86.c - $(CXX) $(CFLAGS) ../../Bra86.c - -BraIA64.o: ../../BraIA64.c - $(CXX) $(CFLAGS) ../../BraIA64.c - -Bcj2.o: ../../Bcj2.c - $(CXX) $(CFLAGS) ../../Bcj2.c - -Ppmd7.o: ../../Ppmd7.c - $(CXX) $(CFLAGS) ../../Ppmd7.c - -Ppmd7Dec.o: ../../Ppmd7Dec.c - $(CXX) $(CFLAGS) ../../Ppmd7Dec.c - -7zFile.o: ../../7zFile.c - $(CXX) $(CFLAGS) ../../7zFile.c - -7zStream.o: ../../7zStream.c - $(CXX) $(CFLAGS) ../../7zStream.c - -clean: - -$(RM) $(PROG) $(OBJS) +PROG = 7zdec + +LOCAL_FLAGS = -DZ7_PPMD_SUPPORT -DZ7_EXTRACT_ONLY + +include ../../../CPP/7zip/LzmaDec_gcc.mak + + +OBJS = \ + $(LZMA_DEC_OPT_OBJS) \ + $O/Bcj2.o \ + $O/Bra.o \ + $O/Bra86.o \ + $O/BraIA64.o \ + $O/CpuArch.o \ + $O/Delta.o \ + $O/Lzma2Dec.o \ + $O/LzmaDec.o \ + $O/Ppmd7.o \ + $O/Ppmd7Dec.o \ + $O/7zCrc.o \ + $O/7zCrcOpt.o \ + $O/7zAlloc.o \ + $O/7zArcIn.o \ + $O/7zBuf.o \ + $O/7zBuf2.o \ + $O/7zDec.o \ + $O/7zMain.o \ + $O/7zFile.o \ + $O/7zStream.o \ + + +include ../../7zip_gcc_c.mak diff --git a/src/sdk/C/Util/Lzma/LzmaUtil.c b/src/sdk/C/Util/Lzma/LzmaUtil.c index 739bc0f..b9b974b 100644 --- a/src/sdk/C/Util/Lzma/LzmaUtil.c +++ b/src/sdk/C/Util/Lzma/LzmaUtil.c @@ -1,7 +1,7 @@ /* LzmaUtil.c -- Test application for LZMA compression -2018-07-04 : Igor Pavlov : Public domain */ +2023-03-07 : Igor Pavlov : Public domain */ -#include "../../Precomp.h" 
+#include "Precomp.h" #include #include @@ -12,40 +12,89 @@ #include "../../Alloc.h" #include "../../7zFile.h" #include "../../7zVersion.h" +#include "../../LzFind.h" #include "../../LzmaDec.h" #include "../../LzmaEnc.h" -static const char * const kCantReadMessage = "Can not read input file"; -static const char * const kCantWriteMessage = "Can not write output file"; -static const char * const kCantAllocateMessage = "Can not allocate memory"; +static const char * const kCantReadMessage = "Cannot read input file"; +static const char * const kCantWriteMessage = "Cannot write output file"; +static const char * const kCantAllocateMessage = "Cannot allocate memory"; static const char * const kDataErrorMessage = "Data error"; -static void PrintHelp(char *buffer) +static void Print(const char *s) { - strcat(buffer, - "\nLZMA-C " MY_VERSION_CPU " : " MY_COPYRIGHT_DATE "\n\n" - "Usage: lzma inputFile outputFile\n" - " e: encode file\n" - " d: decode file\n"); + fputs(s, stdout); } -static int PrintError(char *buffer, const char *message) +static void PrintHelp(void) { - strcat(buffer, "\nError: "); - strcat(buffer, message); - strcat(buffer, "\n"); + Print( + "\n" "LZMA-C " MY_VERSION_CPU " : " MY_COPYRIGHT_DATE + "\n" + "\n" "Usage: lzma inputFile outputFile" + "\n" " e: encode file" + "\n" " d: decode file" + "\n"); +} + +static int PrintError(const char *message) +{ + Print("\nError: "); + Print(message); + Print("\n"); + return 1; +} + +#define CONVERT_INT_TO_STR(charType, tempSize) \ + unsigned char temp[tempSize]; unsigned i = 0; \ + while (val >= 10) { temp[i++] = (unsigned char)('0' + (unsigned)(val % 10)); val /= 10; } \ + *s++ = (charType)('0' + (unsigned)val); \ + while (i != 0) { i--; *s++ = (charType)temp[i]; } \ + *s = 0; \ + return s; + +static char * Convert_unsigned_To_str(unsigned val, char *s) +{ + CONVERT_INT_TO_STR(char, 32) +} + +static void Print_unsigned(unsigned code) +{ + char str[32]; + Convert_unsigned_To_str(code, str); + Print(str); +} + +static int PrintError_WRes(const char *message, WRes wres) +{ + PrintError(message); + Print("\nSystem error code: "); + Print_unsigned((unsigned)wres); + #ifndef _WIN32 + { + const char *s = strerror(wres); + if (s) + { + Print(" : "); + Print(s); + } + } + #endif + Print("\n"); return 1; } -static int PrintErrorNumber(char *buffer, SRes val) +static int PrintErrorNumber(SRes val) { - sprintf(buffer + strlen(buffer), "\nError code: %x\n", (unsigned)val); + Print("\n7-Zip error code: "); + Print_unsigned((unsigned)val); + Print("\n"); return 1; } -static int PrintUserError(char *buffer) +static int PrintUserError(void) { - return PrintError(buffer, "Incorrect command"); + return PrintError("Incorrect command"); } @@ -53,10 +102,10 @@ static int PrintUserError(char *buffer) #define OUT_BUF_SIZE (1 << 16) -static SRes Decode2(CLzmaDec *state, ISeqOutStream *outStream, ISeqInStream *inStream, +static SRes Decode2(CLzmaDec *state, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, UInt64 unpackSize) { - int thereIsSize = (unpackSize != (UInt64)(Int64)-1); + const int thereIsSize = (unpackSize != (UInt64)(Int64)-1); Byte inBuf[IN_BUF_SIZE]; Byte outBuf[OUT_BUF_SIZE]; size_t inPos = 0, inSize = 0, outPos = 0; @@ -66,7 +115,7 @@ static SRes Decode2(CLzmaDec *state, ISeqOutStream *outStream, ISeqInStream *inS if (inPos == inSize) { inSize = IN_BUF_SIZE; - RINOK(inStream->Read(inStream, inBuf, &inSize)); + RINOK(inStream->Read(inStream, inBuf, &inSize)) inPos = 0; } { @@ -107,7 +156,7 @@ static SRes Decode2(CLzmaDec *state, ISeqOutStream 
*outStream, ISeqInStream *inS } -static SRes Decode(ISeqOutStream *outStream, ISeqInStream *inStream) +static SRes Decode(ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream) { UInt64 unpackSize; int i; @@ -120,27 +169,29 @@ static SRes Decode(ISeqOutStream *outStream, ISeqInStream *inStream) /* Read and parse header */ - RINOK(SeqInStream_Read(inStream, header, sizeof(header))); - + { + size_t size = sizeof(header); + RINOK(SeqInStream_ReadMax(inStream, header, &size)) + if (size != sizeof(header)) + return SZ_ERROR_INPUT_EOF; + } unpackSize = 0; for (i = 0; i < 8; i++) unpackSize += (UInt64)header[LZMA_PROPS_SIZE + i] << (i * 8); - LzmaDec_Construct(&state); - RINOK(LzmaDec_Allocate(&state, header, LZMA_PROPS_SIZE, &g_Alloc)); + LzmaDec_CONSTRUCT(&state) + RINOK(LzmaDec_Allocate(&state, header, LZMA_PROPS_SIZE, &g_Alloc)) res = Decode2(&state, outStream, inStream, unpackSize); LzmaDec_Free(&state, &g_Alloc); return res; } -static SRes Encode(ISeqOutStream *outStream, ISeqInStream *inStream, UInt64 fileSize, char *rs) +static SRes Encode(ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, UInt64 fileSize) { CLzmaEncHandle enc; SRes res; CLzmaEncProps props; - UNUSED_VAR(rs); - enc = LzmaEnc_Create(&g_Alloc); if (enc == 0) return SZ_ERROR_MEM; @@ -170,7 +221,7 @@ static SRes Encode(ISeqOutStream *outStream, ISeqInStream *inStream, UInt64 file } -static int main2(int numArgs, const char *args[], char *rs) +int Z7_CDECL main(int numArgs, const char *args[]) { CFileSeqInStream inStream; CFileOutStream outStream; @@ -179,50 +230,63 @@ static int main2(int numArgs, const char *args[], char *rs) int encodeMode; BoolInt useOutFile = False; + LzFindPrepare(); + FileSeqInStream_CreateVTable(&inStream); File_Construct(&inStream.file); + inStream.wres = 0; FileOutStream_CreateVTable(&outStream); File_Construct(&outStream.file); + outStream.wres = 0; if (numArgs == 1) { - PrintHelp(rs); + PrintHelp(); return 0; } if (numArgs < 3 || numArgs > 4 || strlen(args[1]) != 1) - return PrintUserError(rs); + return PrintUserError(); c = args[1][0]; encodeMode = (c == 'e' || c == 'E'); if (!encodeMode && c != 'd' && c != 'D') - return PrintUserError(rs); + return PrintUserError(); + /* { size_t t4 = sizeof(UInt32); size_t t8 = sizeof(UInt64); if (t4 != 4 || t8 != 8) - return PrintError(rs, "Incorrect UInt32 or UInt64"); + return PrintError("Incorrect UInt32 or UInt64"); } + */ - if (InFile_Open(&inStream.file, args[2]) != 0) - return PrintError(rs, "Can not open input file"); + { + const WRes wres = InFile_Open(&inStream.file, args[2]); + if (wres != 0) + return PrintError_WRes("Cannot open input file", wres); + } if (numArgs > 3) { + WRes wres; useOutFile = True; - if (OutFile_Open(&outStream.file, args[3]) != 0) - return PrintError(rs, "Can not open output file"); + wres = OutFile_Open(&outStream.file, args[3]); + if (wres != 0) + return PrintError_WRes("Cannot open output file", wres); } else if (encodeMode) - PrintUserError(rs); + PrintUserError(); if (encodeMode) { UInt64 fileSize; - File_GetLength(&inStream.file, &fileSize); - res = Encode(&outStream.vt, &inStream.vt, fileSize, rs); + const WRes wres = File_GetLength(&inStream.file, &fileSize); + if (wres != 0) + return PrintError_WRes("Cannot get file length", wres); + res = Encode(&outStream.vt, &inStream.vt, fileSize); } else { @@ -236,23 +300,14 @@ static int main2(int numArgs, const char *args[], char *rs) if (res != SZ_OK) { if (res == SZ_ERROR_MEM) - return PrintError(rs, kCantAllocateMessage); + return PrintError(kCantAllocateMessage); else if 
(res == SZ_ERROR_DATA) - return PrintError(rs, kDataErrorMessage); + return PrintError(kDataErrorMessage); else if (res == SZ_ERROR_WRITE) - return PrintError(rs, kCantWriteMessage); + return PrintError_WRes(kCantWriteMessage, outStream.wres); else if (res == SZ_ERROR_READ) - return PrintError(rs, kCantReadMessage); - return PrintErrorNumber(rs, res); + return PrintError_WRes(kCantReadMessage, inStream.wres); + return PrintErrorNumber(res); } return 0; } - - -int MY_CDECL main(int numArgs, const char *args[]) -{ - char rs[800] = { 0 }; - int res = main2(numArgs, args, rs); - fputs(rs, stdout); - return res; -} diff --git a/src/sdk/C/Util/Lzma/LzmaUtil.dsp b/src/sdk/C/Util/Lzma/LzmaUtil.dsp index f060a26..71de950 100644 --- a/src/sdk/C/Util/Lzma/LzmaUtil.dsp +++ b/src/sdk/C/Util/Lzma/LzmaUtil.dsp @@ -106,6 +106,10 @@ SOURCE=..\..\7zVersion.h # End Source File # Begin Source File +SOURCE=..\..\7zWindows.h +# End Source File +# Begin Source File + SOURCE=..\..\Alloc.c # End Source File # Begin Source File @@ -114,6 +118,14 @@ SOURCE=..\..\Alloc.h # End Source File # Begin Source File +SOURCE=..\..\Compiler.h +# End Source File +# Begin Source File + +SOURCE=..\..\CpuArch.c +# End Source File +# Begin Source File + SOURCE=..\..\CpuArch.h # End Source File # Begin Source File @@ -134,6 +146,10 @@ SOURCE=..\..\LzFindMt.h # End Source File # Begin Source File +SOURCE=..\..\LzFindOpt.c +# End Source File +# Begin Source File + SOURCE=..\..\LzHash.h # End Source File # Begin Source File @@ -158,6 +174,14 @@ SOURCE=.\LzmaUtil.c # End Source File # Begin Source File +SOURCE=..\..\Precomp.h +# End Source File +# Begin Source File + +SOURCE=.\Precomp.h +# End Source File +# Begin Source File + SOURCE=..\..\Threads.c # End Source File # Begin Source File diff --git a/src/sdk/C/Util/Lzma/Precomp.h b/src/sdk/C/Util/Lzma/Precomp.h new file mode 100644 index 0000000..13a41ef --- /dev/null +++ b/src/sdk/C/Util/Lzma/Precomp.h @@ -0,0 +1,13 @@ +/* Precomp.h -- Precomp +2024-01-23 : Igor Pavlov : Public domain */ + +// #ifndef ZIP7_INC_PRECOMP_LOC_H +// #define ZIP7_INC_PRECOMP_LOC_H + +#if defined(_MSC_VER) && _MSC_VER >= 1800 +#pragma warning(disable : 4464) // relative include path contains '..' 
+#endif + +#include "../../Precomp.h" + +// #endif diff --git a/src/sdk/C/Util/Lzma/makefile b/src/sdk/C/Util/Lzma/makefile index 4795322..7813bdb 100644 --- a/src/sdk/C/Util/Lzma/makefile +++ b/src/sdk/C/Util/Lzma/makefile @@ -8,8 +8,10 @@ LIB_OBJS = \ C_OBJS = \ $O\Alloc.obj \ + $O\CpuArch.obj \ $O\LzFind.obj \ $O\LzFindMt.obj \ + $O\LzFindOpt.obj \ $O\LzmaDec.obj \ $O\LzmaEnc.obj \ $O\7zFile.obj \ diff --git a/src/sdk/C/Util/Lzma/makefile.gcc b/src/sdk/C/Util/Lzma/makefile.gcc index 67aa8b1..2acb0b8 100644 --- a/src/sdk/C/Util/Lzma/makefile.gcc +++ b/src/sdk/C/Util/Lzma/makefile.gcc @@ -1,44 +1,21 @@ -PROG = lzma -CXX = g++ -LIB = -RM = rm -f -CFLAGS = -c -O2 -Wall -D_7ZIP_ST +PROG = 7lzma -OBJS = \ - LzmaUtil.o \ - Alloc.o \ - LzFind.o \ - LzmaDec.o \ - LzmaEnc.o \ - 7zFile.o \ - 7zStream.o \ - - -all: $(PROG) - -$(PROG): $(OBJS) - $(CXX) -o $(PROG) $(LDFLAGS) $(OBJS) $(LIB) $(LIB2) - -LzmaUtil.o: LzmaUtil.c - $(CXX) $(CFLAGS) LzmaUtil.c - -Alloc.o: ../../Alloc.c - $(CXX) $(CFLAGS) ../../Alloc.c +include ../../../CPP/7zip/LzmaDec_gcc.mak -LzFind.o: ../../LzFind.c - $(CXX) $(CFLAGS) ../../LzFind.c -LzmaDec.o: ../../LzmaDec.c - $(CXX) $(CFLAGS) ../../LzmaDec.c - -LzmaEnc.o: ../../LzmaEnc.c - $(CXX) $(CFLAGS) ../../LzmaEnc.c - -7zFile.o: ../../7zFile.c - $(CXX) $(CFLAGS) ../../7zFile.c - -7zStream.o: ../../7zStream.c - $(CXX) $(CFLAGS) ../../7zStream.c - -clean: - -$(RM) $(PROG) $(OBJS) +OBJS = \ + $(LZMA_DEC_OPT_OBJS) \ + $O/7zFile.o \ + $O/7zStream.o \ + $O/Alloc.o \ + $O/CpuArch.o \ + $O/LzFind.o \ + $O/LzFindMt.o \ + $O/LzFindOpt.o \ + $O/LzmaDec.o \ + $O/LzmaEnc.o \ + $O/LzmaUtil.o \ + $O/Threads.o \ + + +include ../../7zip_gcc_c.mak diff --git a/src/sdk/C/Util/LzmaLib/LzmaLib.dsp b/src/sdk/C/Util/LzmaLib/LzmaLib.dsp index 3421de8..f413137 100644 --- a/src/sdk/C/Util/LzmaLib/LzmaLib.dsp +++ b/src/sdk/C/Util/LzmaLib/LzmaLib.dsp @@ -43,7 +43,7 @@ RSC=rc.exe # PROP Ignore_Export_Lib 0 # PROP Target_Dir "" # ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /c -# ADD CPP /nologo /Gr /MT /W3 /O2 /D "NDEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /FD /c +# ADD CPP /nologo /Gr /MT /W4 /WX /O2 /D "NDEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /FD /c # SUBTRACT CPP /YX # ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32 # ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32 @@ -71,7 +71,7 @@ LINK32=link.exe # PROP Ignore_Export_Lib 0 # PROP Target_Dir "" # ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /GZ /c -# ADD CPP /nologo /MTd /W3 /Gm /ZI /Od /D "_DEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /D "COMPRESS_MF_MT" /FD /GZ /c +# ADD CPP /nologo /MTd /W4 /WX /Gm /ZI /Od /D "_DEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /D "COMPRESS_MF_MT" /FD /GZ /c # SUBTRACT CPP /YX # ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32 # ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32 @@ -101,6 +101,10 @@ SOURCE=.\LzmaLib.def SOURCE=.\LzmaLibExports.c # End Source File +# Begin Source File + +SOURCE=.\Precomp.h +# End Source File # End Group # Begin Source File @@ -108,6 +112,10 @@ SOURCE=..\..\7zTypes.h # End Source File # Begin Source File +SOURCE=..\..\7zWindows.h +# End Source File +# Begin Source File + SOURCE=..\..\Alloc.c # End Source File # Begin Source File @@ -116,6 +124,18 @@ SOURCE=..\..\Alloc.h # End Source 
File # Begin Source File +SOURCE=..\..\Compiler.h +# End Source File +# Begin Source File + +SOURCE=..\..\CpuArch.c +# End Source File +# Begin Source File + +SOURCE=..\..\CpuArch.h +# End Source File +# Begin Source File + SOURCE=..\..\IStream.h # End Source File # Begin Source File @@ -136,6 +156,10 @@ SOURCE=..\..\LzFindMt.h # End Source File # Begin Source File +SOURCE=..\..\LzFindOpt.c +# End Source File +# Begin Source File + SOURCE=..\..\LzHash.h # End Source File # Begin Source File @@ -164,6 +188,10 @@ SOURCE=..\..\LzmaLib.h # End Source File # Begin Source File +SOURCE=..\..\Precomp.h +# End Source File +# Begin Source File + SOURCE=.\resource.rc # End Source File # Begin Source File diff --git a/src/sdk/C/Util/LzmaLib/LzmaLibExports.c b/src/sdk/C/Util/LzmaLib/LzmaLibExports.c index 4a28a9a..a46c9a8 100644 --- a/src/sdk/C/Util/LzmaLib/LzmaLibExports.c +++ b/src/sdk/C/Util/LzmaLib/LzmaLibExports.c @@ -1,14 +1,15 @@ /* LzmaLibExports.c -- LZMA library DLL Entry point -2015-11-08 : Igor Pavlov : Public domain */ +2023-03-05 : Igor Pavlov : Public domain */ -#include "../../Precomp.h" +#include "Precomp.h" -#include +#include "../../7zWindows.h" +BOOL WINAPI DllMain(HINSTANCE hInstance, DWORD dwReason, LPVOID lpReserved); BOOL WINAPI DllMain(HINSTANCE hInstance, DWORD dwReason, LPVOID lpReserved) { - UNUSED_VAR(hInstance); - UNUSED_VAR(dwReason); - UNUSED_VAR(lpReserved); + UNUSED_VAR(hInstance) + UNUSED_VAR(dwReason) + UNUSED_VAR(lpReserved) return TRUE; } diff --git a/src/sdk/C/Util/LzmaLib/Precomp.c b/src/sdk/C/Util/LzmaLib/Precomp.c new file mode 100644 index 0000000..01605e3 --- /dev/null +++ b/src/sdk/C/Util/LzmaLib/Precomp.c @@ -0,0 +1,4 @@ +/* Precomp.c -- StdAfx +2013-01-21 : Igor Pavlov : Public domain */ + +#include "Precomp.h" diff --git a/src/sdk/C/Util/LzmaLib/Precomp.h b/src/sdk/C/Util/LzmaLib/Precomp.h new file mode 100644 index 0000000..13a41ef --- /dev/null +++ b/src/sdk/C/Util/LzmaLib/Precomp.h @@ -0,0 +1,13 @@ +/* Precomp.h -- Precomp +2024-01-23 : Igor Pavlov : Public domain */ + +// #ifndef ZIP7_INC_PRECOMP_LOC_H +// #define ZIP7_INC_PRECOMP_LOC_H + +#if defined(_MSC_VER) && _MSC_VER >= 1800 +#pragma warning(disable : 4464) // relative include path contains '..' 
+#endif + +#include "../../Precomp.h" + +// #endif diff --git a/src/sdk/C/Util/LzmaLib/makefile b/src/sdk/C/Util/LzmaLib/makefile index 74103bb..9ed0aa4 100644 --- a/src/sdk/C/Util/LzmaLib/makefile +++ b/src/sdk/C/Util/LzmaLib/makefile @@ -11,6 +11,7 @@ LIB_OBJS = \ C_OBJS = \ $O\Alloc.obj \ + $O\CpuArch.obj \ $O\LzFind.obj \ $O\LzFindMt.obj \ $O\LzmaDec.obj \ @@ -18,9 +19,14 @@ C_OBJS = \ $O\LzmaLib.obj \ $O\Threads.obj \ +!include "../../../CPP/7zip/LzFindOpt.mak" +!include "../../../CPP/7zip/LzmaDec.mak" + OBJS = \ + $O\Precomp.obj \ $(LIB_OBJS) \ $(C_OBJS) \ + $(ASM_OBJS) \ $O\resource.res !include "../../../CPP/Build.mak" @@ -28,7 +34,26 @@ OBJS = \ $(SLIBPATH): $O $(OBJS) lib -out:$(SLIBPATH) $(OBJS) $(LIBS) + +MAK_SINGLE_FILE = 1 + +$O\Precomp.obj: Precomp.c + $(CCOMPL_PCH) + +!IFDEF MAK_SINGLE_FILE + $(LIB_OBJS): $(*B).c - $(COMPL_O2) + $(CCOMPL_USE) $(C_OBJS): ../../$(*B).c - $(COMPL_O2) + $(CCOMPL_USE) + +!ELSE + +{.}.c{$O}.obj:: + $(CCOMPLB_USE) +{../../../C}.c{$O}.obj:: + $(CCOMPLB_USE) + +!ENDIF + +!include "../../Asm_c.mak" diff --git a/src/sdk/C/Util/SfxSetup/Precomp.h b/src/sdk/C/Util/SfxSetup/Precomp.h index 588a66f..13a41ef 100644 --- a/src/sdk/C/Util/SfxSetup/Precomp.h +++ b/src/sdk/C/Util/SfxSetup/Precomp.h @@ -1,10 +1,13 @@ -/* Precomp.h -- StdAfx -2013-06-16 : Igor Pavlov : Public domain */ +/* Precomp.h -- Precomp +2024-01-23 : Igor Pavlov : Public domain */ -#ifndef __7Z_PRECOMP_H -#define __7Z_PRECOMP_H - -#include "../../Compiler.h" -#include "../../7zTypes.h" +// #ifndef ZIP7_INC_PRECOMP_LOC_H +// #define ZIP7_INC_PRECOMP_LOC_H +#if defined(_MSC_VER) && _MSC_VER >= 1800 +#pragma warning(disable : 4464) // relative include path contains '..' #endif + +#include "../../Precomp.h" + +// #endif diff --git a/src/sdk/C/Util/SfxSetup/SfxSetup.c b/src/sdk/C/Util/SfxSetup/SfxSetup.c index ef19aea..9b5c1f9 100644 --- a/src/sdk/C/Util/SfxSetup/SfxSetup.c +++ b/src/sdk/C/Util/SfxSetup/SfxSetup.c @@ -1,5 +1,5 @@ /* SfxSetup.c - 7z SFX Setup -2019-02-02 : Igor Pavlov : Public domain */ +2024-01-24 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -26,6 +26,12 @@ #define kInputBufSize ((size_t)1 << 18) + +#define wcscat lstrcatW +#define wcslen (size_t)lstrlenW +#define wcscpy lstrcpyW +// wcsncpy() and lstrcpynW() work differently. We don't use them. + static const char * const kExts[] = { "bat" @@ -64,7 +70,7 @@ static unsigned FindExt(const wchar_t *s, unsigned *extLen) return len; } -#define MAKE_CHAR_UPPER(c) ((((c) >= 'a' && (c) <= 'z') ? (c) -= 0x20 : (c))) +#define MAKE_CHAR_UPPER(c) ((((c) >= 'a' && (c) <= 'z') ? 
(c) - 0x20 : (c))) static unsigned FindItem(const char * const *items, unsigned num, const wchar_t *s, unsigned len) { @@ -72,13 +78,13 @@ static unsigned FindItem(const char * const *items, unsigned num, const wchar_t for (i = 0; i < num; i++) { const char *item = items[i]; - unsigned itemLen = (unsigned)strlen(item); + const unsigned itemLen = (unsigned)strlen(item); unsigned j; if (len != itemLen) continue; for (j = 0; j < len; j++) { - unsigned c = (Byte)item[j]; + const unsigned c = (Byte)item[j]; if (c != s[j] && MAKE_CHAR_UPPER(c) != s[j]) break; } @@ -96,10 +102,20 @@ static BOOL WINAPI HandlerRoutine(DWORD ctrlType) } #endif + +#ifdef _CONSOLE +static void PrintStr(const char *s) +{ + fputs(s, stdout); +} +#endif + static void PrintErrorMessage(const char *message) { #ifdef _CONSOLE - printf("\n7-Zip Error: %s\n", message); + PrintStr("\n7-Zip Error: "); + PrintStr(message); + PrintStr("\n"); #else #ifdef UNDER_CE WCHAR messageW[256 + 4]; @@ -179,7 +195,7 @@ static WRes RemoveDirWithSubItems(WCHAR *path) WIN32_FIND_DATAW fd; HANDLE handle; WRes res = 0; - size_t len = wcslen(path); + const size_t len = wcslen(path); wcscpy(path + len, L"*"); handle = FindFirstFileW(path, &fd); path[len] = L'\0'; @@ -228,7 +244,7 @@ static WRes RemoveDirWithSubItems(WCHAR *path) } #ifdef _CONSOLE -int MY_CDECL main() +int Z7_CDECL main(void) #else int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, #ifdef UNDER_CE @@ -262,10 +278,10 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, #ifdef _CONSOLE SetConsoleCtrlHandler(HandlerRoutine, TRUE); #else - UNUSED_VAR(hInstance); - UNUSED_VAR(hPrevInstance); - UNUSED_VAR(lpCmdLine); - UNUSED_VAR(nCmdShow); + UNUSED_VAR(hInstance) + UNUSED_VAR(hPrevInstance) + UNUSED_VAR(lpCmdLine) + UNUSED_VAR(nCmdShow) #endif CrcGenerateTable(); @@ -290,7 +306,7 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, BoolInt quoteMode = False; for (;; cmdLineParams++) { - wchar_t c = *cmdLineParams; + const wchar_t c = *cmdLineParams; if (c == L'\"') quoteMode = !quoteMode; else if (c == 0 || (c == L' ' && !quoteMode)) @@ -324,7 +340,7 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, unsigned k; for (k = 0; k < 8; k++) { - unsigned t = value & 0xF; + const unsigned t = value & 0xF; value >>= 4; s[7 - k] = (wchar_t)((t < 10) ? ('0' + t) : ('A' + (t - 10))); } @@ -386,7 +402,7 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, { lookStream.bufSize = kInputBufSize; lookStream.realStream = &archiveStream.vt; - LookToRead2_Init(&lookStream); + LookToRead2_INIT(&lookStream) } } @@ -455,11 +471,11 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, unsigned extLen; const WCHAR *name = temp + nameStartPos; unsigned len = (unsigned)wcslen(name); - unsigned nameLen = FindExt(temp + nameStartPos, &extLen); - unsigned extPrice = FindItem(kExts, sizeof(kExts) / sizeof(kExts[0]), name + len - extLen, extLen); - unsigned namePrice = FindItem(kNames, sizeof(kNames) / sizeof(kNames[0]), name, nameLen); + const unsigned nameLen = FindExt(temp + nameStartPos, &extLen); + const unsigned extPrice = FindItem(kExts, sizeof(kExts) / sizeof(kExts[0]), name + len - extLen, extLen); + const unsigned namePrice = FindItem(kNames, sizeof(kNames) / sizeof(kNames[0]), name, nameLen); - unsigned price = namePrice + extPrice * 64 + (nameStartPos == 0 ? 0 : (1 << 12)); + const unsigned price = namePrice + extPrice * 64 + (nameStartPos == 0 ? 
0 : (1 << 12)); if (minPrice > price) { minPrice = price; @@ -500,12 +516,13 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, #endif { - SRes res2 = File_Close(&outFile); + const WRes res2 = File_Close(&outFile); if (res != SZ_OK) break; - if (res2 != SZ_OK) + if (res2 != 0) { - res = res2; + errorMessage = "Can't close output file"; + res = SZ_ERROR_FAIL; break; } } @@ -550,7 +567,7 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, WCHAR oldCurDir[MAX_PATH + 2]; oldCurDir[0] = 0; { - DWORD needLen = GetCurrentDirectory(MAX_PATH + 1, oldCurDir); + const DWORD needLen = GetCurrentDirectory(MAX_PATH + 1, oldCurDir); if (needLen == 0 || needLen > MAX_PATH) oldCurDir[0] = 0; SetCurrentDirectory(workCurDir); diff --git a/src/sdk/C/Util/SfxSetup/makefile b/src/sdk/C/Util/SfxSetup/makefile index 544da67..b3f25a2 100644 --- a/src/sdk/C/Util/SfxSetup/makefile +++ b/src/sdk/C/Util/SfxSetup/makefile @@ -1,13 +1,14 @@ PROG = 7zS2.sfx MY_FIXED = 1 +CFLAGS = $(CFLAGS) \ + -DZ7_EXTRACT_ONLY \ + C_OBJS = \ $O\7zAlloc.obj \ $O\7zArcIn.obj \ $O\7zBuf.obj \ $O\7zBuf2.obj \ - $O\7zCrc.obj \ - $O\7zCrcOpt.obj \ $O\7zFile.obj \ $O\7zDec.obj \ $O\7zStream.obj \ @@ -24,9 +25,13 @@ C_OBJS = \ 7Z_OBJS = \ $O\SfxSetup.obj \ +!include "../../../CPP/7zip/Crc.mak" +# !include "../../../CPP/7zip/LzmaDec.mak" + OBJS = \ $(7Z_OBJS) \ $(C_OBJS) \ + $(ASM_OBJS) \ $O\resource.res !include "../../../CPP/Build.mak" @@ -35,3 +40,5 @@ $(7Z_OBJS): $(*B).c $(COMPL_O1) $(C_OBJS): ../../$(*B).c $(COMPL_O1) + +!include "../../Asm_c.mak" diff --git a/src/sdk/C/Util/SfxSetup/makefile_con b/src/sdk/C/Util/SfxSetup/makefile_con index d0f8352..9f4b916 100644 --- a/src/sdk/C/Util/SfxSetup/makefile_con +++ b/src/sdk/C/Util/SfxSetup/makefile_con @@ -1,6 +1,8 @@ PROG = 7zS2con.sfx MY_FIXED = 1 -CFLAGS = $(CFLAGS) -D_CONSOLE + +CFLAGS = $(CFLAGS) -D_CONSOLE \ + -DZ7_EXTRACT_ONLY \ C_OBJS = \ $O\7zAlloc.obj \ diff --git a/src/sdk/C/Xz.c b/src/sdk/C/Xz.c index d9f83df..d07550d 100644 --- a/src/sdk/C/Xz.c +++ b/src/sdk/C/Xz.c @@ -1,5 +1,5 @@ /* Xz.c - Xz -2017-05-12 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -41,7 +41,7 @@ void Xz_Free(CXzStream *p, ISzAllocPtr alloc) unsigned XzFlags_GetCheckSize(CXzStreamFlags f) { unsigned t = XzFlags_GetCheckType(f); - return (t == 0) ? 0 : (4 << ((t - 1) / 3)); + return (t == 0) ? 
0 : ((unsigned)4 << ((t - 1) / 3)); } void XzCheck_Init(CXzCheck *p, unsigned mode) @@ -52,6 +52,7 @@ void XzCheck_Init(CXzCheck *p, unsigned mode) case XZ_CHECK_CRC32: p->crc = CRC_INIT_VAL; break; case XZ_CHECK_CRC64: p->crc64 = CRC64_INIT_VAL; break; case XZ_CHECK_SHA256: Sha256_Init(&p->sha); break; + default: break; } } @@ -62,6 +63,7 @@ void XzCheck_Update(CXzCheck *p, const void *data, size_t size) case XZ_CHECK_CRC32: p->crc = CrcUpdate(p->crc, data, size); break; case XZ_CHECK_CRC64: p->crc64 = Crc64Update(p->crc64, data, size); break; case XZ_CHECK_SHA256: Sha256_Update(&p->sha, (const Byte *)data, size); break; + default: break; } } @@ -70,7 +72,7 @@ int XzCheck_Final(CXzCheck *p, Byte *digest) switch (p->mode) { case XZ_CHECK_CRC32: - SetUi32(digest, CRC_GET_DIGEST(p->crc)); + SetUi32(digest, CRC_GET_DIGEST(p->crc)) break; case XZ_CHECK_CRC64: { diff --git a/src/sdk/C/Xz.h b/src/sdk/C/Xz.h index 544ee18..ad63b48 100644 --- a/src/sdk/C/Xz.h +++ b/src/sdk/C/Xz.h @@ -1,21 +1,24 @@ /* Xz.h - Xz interface -2018-07-04 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ -#ifndef __XZ_H -#define __XZ_H +#ifndef ZIP7_INC_XZ_H +#define ZIP7_INC_XZ_H #include "Sha256.h" +#include "Delta.h" EXTERN_C_BEGIN #define XZ_ID_Subblock 1 #define XZ_ID_Delta 3 -#define XZ_ID_X86 4 -#define XZ_ID_PPC 5 -#define XZ_ID_IA64 6 -#define XZ_ID_ARM 7 -#define XZ_ID_ARMT 8 +#define XZ_ID_X86 4 +#define XZ_ID_PPC 5 +#define XZ_ID_IA64 6 +#define XZ_ID_ARM 7 +#define XZ_ID_ARMT 8 #define XZ_ID_SPARC 9 +#define XZ_ID_ARM64 0xa +#define XZ_ID_RISCV 0xb #define XZ_ID_LZMA2 0x21 unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value); @@ -47,13 +50,13 @@ typedef struct CXzFilter filters[XZ_NUM_FILTERS_MAX]; } CXzBlock; -#define XzBlock_GetNumFilters(p) (((p)->flags & XZ_BF_NUM_FILTERS_MASK) + 1) +#define XzBlock_GetNumFilters(p) (((unsigned)(p)->flags & XZ_BF_NUM_FILTERS_MASK) + 1) #define XzBlock_HasPackSize(p) (((p)->flags & XZ_BF_PACK_SIZE) != 0) #define XzBlock_HasUnpackSize(p) (((p)->flags & XZ_BF_UNPACK_SIZE) != 0) #define XzBlock_HasUnsupportedFlags(p) (((p)->flags & ~(XZ_BF_NUM_FILTERS_MASK | XZ_BF_PACK_SIZE | XZ_BF_UNPACK_SIZE)) != 0) SRes XzBlock_Parse(CXzBlock *p, const Byte *header); -SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStream *inStream, BoolInt *isIndex, UInt32 *headerSizeRes); +SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes); /* ---------- xz stream ---------- */ @@ -101,7 +104,7 @@ typedef UInt16 CXzStreamFlags; unsigned XzFlags_GetCheckSize(CXzStreamFlags f); SRes Xz_ParseHeader(CXzStreamFlags *p, const Byte *buf); -SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStream *inStream); +SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream); typedef struct { @@ -112,11 +115,13 @@ typedef struct typedef struct { CXzStreamFlags flags; + // Byte _pad[6]; size_t numBlocks; CXzBlockSizes *blocks; UInt64 startOffset; } CXzStream; +#define Xz_CONSTRUCT(p) { (p)->numBlocks = 0; (p)->blocks = NULL; (p)->flags = 0; } void Xz_Construct(CXzStream *p); void Xz_Free(CXzStream *p, ISzAllocPtr alloc); @@ -132,9 +137,14 @@ typedef struct CXzStream *streams; } CXzs; +#define Xzs_CONSTRUCT(p) { (p)->num = 0; (p)->numAllocated = 0; (p)->streams = NULL; } void Xzs_Construct(CXzs *p); void Xzs_Free(CXzs *p, ISzAllocPtr alloc); -SRes Xzs_ReadBackward(CXzs *p, ILookInStream *inStream, Int64 *startOffset, ICompressProgress *progress, ISzAllocPtr alloc); +/* +Xzs_ReadBackward() must be called for empty CXzs object. 
+Xzs_ReadBackward() can return non empty object with (p->num != 0) even in case of error. +*/ +SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc); UInt64 Xzs_GetNumBlocks(const CXzs *p); UInt64 Xzs_GetUnpackSize(const CXzs *p); @@ -160,9 +170,9 @@ typedef enum } ECoderFinishMode; -typedef struct _IStateCoder +typedef struct { - void *p; + void *p; // state object; void (*Free)(void *p, ISzAllocPtr alloc); SRes (*SetProps)(void *p, const Byte *props, size_t propSize, ISzAllocPtr alloc); void (*Init)(void *p); @@ -174,6 +184,20 @@ typedef struct _IStateCoder } IStateCoder; +typedef struct +{ + UInt32 methodId; + UInt32 delta; + UInt32 ip; + UInt32 X86_State; + Byte delta_State[DELTA_STATE_SIZE]; +} CXzBcFilterStateBase; + +typedef SizeT (*Xz_Func_BcFilterStateBase_Filter)(CXzBcFilterStateBase *p, Byte *data, SizeT size); + +SRes Xz_StateCoder_Bc_SetFromMethod_Func(IStateCoder *p, UInt64 id, + Xz_Func_BcFilterStateBase_Filter func, ISzAllocPtr alloc); + #define MIXCODER_NUM_FILTERS_MAX 4 @@ -216,13 +240,13 @@ typedef enum typedef struct { EXzState state; - UInt32 pos; + unsigned pos; unsigned alignPos; unsigned indexPreSize; CXzStreamFlags streamFlags; - UInt32 blockHeaderSize; + unsigned blockHeaderSize; UInt64 packSize; UInt64 unpackSize; @@ -250,8 +274,8 @@ typedef struct size_t outBufSize; size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked - Byte shaDigest[SHA256_DIGEST_SIZE]; - Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; + UInt32 shaDigest32[SHA256_DIGEST_SIZE / 4]; + Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; // it must be aligned for 4-bytes } CXzUnpacker; /* alloc : aligned for cache line allocation is better */ @@ -277,7 +301,10 @@ void XzUnpacker_Free(CXzUnpacker *p); { XzUnpacker_Init() for() + { XzUnpacker_Code(); + } + XzUnpacker_IsStreamWasFinished() } Interface-2 : Direct output buffer: @@ -288,7 +315,10 @@ void XzUnpacker_Free(CXzUnpacker *p); XzUnpacker_Init() XzUnpacker_SetOutBufMode(); // to set output buffer and size for() + { XzUnpacker_Code(); // (dest = NULL) in XzUnpacker_Code() + } + XzUnpacker_IsStreamWasFinished() } Interface-3 : Direct output buffer : One call full decoding @@ -296,6 +326,7 @@ void XzUnpacker_Free(CXzUnpacker *p); It uses Interface-2 internally. { XzUnpacker_CodeFull() + XzUnpacker_IsStreamWasFinished() } */ @@ -309,8 +340,12 @@ void XzUnpacker_Free(CXzUnpacker *p); SZ_OK status: CODER_STATUS_NOT_FINISHED, - CODER_STATUS_NEEDS_MORE_INPUT - maybe there are more xz streams, - call XzUnpacker_IsStreamWasFinished to check that current stream was finished + CODER_STATUS_NEEDS_MORE_INPUT - the decoder can return it in two cases: + 1) it needs more input data to finish current xz stream + 2) xz stream was finished successfully. But the decoder supports multiple + concatented xz streams. So it expects more input data for new xz streams. + Call XzUnpacker_IsStreamWasFinished() to check that latest xz stream was finished successfully. + SZ_ERROR_MEM - Memory allocation error SZ_ERROR_DATA - Data error SZ_ERROR_UNSUPPORTED - Unsupported method or method properties @@ -335,12 +370,17 @@ SRes XzUnpacker_CodeFull(CXzUnpacker *p, Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, ECoderFinishMode finishMode, ECoderStatus *status); +/* +If you decode full xz stream(s), then you can call XzUnpacker_IsStreamWasFinished() +after successful XzUnpacker_CodeFull() or after last call of XzUnpacker_Code(). 
+*/ + BoolInt XzUnpacker_IsStreamWasFinished(const CXzUnpacker *p); /* -XzUnpacker_GetExtraSize() returns then number of uncofirmed bytes, +XzUnpacker_GetExtraSize() returns then number of unconfirmed bytes, if it's in (XZ_STATE_STREAM_HEADER) state or in (XZ_STATE_STREAM_PADDING) state. -These bytes can be some bytes after xz archive, or +These bytes can be some data after xz archive, or it can be start of new xz stream. Call XzUnpacker_GetExtraSize() after XzUnpacker_Code() function to detect real size of @@ -371,29 +411,57 @@ BoolInt XzUnpacker_IsBlockFinished(const CXzUnpacker *p); -/* ---------- Multi Threading Decoding ---------- */ + + + +/* ---- Single-Thread and Multi-Thread xz Decoding with Input/Output Streams ---- */ + +/* + if (CXzDecMtProps::numThreads > 1), the decoder can try to use + Multi-Threading. The decoder analyses xz block header, and if + there are pack size and unpack size values stored in xz block header, + the decoder reads compressed data of block to internal buffers, + and then it can start parallel decoding, if there are another blocks. + The decoder can switch back to Single-Thread decoding after some conditions. + + The sequence of calls for xz decoding with in/out Streams: + { + XzDecMt_Create() + XzDecMtProps_Init(XzDecMtProps) to set default values of properties + // then you can change some XzDecMtProps parameters with required values + // here you can set the number of threads and (memUseMax) - the maximum + Memory usage for multithreading decoding. + for() + { + XzDecMt_Decode() // one call per one file + } + XzDecMt_Destroy() + } +*/ typedef struct { - size_t inBufSize_ST; - size_t outStep_ST; - BoolInt ignoreErrors; + size_t inBufSize_ST; // size of input buffer for Single-Thread decoding + size_t outStep_ST; // size of output buffer for Single-Thread decoding + BoolInt ignoreErrors; // if set to 1, the decoder can ignore some errors and it skips broken parts of data. - #ifndef _7ZIP_ST - unsigned numThreads; - size_t inBufSize_MT; - size_t memUseMax; + #ifndef Z7_ST + unsigned numThreads; // the number of threads for Multi-Thread decoding. if (umThreads == 1) it will use Single-thread decoding + size_t inBufSize_MT; // size of small input data buffers for Multi-Thread decoding. Big number of such small buffers can be created + size_t memUseMax; // the limit of total memory usage for Multi-Thread decoding. + // it's recommended to set (memUseMax) manually to value that is smaller of total size of RAM in computer. #endif } CXzDecMtProps; void XzDecMtProps_Init(CXzDecMtProps *p); - -typedef void * CXzDecMtHandle; +typedef struct CXzDecMt CXzDecMt; +typedef CXzDecMt * CXzDecMtHandle; +// Z7_DECLARE_HANDLE(CXzDecMtHandle) /* - alloc : XzDecMt uses CAlignOffsetAlloc for addresses allocated by (alloc). + alloc : XzDecMt uses CAlignOffsetAlloc internally for addresses allocated by (alloc). allocMid : for big allocations, aligned allocation is better */ @@ -407,33 +475,46 @@ typedef struct Byte NumStreams_Defined; Byte NumBlocks_Defined; - Byte DataAfterEnd; + Byte DataAfterEnd; // there are some additional data after good xz streams, and that data is not new xz stream. Byte DecodingTruncated; // Decoding was Truncated, we need only partial output data - UInt64 InSize; // pack size processed + UInt64 InSize; // pack size processed. 
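The multi-threaded decoding sequence documented above maps onto very little caller code. A hedged sketch follows (DecodeXzFileMt_sketch is an invented wrapper, the thread count is arbitrary, and it assumes the global CRC/SHA tables were already initialized at startup):

static SRes DecodeXzFileMt_sketch(ISeqInStreamPtr inStream, ISeqOutStreamPtr outStream,
    ISzAllocPtr alloc, ISzAllocPtr allocMid)
{
  CXzDecMtProps props;
  CXzStatInfo stat;
  int isMT = 0;
  SRes res;
  CXzDecMtHandle dec = XzDecMt_Create(alloc, allocMid);
  if (!dec)
    return SZ_ERROR_MEM;
  XzDecMtProps_Init(&props);
#ifndef Z7_ST
  props.numThreads = 4;   /* > 1 requests the multi-threaded path */
#endif
  res = XzDecMt_Decode(dec, &props,
      NULL,               /* outDataSize: undefined */
      1,                  /* finishMode: stream(s) must be finished */
      outStream,
      inStream,
      &stat,
      &isMT,
      NULL);              /* no progress callback */
  XzDecMt_Destroy(dec);
  /* res equals stat.CombinedRes; stat.DataAfterEnd marks trailing data
     that followed otherwise valid xz streams */
  return res;
}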
That value doesn't include the data after + // end of xz stream, if that data was not correct UInt64 OutSize; UInt64 NumStreams; UInt64 NumBlocks; - SRes DecodeRes; - SRes ReadRes; - SRes ProgressRes; - SRes CombinedRes; - SRes CombinedRes_Type; + SRes DecodeRes; // the error code of xz streams data decoding + SRes ReadRes; // error code from ISeqInStream:Read() + SRes ProgressRes; // error code from ICompressProgress:Progress() + SRes CombinedRes; // Combined result error code that shows main rusult + // = S_OK, if there is no error. + // but check also (DataAfterEnd) that can show additional minor errors. + + SRes CombinedRes_Type; // = SZ_ERROR_READ, if error from ISeqInStream + // = SZ_ERROR_PROGRESS, if error from ICompressProgress + // = SZ_ERROR_WRITE, if error from ISeqOutStream + // = SZ_ERROR_* codes for decoding } CXzStatInfo; void XzStatInfo_Clear(CXzStatInfo *p); /* + XzDecMt_Decode() -SRes: - SZ_OK - OK +SRes: it's combined decoding result. It also is equal to stat->CombinedRes. + + SZ_OK - no error + check also output value in (stat->DataAfterEnd) + that can show additional possible error + SZ_ERROR_MEM - Memory allocation error SZ_ERROR_NO_ARCHIVE - is not xz archive SZ_ERROR_ARCHIVE - Headers error SZ_ERROR_DATA - Data Error + SZ_ERROR_UNSUPPORTED - Unsupported method or method properties SZ_ERROR_CRC - CRC Error SZ_ERROR_INPUT_EOF - it needs more input data SZ_ERROR_WRITE - ISeqOutStream error @@ -447,13 +528,14 @@ SRes XzDecMt_Decode(CXzDecMtHandle p, const CXzDecMtProps *props, const UInt64 *outDataSize, // NULL means undefined int finishMode, // 0 - partial unpacking is allowed, 1 - xz stream(s) must be finished - ISeqOutStream *outStream, + ISeqOutStreamPtr outStream, // Byte *outBuf, size_t *outBufSize, - ISeqInStream *inStream, + ISeqInStreamPtr inStream, // const Byte *inData, size_t inDataSize, - CXzStatInfo *stat, - int *isMT, // 0 means that ST (Single-Thread) version was used - ICompressProgress *progress); + CXzStatInfo *stat, // out: decoding results and statistics + int *isMT, // out: 0 means that ST (Single-Thread) version was used + // 1 means that MT (Multi-Thread) version was used + ICompressProgressPtr progress); EXTERN_C_END diff --git a/src/sdk/C/XzCrc64.c b/src/sdk/C/XzCrc64.c index b6d02cb..94fc1af 100644 --- a/src/sdk/C/XzCrc64.c +++ b/src/sdk/C/XzCrc64.c @@ -1,5 +1,5 @@ /* XzCrc64.c -- CRC64 calculation -2017-05-23 : Igor Pavlov : Public domain */ +2023-12-08 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -8,45 +8,76 @@ #define kCrc64Poly UINT64_CONST(0xC96C5795D7870F42) -#ifdef MY_CPU_LE - #define CRC64_NUM_TABLES 4 +// for debug only : define Z7_CRC64_DEBUG_BE to test big-endian code in little-endian cpu +// #define Z7_CRC64_DEBUG_BE +#ifdef Z7_CRC64_DEBUG_BE +#undef MY_CPU_LE +#define MY_CPU_BE +#endif + +#ifdef Z7_CRC64_NUM_TABLES + #define Z7_CRC64_NUM_TABLES_USE Z7_CRC64_NUM_TABLES #else - #define CRC64_NUM_TABLES 5 - #define CRC_UINT64_SWAP(v) \ - ((v >> 56) \ - | ((v >> 40) & ((UInt64)0xFF << 8)) \ - | ((v >> 24) & ((UInt64)0xFF << 16)) \ - | ((v >> 8) & ((UInt64)0xFF << 24)) \ - | ((v << 8) & ((UInt64)0xFF << 32)) \ - | ((v << 24) & ((UInt64)0xFF << 40)) \ - | ((v << 40) & ((UInt64)0xFF << 48)) \ - | ((v << 56))) - - UInt64 MY_FAST_CALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table); + #define Z7_CRC64_NUM_TABLES_USE 12 +#endif + +#if Z7_CRC64_NUM_TABLES_USE < 1 + #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES #endif + +#if Z7_CRC64_NUM_TABLES_USE != 1 + #ifndef MY_CPU_BE - UInt64 MY_FAST_CALL 
XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table); + #define FUNC_NAME_LE_2(s) XzCrc64UpdateT ## s + #define FUNC_NAME_LE_1(s) FUNC_NAME_LE_2(s) + #define FUNC_NAME_LE FUNC_NAME_LE_1(Z7_CRC64_NUM_TABLES_USE) + UInt64 Z7_FASTCALL FUNC_NAME_LE (UInt64 v, const void *data, size_t size, const UInt64 *table); +#endif +#ifndef MY_CPU_LE + #define FUNC_NAME_BE_2(s) XzCrc64UpdateBeT ## s + #define FUNC_NAME_BE_1(s) FUNC_NAME_BE_2(s) + #define FUNC_NAME_BE FUNC_NAME_BE_1(Z7_CRC64_NUM_TABLES_USE) + UInt64 Z7_FASTCALL FUNC_NAME_BE (UInt64 v, const void *data, size_t size, const UInt64 *table); #endif -typedef UInt64 (MY_FAST_CALL *CRC64_FUNC)(UInt64 v, const void *data, size_t size, const UInt64 *table); +#if defined(MY_CPU_LE) + #define FUNC_REF FUNC_NAME_LE +#elif defined(MY_CPU_BE) + #define FUNC_REF FUNC_NAME_BE +#else + #define FUNC_REF g_Crc64Update + static UInt64 (Z7_FASTCALL *FUNC_REF)(UInt64 v, const void *data, size_t size, const UInt64 *table); +#endif -static CRC64_FUNC g_Crc64Update; -UInt64 g_Crc64Table[256 * CRC64_NUM_TABLES]; +#endif + + +MY_ALIGN(64) +static UInt64 g_Crc64Table[256 * Z7_CRC64_NUM_TABLES_USE]; -UInt64 MY_FAST_CALL Crc64Update(UInt64 v, const void *data, size_t size) -{ - return g_Crc64Update(v, data, size, g_Crc64Table); -} -UInt64 MY_FAST_CALL Crc64Calc(const void *data, size_t size) +UInt64 Z7_FASTCALL Crc64Update(UInt64 v, const void *data, size_t size) { - return g_Crc64Update(CRC64_INIT_VAL, data, size, g_Crc64Table) ^ CRC64_INIT_VAL; +#if Z7_CRC64_NUM_TABLES_USE == 1 + #define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) + const UInt64 *table = g_Crc64Table; + const Byte *p = (const Byte *)data; + const Byte *lim = p + size; + for (; p != lim; p++) + v = CRC64_UPDATE_BYTE_2(v, *p); + return v; + #undef CRC64_UPDATE_BYTE_2 +#else + return FUNC_REF (v, data, size, g_Crc64Table); +#endif } -void MY_FAST_CALL Crc64GenerateTable() + +Z7_NO_INLINE +void Z7_FASTCALL Crc64GenerateTable(void) { - UInt32 i; + unsigned i; for (i = 0; i < 256; i++) { UInt64 r = i; @@ -55,32 +86,55 @@ void MY_FAST_CALL Crc64GenerateTable() r = (r >> 1) ^ (kCrc64Poly & ((UInt64)0 - (r & 1))); g_Crc64Table[i] = r; } - for (i = 256; i < 256 * CRC64_NUM_TABLES; i++) + +#if Z7_CRC64_NUM_TABLES_USE != 1 +#if 1 || 1 && defined(MY_CPU_X86) // low register count + for (i = 0; i < 256 * (Z7_CRC64_NUM_TABLES_USE - 1); i++) { - UInt64 r = g_Crc64Table[(size_t)i - 256]; - g_Crc64Table[i] = g_Crc64Table[r & 0xFF] ^ (r >> 8); + const UInt64 r0 = g_Crc64Table[(size_t)i]; + g_Crc64Table[(size_t)i + 256] = g_Crc64Table[(Byte)r0] ^ (r0 >> 8); } - - #ifdef MY_CPU_LE - - g_Crc64Update = XzCrc64UpdateT4; +#else + for (i = 0; i < 256 * (Z7_CRC64_NUM_TABLES_USE - 1); i += 2) + { + UInt64 r0 = g_Crc64Table[(size_t)(i) ]; + UInt64 r1 = g_Crc64Table[(size_t)(i) + 1]; + r0 = g_Crc64Table[(Byte)r0] ^ (r0 >> 8); + r1 = g_Crc64Table[(Byte)r1] ^ (r1 >> 8); + g_Crc64Table[(size_t)i + 256 ] = r0; + g_Crc64Table[(size_t)i + 256 + 1] = r1; + } +#endif - #else +#ifndef MY_CPU_LE { - #ifndef MY_CPU_BE +#ifndef MY_CPU_BE UInt32 k = 1; if (*(const Byte *)&k == 1) - g_Crc64Update = XzCrc64UpdateT4; + FUNC_REF = FUNC_NAME_LE; else - #endif +#endif { - for (i = 256 * CRC64_NUM_TABLES - 1; i >= 256; i--) +#ifndef MY_CPU_BE + FUNC_REF = FUNC_NAME_BE; +#endif + for (i = 0; i < 256 * Z7_CRC64_NUM_TABLES_USE; i++) { - UInt64 x = g_Crc64Table[(size_t)i - 256]; - g_Crc64Table[i] = CRC_UINT64_SWAP(x); + const UInt64 x = g_Crc64Table[i]; + g_Crc64Table[i] = Z7_BSWAP64(x); } - 
g_Crc64Update = XzCrc64UpdateT1_BeT4; } } - #endif +#endif // ndef MY_CPU_LE +#endif // Z7_CRC64_NUM_TABLES_USE != 1 } + +#undef kCrc64Poly +#undef Z7_CRC64_NUM_TABLES_USE +#undef FUNC_REF +#undef FUNC_NAME_LE_2 +#undef FUNC_NAME_LE_1 +#undef FUNC_NAME_LE +#undef FUNC_NAME_BE_2 +#undef FUNC_NAME_BE_1 +#undef FUNC_NAME_BE diff --git a/src/sdk/C/XzCrc64.h b/src/sdk/C/XzCrc64.h index 08dbc33..04f8153 100644 --- a/src/sdk/C/XzCrc64.h +++ b/src/sdk/C/XzCrc64.h @@ -1,8 +1,8 @@ /* XzCrc64.h -- CRC64 calculation -2013-01-18 : Igor Pavlov : Public domain */ +2023-12-08 : Igor Pavlov : Public domain */ -#ifndef __XZ_CRC64_H -#define __XZ_CRC64_H +#ifndef ZIP7_INC_XZ_CRC64_H +#define ZIP7_INC_XZ_CRC64_H #include @@ -10,16 +10,16 @@ EXTERN_C_BEGIN -extern UInt64 g_Crc64Table[]; +// extern UInt64 g_Crc64Table[]; -void MY_FAST_CALL Crc64GenerateTable(void); +void Z7_FASTCALL Crc64GenerateTable(void); #define CRC64_INIT_VAL UINT64_CONST(0xFFFFFFFFFFFFFFFF) #define CRC64_GET_DIGEST(crc) ((crc) ^ CRC64_INIT_VAL) -#define CRC64_UPDATE_BYTE(crc, b) (g_Crc64Table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +// #define CRC64_UPDATE_BYTE(crc, b) (g_Crc64Table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) -UInt64 MY_FAST_CALL Crc64Update(UInt64 crc, const void *data, size_t size); -UInt64 MY_FAST_CALL Crc64Calc(const void *data, size_t size); +UInt64 Z7_FASTCALL Crc64Update(UInt64 crc, const void *data, size_t size); +// UInt64 Z7_FASTCALL Crc64Calc(const void *data, size_t size); EXTERN_C_END diff --git a/src/sdk/C/XzCrc64Opt.c b/src/sdk/C/XzCrc64Opt.c index b2852de..6eea4a3 100644 --- a/src/sdk/C/XzCrc64Opt.c +++ b/src/sdk/C/XzCrc64Opt.c @@ -1,69 +1,261 @@ -/* XzCrc64Opt.c -- CRC64 calculation -2017-06-30 : Igor Pavlov : Public domain */ +/* XzCrc64Opt.c -- CRC64 calculation (optimized functions) +: Igor Pavlov : Public domain */ #include "Precomp.h" #include "CpuArch.h" +#if !defined(Z7_CRC64_NUM_TABLES) || Z7_CRC64_NUM_TABLES > 1 + +// for debug only : define Z7_CRC64_DEBUG_BE to test big-endian code in little-endian cpu +// #define Z7_CRC64_DEBUG_BE +#ifdef Z7_CRC64_DEBUG_BE +#undef MY_CPU_LE +#define MY_CPU_BE +#endif + +#if defined(MY_CPU_64BIT) +#define Z7_CRC64_USE_64BIT +#endif + +// the value Z7_CRC64_NUM_TABLES_USE must be defined to same value as in XzCrc64.c +#ifdef Z7_CRC64_NUM_TABLES +#define Z7_CRC64_NUM_TABLES_USE Z7_CRC64_NUM_TABLES +#else +#define Z7_CRC64_NUM_TABLES_USE 12 +#endif + +#if Z7_CRC64_NUM_TABLES_USE % 4 || \ + Z7_CRC64_NUM_TABLES_USE < 4 || \ + Z7_CRC64_NUM_TABLES_USE > 4 * 4 + #error Stop_Compiling_Bad_CRC64_NUM_TABLES +#endif + + #ifndef MY_CPU_BE -#define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +#define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) + +#if defined(Z7_CRC64_USE_64BIT) && (Z7_CRC64_NUM_TABLES_USE % 8 == 0) + +#define Q64LE(n, d) \ + ( (table + ((n) * 8 + 7) * 0x100)[((d) ) & 0xFF] \ + ^ (table + ((n) * 8 + 6) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 5) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 4) * 0x100)[((d) >> 3 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 3) * 0x100)[((d) >> 4 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 2) * 0x100)[((d) >> 5 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 1) * 0x100)[((d) >> 6 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 0) * 0x100)[((d) >> 7 * 8)] ) + +#define R64(a) *((const UInt64 *)(const void *)p + (a)) + +#else + +#define Q32LE(n, d) \ + ( (table + ((n) * 4 + 3) * 0x100)[((d) ) & 0xFF] \ + ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 
4 + 1) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 0) * 0x100)[((d) >> 3 * 8)] ) + +#define R32(a) *((const UInt32 *)(const void *)p + (a)) + +#endif + + +#define CRC64_FUNC_PRE_LE2(step) \ +UInt64 Z7_FASTCALL XzCrc64UpdateT ## step (UInt64 v, const void *data, size_t size, const UInt64 *table) + +#define CRC64_FUNC_PRE_LE(step) \ + CRC64_FUNC_PRE_LE2(step); \ + CRC64_FUNC_PRE_LE2(step) -UInt64 MY_FAST_CALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table) +CRC64_FUNC_PRE_LE(Z7_CRC64_NUM_TABLES_USE) { const Byte *p = (const Byte *)data; - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) + const Byte *lim; + for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC64_NUM_TABLES_USE & 4))) != 0; size--, p++) v = CRC64_UPDATE_BYTE_2(v, *p); - for (; size >= 4; size -= 4, p += 4) + lim = p + size; + if (size >= Z7_CRC64_NUM_TABLES_USE) { - UInt32 d = (UInt32)v ^ *(const UInt32 *)p; - v = (v >> 32) - ^ (table + 0x300)[((d ) & 0xFF)] - ^ (table + 0x200)[((d >> 8) & 0xFF)] - ^ (table + 0x100)[((d >> 16) & 0xFF)] - ^ (table + 0x000)[((d >> 24))]; + lim -= Z7_CRC64_NUM_TABLES_USE; + do + { +#if Z7_CRC64_NUM_TABLES_USE == 4 + const UInt32 d = (UInt32)v ^ R32(0); + v = (v >> 32) ^ Q32LE(0, d); +#elif Z7_CRC64_NUM_TABLES_USE == 8 +#ifdef Z7_CRC64_USE_64BIT + v ^= R64(0); + v = Q64LE(0, v); +#else + UInt32 v0, v1; + v0 = (UInt32)v ^ R32(0); + v1 = (UInt32)(v >> 32) ^ R32(1); + v = Q32LE(1, v0) ^ Q32LE(0, v1); +#endif +#elif Z7_CRC64_NUM_TABLES_USE == 12 + UInt32 w; + UInt32 v0, v1; + v0 = (UInt32)v ^ R32(0); + v1 = (UInt32)(v >> 32) ^ R32(1); + w = R32(2); + v = Q32LE(0, w); + v ^= Q32LE(2, v0) ^ Q32LE(1, v1); +#elif Z7_CRC64_NUM_TABLES_USE == 16 +#ifdef Z7_CRC64_USE_64BIT + UInt64 w; + UInt64 x; + w = R64(1); x = Q64LE(0, w); + v ^= R64(0); v = x ^ Q64LE(1, v); +#else + UInt32 v0, v1; + UInt32 r0, r1; + v0 = (UInt32)v ^ R32(0); + v1 = (UInt32)(v >> 32) ^ R32(1); + r0 = R32(2); + r1 = R32(3); + v = Q32LE(1, r0) ^ Q32LE(0, r1); + v ^= Q32LE(3, v0) ^ Q32LE(2, v1); +#endif +#else +#error Stop_Compiling_Bad_CRC64_NUM_TABLES +#endif + p += Z7_CRC64_NUM_TABLES_USE; + } + while (p <= lim); + lim += Z7_CRC64_NUM_TABLES_USE; } - for (; size > 0; size--, p++) + for (; p < lim; p++) v = CRC64_UPDATE_BYTE_2(v, *p); return v; } +#undef CRC64_UPDATE_BYTE_2 +#undef R32 +#undef R64 +#undef Q32LE +#undef Q64LE +#undef CRC64_FUNC_PRE_LE +#undef CRC64_FUNC_PRE_LE2 + #endif + + #ifndef MY_CPU_LE -#define CRC_UINT64_SWAP(v) \ - ((v >> 56) \ - | ((v >> 40) & ((UInt64)0xFF << 8)) \ - | ((v >> 24) & ((UInt64)0xFF << 16)) \ - | ((v >> 8) & ((UInt64)0xFF << 24)) \ - | ((v << 8) & ((UInt64)0xFF << 32)) \ - | ((v << 24) & ((UInt64)0xFF << 40)) \ - | ((v << 40) & ((UInt64)0xFF << 48)) \ - | ((v << 56))) +#define CRC64_UPDATE_BYTE_2_BE(crc, b) (table[((crc) >> 56) ^ (b)] ^ ((crc) << 8)) + +#if defined(Z7_CRC64_USE_64BIT) && (Z7_CRC64_NUM_TABLES_USE % 8 == 0) + +#define Q64BE(n, d) \ + ( (table + ((n) * 8 + 0) * 0x100)[(Byte)(d)] \ + ^ (table + ((n) * 8 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 3) * 0x100)[((d) >> 3 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 4) * 0x100)[((d) >> 4 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 5) * 0x100)[((d) >> 5 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 6) * 0x100)[((d) >> 6 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 7) * 0x100)[((d) >> 7 * 8)] ) + +#ifdef Z7_CRC64_DEBUG_BE + #define R64BE(a) GetBe64a((const UInt64 *)(const void *)p + (a)) +#else + #define R64BE(a) *((const UInt64 
*)(const void *)p + (a)) +#endif + +#else -#define CRC64_UPDATE_BYTE_2_BE(crc, b) (table[(Byte)((crc) >> 56) ^ (b)] ^ ((crc) << 8)) +#define Q32BE(n, d) \ + ( (table + ((n) * 4 + 0) * 0x100)[(Byte)(d)] \ + ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 3) * 0x100)[((d) >> 3 * 8)] ) -UInt64 MY_FAST_CALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table) +#ifdef Z7_CRC64_DEBUG_BE + #define R32BE(a) GetBe32a((const UInt32 *)(const void *)p + (a)) +#else + #define R32BE(a) *((const UInt32 *)(const void *)p + (a)) +#endif + +#endif + +#define CRC64_FUNC_PRE_BE2(step) \ +UInt64 Z7_FASTCALL XzCrc64UpdateBeT ## step (UInt64 v, const void *data, size_t size, const UInt64 *table) + +#define CRC64_FUNC_PRE_BE(step) \ + CRC64_FUNC_PRE_BE2(step); \ + CRC64_FUNC_PRE_BE2(step) + +CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE) { const Byte *p = (const Byte *)data; - table += 0x100; - v = CRC_UINT64_SWAP(v); - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) + const Byte *lim; + v = Z7_BSWAP64(v); + for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC64_NUM_TABLES_USE & 4))) != 0; size--, p++) v = CRC64_UPDATE_BYTE_2_BE(v, *p); - for (; size >= 4; size -= 4, p += 4) + lim = p + size; + if (size >= Z7_CRC64_NUM_TABLES_USE) { - UInt32 d = (UInt32)(v >> 32) ^ *(const UInt32 *)p; - v = (v << 32) - ^ (table + 0x000)[((d ) & 0xFF)] - ^ (table + 0x100)[((d >> 8) & 0xFF)] - ^ (table + 0x200)[((d >> 16) & 0xFF)] - ^ (table + 0x300)[((d >> 24))]; + lim -= Z7_CRC64_NUM_TABLES_USE; + do + { +#if Z7_CRC64_NUM_TABLES_USE == 4 + const UInt32 d = (UInt32)(v >> 32) ^ R32BE(0); + v = (v << 32) ^ Q32BE(0, d); +#elif Z7_CRC64_NUM_TABLES_USE == 12 + const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); + const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); + const UInt32 w = R32BE(2); + v = Q32BE(0, w); + v ^= Q32BE(2, d1) ^ Q32BE(1, d0); + +#elif Z7_CRC64_NUM_TABLES_USE == 8 + #ifdef Z7_CRC64_USE_64BIT + v ^= R64BE(0); + v = Q64BE(0, v); + #else + const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); + const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); + v = Q32BE(1, d1) ^ Q32BE(0, d0); + #endif +#elif Z7_CRC64_NUM_TABLES_USE == 16 + #ifdef Z7_CRC64_USE_64BIT + const UInt64 w = R64BE(1); + v ^= R64BE(0); + v = Q64BE(0, w) ^ Q64BE(1, v); + #else + const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); + const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); + const UInt32 w1 = R32BE(2); + const UInt32 w0 = R32BE(3); + v = Q32BE(1, w1) ^ Q32BE(0, w0); + v ^= Q32BE(3, d1) ^ Q32BE(2, d0); + #endif +#else +#error Stop_Compiling_Bad_CRC64_NUM_TABLES +#endif + p += Z7_CRC64_NUM_TABLES_USE; + } + while (p <= lim); + lim += Z7_CRC64_NUM_TABLES_USE; } - for (; size > 0; size--, p++) + for (; p < lim; p++) v = CRC64_UPDATE_BYTE_2_BE(v, *p); - return CRC_UINT64_SWAP(v); + return Z7_BSWAP64(v); } +#undef CRC64_UPDATE_BYTE_2_BE +#undef R32BE +#undef R64BE +#undef Q32BE +#undef Q64BE +#undef CRC64_FUNC_PRE_BE +#undef CRC64_FUNC_PRE_BE2 + +#endif +#undef Z7_CRC64_NUM_TABLES_USE #endif diff --git a/src/sdk/C/XzDec.c b/src/sdk/C/XzDec.c index 395e83f..2dac324 100644 --- a/src/sdk/C/XzDec.c +++ b/src/sdk/C/XzDec.c @@ -1,5 +1,5 @@ /* XzDec.c -- Xz Decode -2019-02-02 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -59,7 +59,7 @@ unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value) for (i = 0; i < limit;) { - Byte b = p[i]; + const unsigned b = p[i]; *value |= (UInt64)(b & 0x7F) << (7 * i++); if 
((b & 0x80) == 0) return (b == 0 && i != 1) ? 0 : i; @@ -67,7 +67,8 @@ unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value) return 0; } -/* ---------- BraState ---------- */ + +/* ---------- XzBcFilterState ---------- */ #define BRA_BUF_SIZE (1 << 14) @@ -76,55 +77,60 @@ typedef struct size_t bufPos; size_t bufConv; size_t bufTotal; + Byte *buf; // must be aligned for 4 bytes + Xz_Func_BcFilterStateBase_Filter filter_func; + // int encodeMode; + CXzBcFilterStateBase base; + // Byte buf[BRA_BUF_SIZE]; +} CXzBcFilterState; - int encodeMode; - - UInt32 methodId; - UInt32 delta; - UInt32 ip; - UInt32 x86State; - Byte deltaState[DELTA_STATE_SIZE]; - Byte buf[BRA_BUF_SIZE]; -} CBraState; - -static void BraState_Free(void *pp, ISzAllocPtr alloc) +static void XzBcFilterState_Free(void *pp, ISzAllocPtr alloc) { - ISzAlloc_Free(alloc, pp); + if (pp) + { + CXzBcFilterState *p = ((CXzBcFilterState *)pp); + ISzAlloc_Free(alloc, p->buf); + ISzAlloc_Free(alloc, pp); + } } -static SRes BraState_SetProps(void *pp, const Byte *props, size_t propSize, ISzAllocPtr alloc) + +static SRes XzBcFilterState_SetProps(void *pp, const Byte *props, size_t propSize, ISzAllocPtr alloc) { - CBraState *p = ((CBraState *)pp); - UNUSED_VAR(alloc); + CXzBcFilterStateBase *p = &((CXzBcFilterState *)pp)->base; + UNUSED_VAR(alloc) p->ip = 0; if (p->methodId == XZ_ID_Delta) { if (propSize != 1) return SZ_ERROR_UNSUPPORTED; - p->delta = (unsigned)props[0] + 1; + p->delta = (UInt32)props[0] + 1; } else { if (propSize == 4) { - UInt32 v = GetUi32(props); + const UInt32 v = GetUi32(props); switch (p->methodId) { case XZ_ID_PPC: case XZ_ID_ARM: case XZ_ID_SPARC: - if ((v & 3) != 0) + case XZ_ID_ARM64: + if (v & 3) return SZ_ERROR_UNSUPPORTED; break; case XZ_ID_ARMT: - if ((v & 1) != 0) + case XZ_ID_RISCV: + if (v & 1) return SZ_ERROR_UNSUPPORTED; break; case XZ_ID_IA64: - if ((v & 0xF) != 0) + if (v & 0xf) return SZ_ERROR_UNSUPPORTED; break; + default: break; } p->ip = v; } @@ -134,73 +140,91 @@ static SRes BraState_SetProps(void *pp, const Byte *props, size_t propSize, ISzA return SZ_OK; } -static void BraState_Init(void *pp) + +static void XzBcFilterState_Init(void *pp) { - CBraState *p = ((CBraState *)pp); + CXzBcFilterState *p = ((CXzBcFilterState *)pp); p->bufPos = p->bufConv = p->bufTotal = 0; - x86_Convert_Init(p->x86State); - if (p->methodId == XZ_ID_Delta) - Delta_Init(p->deltaState); + p->base.X86_State = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL; + if (p->base.methodId == XZ_ID_Delta) + Delta_Init(p->base.delta_State); } -#define CASE_BRA_CONV(isa) case XZ_ID_ ## isa: size = isa ## _Convert(data, size, p->ip, p->encodeMode); break; - -static SizeT BraState_Filter(void *pp, Byte *data, SizeT size) +static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Dec[] = +{ + Z7_BRANCH_CONV_DEC_2 (BranchConv_PPC), + Z7_BRANCH_CONV_DEC_2 (BranchConv_IA64), + Z7_BRANCH_CONV_DEC_2 (BranchConv_ARM), + Z7_BRANCH_CONV_DEC_2 (BranchConv_ARMT), + Z7_BRANCH_CONV_DEC_2 (BranchConv_SPARC), + Z7_BRANCH_CONV_DEC_2 (BranchConv_ARM64), + Z7_BRANCH_CONV_DEC_2 (BranchConv_RISCV) +}; + +static SizeT XzBcFilterStateBase_Filter_Dec(CXzBcFilterStateBase *p, Byte *data, SizeT size) { - CBraState *p = ((CBraState *)pp); switch (p->methodId) { case XZ_ID_Delta: - if (p->encodeMode) - Delta_Encode(p->deltaState, p->delta, data, size); - else - Delta_Decode(p->deltaState, p->delta, data, size); + Delta_Decode(p->delta_State, p->delta, data, size); break; case XZ_ID_X86: - size = x86_Convert(data, size, p->ip, &p->x86State, p->encodeMode); + size = 
(SizeT)(z7_BranchConvSt_X86_Dec(data, size, p->ip, &p->X86_State) - data); + break; + default: + if (p->methodId >= XZ_ID_PPC) + { + const UInt32 i = p->methodId - XZ_ID_PPC; + if (i < Z7_ARRAY_SIZE(g_Funcs_BranchConv_RISC_Dec)) + size = (SizeT)(g_Funcs_BranchConv_RISC_Dec[i](data, size, p->ip) - data); + } break; - CASE_BRA_CONV(PPC) - CASE_BRA_CONV(IA64) - CASE_BRA_CONV(ARM) - CASE_BRA_CONV(ARMT) - CASE_BRA_CONV(SPARC) } p->ip += (UInt32)size; return size; } -static SRes BraState_Code2(void *pp, +static SizeT XzBcFilterState_Filter(void *pp, Byte *data, SizeT size) +{ + CXzBcFilterState *p = ((CXzBcFilterState *)pp); + return p->filter_func(&p->base, data, size); +} + + +static SRes XzBcFilterState_Code2(void *pp, Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, int srcWasFinished, ECoderFinishMode finishMode, // int *wasFinished ECoderStatus *status) { - CBraState *p = ((CBraState *)pp); + CXzBcFilterState *p = ((CXzBcFilterState *)pp); SizeT destRem = *destLen; SizeT srcRem = *srcLen; - UNUSED_VAR(finishMode); + UNUSED_VAR(finishMode) *destLen = 0; *srcLen = 0; // *wasFinished = False; *status = CODER_STATUS_NOT_FINISHED; - while (destRem > 0) + while (destRem != 0) { - if (p->bufPos != p->bufConv) { size_t size = p->bufConv - p->bufPos; - if (size > destRem) - size = destRem; - memcpy(dest, p->buf + p->bufPos, size); - p->bufPos += size; - *destLen += size; - dest += size; - destRem -= size; - continue; + if (size) + { + if (size > destRem) + size = destRem; + memcpy(dest, p->buf + p->bufPos, size); + p->bufPos += size; + *destLen += size; + dest += size; + destRem -= size; + continue; + } } p->bufTotal -= p->bufPos; @@ -220,7 +244,7 @@ static SRes BraState_Code2(void *pp, if (p->bufTotal == 0) break; - p->bufConv = BraState_Filter(pp, p->buf, p->bufTotal); + p->bufConv = p->filter_func(&p->base, p->buf, p->bufTotal); if (p->bufConv == 0) { @@ -240,26 +264,37 @@ static SRes BraState_Code2(void *pp, } -SRes BraState_SetFromMethod(IStateCoder *p, UInt64 id, int encodeMode, ISzAllocPtr alloc) +#define XZ_IS_SUPPORTED_FILTER_ID(id) \ + ((id) >= XZ_ID_Delta && (id) <= XZ_ID_RISCV) + +SRes Xz_StateCoder_Bc_SetFromMethod_Func(IStateCoder *p, UInt64 id, + Xz_Func_BcFilterStateBase_Filter func, ISzAllocPtr alloc) { - CBraState *decoder; - if (id < XZ_ID_Delta || id > XZ_ID_SPARC) + CXzBcFilterState *decoder; + if (!XZ_IS_SUPPORTED_FILTER_ID(id)) return SZ_ERROR_UNSUPPORTED; - decoder = (CBraState *)p->p; + decoder = (CXzBcFilterState *)p->p; if (!decoder) { - decoder = (CBraState *)ISzAlloc_Alloc(alloc, sizeof(CBraState)); + decoder = (CXzBcFilterState *)ISzAlloc_Alloc(alloc, sizeof(CXzBcFilterState)); if (!decoder) return SZ_ERROR_MEM; + decoder->buf = ISzAlloc_Alloc(alloc, BRA_BUF_SIZE); + if (!decoder->buf) + { + ISzAlloc_Free(alloc, decoder); + return SZ_ERROR_MEM; + } p->p = decoder; - p->Free = BraState_Free; - p->SetProps = BraState_SetProps; - p->Init = BraState_Init; - p->Code2 = BraState_Code2; - p->Filter = BraState_Filter; + p->Free = XzBcFilterState_Free; + p->SetProps = XzBcFilterState_SetProps; + p->Init = XzBcFilterState_Init; + p->Code2 = XzBcFilterState_Code2; + p->Filter = XzBcFilterState_Filter; + decoder->filter_func = func; } - decoder->methodId = (UInt32)id; - decoder->encodeMode = encodeMode; + decoder->base.methodId = (UInt32)id; + // decoder->encodeMode = encodeMode; return SZ_OK; } @@ -278,9 +313,9 @@ static void SbState_Free(void *pp, ISzAllocPtr alloc) static SRes SbState_SetProps(void *pp, const Byte *props, size_t propSize, ISzAllocPtr alloc) { - 
UNUSED_VAR(pp); - UNUSED_VAR(props); - UNUSED_VAR(alloc); + UNUSED_VAR(pp) + UNUSED_VAR(props) + UNUSED_VAR(alloc) return (propSize == 0) ? SZ_OK : SZ_ERROR_UNSUPPORTED; } @@ -296,7 +331,7 @@ static SRes SbState_Code2(void *pp, Byte *dest, SizeT *destLen, const Byte *src, { CSbDec *p = (CSbDec *)pp; SRes res; - UNUSED_VAR(srcWasFinished); + UNUSED_VAR(srcWasFinished) p->dest = dest; p->destLen = *destLen; p->src = src; @@ -388,7 +423,7 @@ static SRes Lzma2State_Code2(void *pp, Byte *dest, SizeT *destLen, const Byte *s ELzmaStatus status2; /* ELzmaFinishMode fm = (finishMode == LZMA_FINISH_ANY) ? LZMA_FINISH_ANY : LZMA_FINISH_END; */ SRes res; - UNUSED_VAR(srcWasFinished); + UNUSED_VAR(srcWasFinished) if (spec->outBufMode) { SizeT dicPos = spec->decoder.decoder.dicPos; @@ -419,7 +454,7 @@ static SRes Lzma2State_SetFromMethod(IStateCoder *p, Byte *outBuf, size_t outBuf p->Init = Lzma2State_Init; p->Code2 = Lzma2State_Code2; p->Filter = NULL; - Lzma2Dec_Construct(&spec->decoder); + Lzma2Dec_CONSTRUCT(&spec->decoder) } spec->outBufMode = False; if (outBuf) @@ -509,26 +544,24 @@ static SRes MixCoder_SetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 met { IStateCoder *sc = &p->coders[coderIndex]; p->ids[coderIndex] = methodId; - switch (methodId) - { - case XZ_ID_LZMA2: return Lzma2State_SetFromMethod(sc, outBuf, outBufSize, p->alloc); - #ifdef USE_SUBBLOCK - case XZ_ID_Subblock: return SbState_SetFromMethod(sc, p->alloc); - #endif - } + if (methodId == XZ_ID_LZMA2) + return Lzma2State_SetFromMethod(sc, outBuf, outBufSize, p->alloc); +#ifdef USE_SUBBLOCK + if (methodId == XZ_ID_Subblock) + return SbState_SetFromMethod(sc, p->alloc); +#endif if (coderIndex == 0) return SZ_ERROR_UNSUPPORTED; - return BraState_SetFromMethod(sc, methodId, 0, p->alloc); + return Xz_StateCoder_Bc_SetFromMethod_Func(sc, methodId, + XzBcFilterStateBase_Filter_Dec, p->alloc); } static SRes MixCoder_ResetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 methodId, Byte *outBuf, size_t outBufSize) { IStateCoder *sc = &p->coders[coderIndex]; - switch (methodId) - { - case XZ_ID_LZMA2: return Lzma2State_ResetOutBuf(sc, outBuf, outBufSize); - } + if (methodId == XZ_ID_LZMA2) + return Lzma2State_ResetOutBuf(sc, outBuf, outBufSize); return SZ_ERROR_UNSUPPORTED; } @@ -567,7 +600,7 @@ static SRes MixCoder_Code(CMixCoder *p, SizeT destLen2, srcLen2; int wasFinished; - PRF_STR("------- MixCoder Single ----------"); + PRF_STR("------- MixCoder Single ----------") srcLen2 = srcLenOrig; destLen2 = destLenOrig; @@ -614,14 +647,14 @@ static SRes MixCoder_Code(CMixCoder *p, processed = coder->Filter(coder->p, p->outBuf, processed); if (wasFinished || (destFinish && p->outWritten == destLenOrig)) processed = p->outWritten; - PRF_STR_INT("filter", i); + PRF_STR_INT("filter", i) } *destLen = processed; } return res; } - PRF_STR("standard mix"); + PRF_STR("standard mix") if (p->numCoders != 1) { @@ -763,21 +796,21 @@ SRes Xz_ParseHeader(CXzStreamFlags *p, const Byte *buf) static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf) { - return indexSize == (((UInt64)GetUi32(buf + 4) + 1) << 2) - && GetUi32(buf) == CrcCalc(buf + 4, 6) - && flags == GetBe16(buf + 8) - && buf[10] == XZ_FOOTER_SIG_0 - && buf[11] == XZ_FOOTER_SIG_1; + return indexSize == (((UInt64)GetUi32a(buf + 4) + 1) << 2) + && GetUi32a(buf) == CrcCalc(buf + 4, 6) + && flags == GetBe16a(buf + 8) + && GetUi16a(buf + 10) == (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8)); } #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ - { unsigned s = 
Xz_ReadVarInt(buf + pos, size - pos, res); \ - if (s == 0) return SZ_ERROR_ARCHIVE; pos += s; } + { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ + if (s == 0) return SZ_ERROR_ARCHIVE; \ + pos += s; } static BoolInt XzBlock_AreSupportedFilters(const CXzBlock *p) { - unsigned numFilters = XzBlock_GetNumFilters(p) - 1; + const unsigned numFilters = XzBlock_GetNumFilters(p) - 1; unsigned i; { const CXzFilter *f = &p->filters[numFilters]; @@ -793,8 +826,7 @@ static BoolInt XzBlock_AreSupportedFilters(const CXzBlock *p) if (f->propsSize != 1) return False; } - else if (f->id < XZ_ID_Delta - || f->id > XZ_ID_SPARC + else if (!XZ_IS_SUPPORTED_FILTER_ID(f->id) || (f->propsSize != 0 && f->propsSize != 4)) return False; } @@ -819,22 +851,24 @@ SRes XzBlock_Parse(CXzBlock *p, const Byte *header) p->packSize = (UInt64)(Int64)-1; if (XzBlock_HasPackSize(p)) { - READ_VARINT_AND_CHECK(header, pos, headerSize, &p->packSize); + READ_VARINT_AND_CHECK(header, pos, headerSize, &p->packSize) if (p->packSize == 0 || p->packSize + headerSize >= (UInt64)1 << 63) return SZ_ERROR_ARCHIVE; } p->unpackSize = (UInt64)(Int64)-1; if (XzBlock_HasUnpackSize(p)) - READ_VARINT_AND_CHECK(header, pos, headerSize, &p->unpackSize); + { + READ_VARINT_AND_CHECK(header, pos, headerSize, &p->unpackSize) + } numFilters = XzBlock_GetNumFilters(p); for (i = 0; i < numFilters; i++) { CXzFilter *filter = p->filters + i; UInt64 size; - READ_VARINT_AND_CHECK(header, pos, headerSize, &filter->id); - READ_VARINT_AND_CHECK(header, pos, headerSize, &size); + READ_VARINT_AND_CHECK(header, pos, headerSize, &filter->id) + READ_VARINT_AND_CHECK(header, pos, headerSize, &size) if (size > headerSize - pos || size > XZ_FILTER_PROPS_SIZE_MAX) return SZ_ERROR_ARCHIVE; filter->propsSize = (UInt32)size; @@ -892,20 +926,20 @@ static SRes XzDecMix_Init(CMixCoder *p, const CXzBlock *block, Byte *outBuf, siz MixCoder_Free(p); for (i = 0; i < numFilters; i++) { - RINOK(MixCoder_SetFromMethod(p, i, block->filters[numFilters - 1 - i].id, outBuf, outBufSize)); + RINOK(MixCoder_SetFromMethod(p, i, block->filters[numFilters - 1 - i].id, outBuf, outBufSize)) } p->numCoders = numFilters; } else { - RINOK(MixCoder_ResetFromMethod(p, 0, block->filters[numFilters - 1].id, outBuf, outBufSize)); + RINOK(MixCoder_ResetFromMethod(p, 0, block->filters[numFilters - 1].id, outBuf, outBufSize)) } for (i = 0; i < numFilters; i++) { const CXzFilter *f = &block->filters[numFilters - 1 - i]; IStateCoder *sc = &p->coders[i]; - RINOK(sc->SetProps(sc->p, f->props, f->propsSize, p->alloc)); + RINOK(sc->SetProps(sc->p, f->props, f->propsSize, p->alloc)) } MixCoder_Init(p); @@ -999,7 +1033,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, SRes res; ECoderFinishMode finishMode2 = finishMode; - BoolInt srcFinished2 = srcFinished; + BoolInt srcFinished2 = (BoolInt)srcFinished; BoolInt destFinish = False; if (p->block.packSize != (UInt64)(Int64)-1) @@ -1038,7 +1072,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, (p->outBuf ? NULL : dest), &destLen2, destFinish, src, &srcLen2, srcFinished2, finishMode2); - + *status = p->decoder.status; XzCheck_Update(&p->check, (p->outBuf ? 
p->outBuf + p->outDataWritten : dest), destLen2); if (!p->outBuf) @@ -1052,14 +1086,14 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, (*destLen) += destLen2; p->unpackSize += destLen2; - RINOK(res); + RINOK(res) if (*status != CODER_STATUS_FINISHED_WITH_MARK) { if (p->block.packSize == p->packSize && *status == CODER_STATUS_NEEDS_MORE_INPUT) { - PRF_STR("CODER_STATUS_NEEDS_MORE_INPUT"); + PRF_STR("CODER_STATUS_NEEDS_MORE_INPUT") *status = CODER_STATUS_NOT_SPECIFIED; return SZ_ERROR_DATA; } @@ -1076,7 +1110,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, if ((p->block.packSize != (UInt64)(Int64)-1 && p->block.packSize != p->packSize) || (p->block.unpackSize != (UInt64)(Int64)-1 && p->block.unpackSize != p->unpackSize)) { - PRF_STR("ERROR: block.size mismatch"); + PRF_STR("ERROR: block.size mismatch") return SZ_ERROR_DATA; } } @@ -1092,7 +1126,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, return SZ_OK; } - switch (p->state) + switch ((int)p->state) { case XZ_STATE_STREAM_HEADER: { @@ -1107,7 +1141,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } else { - RINOK(Xz_ParseHeader(&p->streamFlags, p->buf)); + RINOK(Xz_ParseHeader(&p->streamFlags, p->buf)) p->numStartedStreams++; p->indexSize = 0; p->numBlocks = 0; @@ -1131,21 +1165,21 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks); p->indexPos = p->indexPreSize; p->indexSize += p->indexPreSize; - Sha256_Final(&p->sha, p->shaDigest); + Sha256_Final(&p->sha, (Byte *)(void *)p->shaDigest32); Sha256_Init(&p->sha); p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize); p->state = XZ_STATE_STREAM_INDEX; break; } - p->blockHeaderSize = ((UInt32)p->buf[0] << 2) + 4; + p->blockHeaderSize = ((unsigned)p->buf[0] << 2) + 4; break; } if (p->pos != p->blockHeaderSize) { - UInt32 cur = p->blockHeaderSize - p->pos; + unsigned cur = p->blockHeaderSize - p->pos; if (cur > srcRem) - cur = (UInt32)srcRem; + cur = (unsigned)srcRem; memcpy(p->buf + p->pos, src, cur); p->pos += cur; (*srcLen) += cur; @@ -1153,7 +1187,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } else { - RINOK(XzBlock_Parse(&p->block, p->buf)); + RINOK(XzBlock_Parse(&p->block, p->buf)) if (!XzBlock_AreSupportedFilters(&p->block)) return SZ_ERROR_UNSUPPORTED; p->numTotalBlocks++; @@ -1166,7 +1200,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, p->headerParsedOk = True; return SZ_OK; } - RINOK(XzDecMix_Init(&p->decoder, &p->block, p->outBuf, p->outBufSize)); + RINOK(XzDecMix_Init(&p->decoder, &p->block, p->outBuf, p->outBufSize)) } break; } @@ -1187,8 +1221,8 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } else { - UInt32 checkSize = XzFlags_GetCheckSize(p->streamFlags); - UInt32 cur = checkSize - p->pos; + const unsigned checkSize = XzFlags_GetCheckSize(p->streamFlags); + unsigned cur = checkSize - p->pos; if (cur != 0) { if (srcRem == 0) @@ -1197,7 +1231,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, return SZ_OK; } if (cur > srcRem) - cur = (UInt32)srcRem; + cur = (unsigned)srcRem; memcpy(p->buf + p->pos, src, cur); p->pos += cur; (*srcLen) += cur; @@ -1206,10 +1240,10 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, break; } { - Byte digest[XZ_CHECK_SIZE_MAX]; + UInt32 digest32[XZ_CHECK_SIZE_MAX / 4]; p->state = XZ_STATE_BLOCK_HEADER; p->pos = 0; - if (XzCheck_Final(&p->check, digest) && memcmp(digest, p->buf, 
checkSize) != 0) + if (XzCheck_Final(&p->check, (void *)digest32) && memcmp(digest32, p->buf, checkSize) != 0) return SZ_ERROR_CRC; if (p->decodeOnlyOneBlock) { @@ -1254,12 +1288,12 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } else { - Byte digest[SHA256_DIGEST_SIZE]; + UInt32 digest32[SHA256_DIGEST_SIZE / 4]; p->state = XZ_STATE_STREAM_INDEX_CRC; p->indexSize += 4; p->pos = 0; - Sha256_Final(&p->sha, digest); - if (memcmp(digest, p->shaDigest, SHA256_DIGEST_SIZE) != 0) + Sha256_Final(&p->sha, (void *)digest32); + if (memcmp(digest32, p->shaDigest32, SHA256_DIGEST_SIZE) != 0) return SZ_ERROR_CRC; } } @@ -1275,9 +1309,10 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } else { + const Byte *ptr = p->buf; p->state = XZ_STATE_STREAM_FOOTER; p->pos = 0; - if (CRC_GET_DIGEST(p->crc) != GetUi32(p->buf)) + if (CRC_GET_DIGEST(p->crc) != GetUi32a(ptr)) return SZ_ERROR_CRC; } break; @@ -1285,9 +1320,9 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, case XZ_STATE_STREAM_FOOTER: { - UInt32 cur = XZ_STREAM_FOOTER_SIZE - p->pos; + unsigned cur = XZ_STREAM_FOOTER_SIZE - p->pos; if (cur > srcRem) - cur = (UInt32)srcRem; + cur = (unsigned)srcRem; memcpy(p->buf + p->pos, src, cur); p->pos += cur; (*srcLen) += cur; @@ -1307,7 +1342,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, { if (*src != 0) { - if (((UInt32)p->padSize & 3) != 0) + if ((unsigned)p->padSize & 3) return SZ_ERROR_NO_ARCHIVE; p->pos = 0; p->state = XZ_STATE_STREAM_HEADER; @@ -1322,6 +1357,8 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } case XZ_STATE_BLOCK: break; /* to disable GCC warning */ + + default: return SZ_ERROR_FAIL; } } /* @@ -1386,7 +1423,7 @@ UInt64 XzUnpacker_GetExtraSize(const CXzUnpacker *p) -#ifndef _7ZIP_ST +#ifndef Z7_ST #include "MtDec.h" #endif @@ -1397,7 +1434,7 @@ void XzDecMtProps_Init(CXzDecMtProps *p) p->outStep_ST = 1 << 20; p->ignoreErrors = False; - #ifndef _7ZIP_ST + #ifndef Z7_ST p->numThreads = 1; p->inBufSize_MT = 1 << 18; p->memUseMax = sizeof(size_t) << 28; @@ -1406,7 +1443,7 @@ void XzDecMtProps_Init(CXzDecMtProps *p) -#ifndef _7ZIP_ST +#ifndef Z7_ST /* ---------- CXzDecMtThread ---------- */ @@ -1445,7 +1482,7 @@ typedef struct /* ---------- CXzDecMt ---------- */ -typedef struct +struct CXzDecMt { CAlignOffsetAlloc alignOffsetAlloc; ISzAllocPtr allocMid; @@ -1453,10 +1490,9 @@ typedef struct CXzDecMtProps props; size_t unpackBlockMaxSize; - ISeqInStream *inStream; - ISeqOutStream *outStream; - ICompressProgress *progress; - // CXzStatInfo *stat; + ISeqInStreamPtr inStream; + ISeqOutStreamPtr outStream; + ICompressProgressPtr progress; BoolInt finishMode; BoolInt outSize_Defined; @@ -1479,7 +1515,7 @@ typedef struct ECoderStatus status; SRes codeRes; - #ifndef _7ZIP_ST + #ifndef Z7_ST BoolInt mainDecoderWasCalled; // int statErrorDefined; int finishedDecoderIndex; @@ -1492,8 +1528,9 @@ typedef struct UInt64 numBlocks; // UInt64 numBadBlocks; - SRes mainErrorCode; - + SRes mainErrorCode; // it's set to error code, if the size Code() output doesn't patch the size from Parsing stage + // it can be = SZ_ERROR_INPUT_EOF + // it can be = SZ_ERROR_DATA, in some another cases BoolInt isBlockHeaderState_Parse; BoolInt isBlockHeaderState_Write; UInt64 outProcessed_Parse; @@ -1501,10 +1538,9 @@ typedef struct BoolInt mtc_WasConstructed; CMtDec mtc; - CXzDecMtThread coders[MTDEC__THREADS_MAX]; + CXzDecMtThread coders[MTDEC_THREADS_MAX]; #endif - -} CXzDecMt; +}; @@ -1532,11 +1568,11 @@ CXzDecMtHandle 
XzDecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid) XzDecMtProps_Init(&p->props); - #ifndef _7ZIP_ST + #ifndef Z7_ST p->mtc_WasConstructed = False; { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CXzDecMtThread *coder = &p->coders[i]; coder->dec_created = False; @@ -1546,16 +1582,16 @@ CXzDecMtHandle XzDecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid) } #endif - return p; + return (CXzDecMtHandle)p; } -#ifndef _7ZIP_ST +#ifndef Z7_ST static void XzDecMt_FreeOutBufs(CXzDecMt *p) { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CXzDecMtThread *coder = &p->coders[i]; if (coder->outBuf) @@ -1592,13 +1628,15 @@ static void XzDecMt_FreeSt(CXzDecMt *p) } -void XzDecMt_Destroy(CXzDecMtHandle pp) +// #define GET_CXzDecMt_p CXzDecMt *p = pp; + +void XzDecMt_Destroy(CXzDecMtHandle p) { - CXzDecMt *p = (CXzDecMt *)pp; + // GET_CXzDecMt_p XzDecMt_FreeSt(p); - #ifndef _7ZIP_ST + #ifndef Z7_ST if (p->mtc_WasConstructed) { @@ -1607,7 +1645,7 @@ void XzDecMt_Destroy(CXzDecMtHandle pp) } { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CXzDecMtThread *t = &p->coders[i]; if (t->dec_created) @@ -1622,12 +1660,12 @@ void XzDecMt_Destroy(CXzDecMtHandle pp) #endif - ISzAlloc_Free(p->alignOffsetAlloc.baseAlloc, pp); + ISzAlloc_Free(p->alignOffsetAlloc.baseAlloc, p); } -#ifndef _7ZIP_ST +#ifndef Z7_ST static void XzDecMt_Callback_Parse(void *obj, unsigned coderIndex, CMtDecCallbackInfo *cc) { @@ -1693,7 +1731,7 @@ static void XzDecMt_Callback_Parse(void *obj, unsigned coderIndex, CMtDecCallbac coder->dec.parseMode = True; coder->dec.headerParsedOk = False; - PRF_STR_INT("Parse", srcSize2); + PRF_STR_INT("Parse", srcSize2) res = XzUnpacker_Code(&coder->dec, NULL, &destSize, @@ -1736,10 +1774,10 @@ static void XzDecMt_Callback_Parse(void *obj, unsigned coderIndex, CMtDecCallbac } } { - UInt64 packSize = block->packSize; - UInt64 packSizeAligned = packSize + ((0 - (unsigned)packSize) & 3); - UInt32 checkSize = XzFlags_GetCheckSize(coder->dec.streamFlags); - UInt64 blockPackSum = coder->inPreSize + packSizeAligned + checkSize; + const UInt64 packSize = block->packSize; + const UInt64 packSizeAligned = packSize + ((0 - (unsigned)packSize) & 3); + const unsigned checkSize = XzFlags_GetCheckSize(coder->dec.streamFlags); + const UInt64 blockPackSum = coder->inPreSize + packSizeAligned + checkSize; // if (blockPackSum <= me->props.inBlockMax) // unpackBlockMaxSize { @@ -1877,7 +1915,7 @@ static SRes XzDecMt_Callback_PreCode(void *pp, unsigned coderIndex) { // if (res == SZ_ERROR_MEM) return res; if (me->props.ignoreErrors && res != SZ_ERROR_MEM) - return S_OK; + return SZ_OK; return res; } } @@ -1898,15 +1936,18 @@ static SRes XzDecMt_Callback_Code(void *pp, unsigned coderIndex, *outCodePos = coder->outCodeSize; *stop = True; + if (srcSize > coder->inPreSize - coder->inCodeSize) + return SZ_ERROR_FAIL; + if (coder->inCodeSize < coder->inPreHeaderSize) { - UInt64 rem = coder->inPreHeaderSize - coder->inCodeSize; - size_t step = srcSize; - if (step > rem) - step = (size_t)rem; + size_t step = coder->inPreHeaderSize - coder->inCodeSize; + if (step > srcSize) + step = srcSize; src += step; srcSize -= step; coder->inCodeSize += step; + *inCodePos = coder->inCodeSize; if (coder->inCodeSize < coder->inPreHeaderSize) { *stop = False; @@ -1956,7 +1997,7 @@ static SRes XzDecMt_Callback_Code(void *pp, unsigned coderIndex, { *inCodePos = coder->inPreSize; *outCodePos = 
coder->outPreSize; - return S_OK; + return SZ_OK; } return coder->codeRes; } @@ -1966,7 +2007,7 @@ static SRes XzDecMt_Callback_Code(void *pp, unsigned coderIndex, static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, BoolInt needWriteToStream, - const Byte *src, size_t srcSize, + const Byte *src, size_t srcSize, BoolInt isCross, // int srcFinished, BoolInt *needContinue, BoolInt *canRecode) @@ -1985,7 +2026,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, if (!coder->dec.headerParsedOk || !coder->outBuf) { if (me->finishedDecoderIndex < 0) - me->finishedDecoderIndex = coderIndex; + me->finishedDecoderIndex = (int)coderIndex; return SZ_OK; } @@ -2065,7 +2106,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, } data += cur; size -= cur; - // PRF_STR_INT("Written size =", size); + // PRF_STR_INT("Written size =", size) if (size == 0) break; res = MtProgress_ProgressAdd(&me->mtc.mtProgress, 0, 0); @@ -2077,16 +2118,16 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, if (coder->codeRes != SZ_OK) if (!me->props.ignoreErrors) { - me->finishedDecoderIndex = coderIndex; + me->finishedDecoderIndex = (int)coderIndex; return res; } - RINOK(res); + RINOK(res) if (coder->inPreSize != coder->inCodeSize || coder->blockPackTotal != coder->inCodeSize) { - me->finishedDecoderIndex = coderIndex; + me->finishedDecoderIndex = (int)coderIndex; return SZ_OK; } @@ -2100,13 +2141,13 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, // (coder->state == MTDEC_PARSE_END) means that there are no other working threads // so we can use mtc variables without lock - PRF_STR_INT("Write MTDEC_PARSE_END", me->mtc.inProcessed); + PRF_STR_INT("Write MTDEC_PARSE_END", me->mtc.inProcessed) me->mtc.mtProgress.totalInSize = me->mtc.inProcessed; { CXzUnpacker *dec = &me->dec; - PRF_STR_INT("PostSingle", srcSize); + PRF_STR_INT("PostSingle", srcSize) { size_t srcProcessed = srcSize; @@ -2125,22 +2166,41 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, return SZ_OK; } + /* + We have processed all xz-blocks of stream, + And xz unpacker is at XZ_STATE_BLOCK_HEADER state, where + (src) is a pointer to xz-Index structure. + We finish reading of current xz-Stream, including Zero padding after xz-Stream. + We exit, if we reach extra byte (first byte of new-Stream or another data). + But we don't update input stream pointer for that new extra byte. + If extra byte is not correct first byte of xz-signature, + we have SZ_ERROR_NO_ARCHIVE error here. 
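A side note on the zero padding mentioned in this comment: xz rounds both block data and stream padding up to a 4-byte boundary, and the two arithmetic forms used in this diff (the (0 - packSize) & 3 expression in the MT parser and the XZ_GET_PAD_SIZE() macro in XzEnc.c) are equivalent. A tiny self-contained sketch, not part of the patch:

/* both forms return the number of zero bytes needed to reach 4-byte alignment,
   e.g. 13 -> 3 and 16 -> 0 */
static unsigned XzPad4_sketch(unsigned dataSize)
{
  const unsigned padA = (0u - dataSize) & 3;        /* form used in XzDec.c */
  const unsigned padB = (4 - (dataSize & 3)) & 3;   /* XZ_GET_PAD_SIZE() form */
  return (padA == padB) ? padA : (unsigned)-1;      /* always equal */
}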
+ */ + res = XzUnpacker_Code(dec, NULL, &outSizeCur, src, &srcProcessed, me->mtc.readWasFinished, // srcFinished CODER_FINISH_END, // CODER_FINISH_ANY, &status); + + // res = SZ_ERROR_ARCHIVE; // for failure test me->status = status; me->codeRes = res; + if (isCross) + me->mtc.crossStart += srcProcessed; + me->mtc.inProcessed += srcProcessed; me->mtc.mtProgress.totalInSize = me->mtc.inProcessed; + srcSize -= srcProcessed; + src += srcProcessed; + if (res != SZ_OK) { - return S_OK; + return SZ_OK; // return res; } @@ -2149,20 +2209,26 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, *needContinue = True; me->isBlockHeaderState_Parse = False; me->isBlockHeaderState_Write = False; + + if (!isCross) { Byte *crossBuf = MtDec_GetCrossBuff(&me->mtc); if (!crossBuf) return SZ_ERROR_MEM; - memcpy(crossBuf, src + srcProcessed, srcSize - srcProcessed); + if (srcSize != 0) + memcpy(crossBuf, src, srcSize); + me->mtc.crossStart = 0; + me->mtc.crossEnd = srcSize; } - me->mtc.crossStart = 0; - me->mtc.crossEnd = srcSize - srcProcessed; + + PRF_STR_INT("XZ_STATE_STREAM_HEADER crossEnd = ", (unsigned)me->mtc.crossEnd) + return SZ_OK; } - if (status != CODER_STATUS_NEEDS_MORE_INPUT) + if (status != CODER_STATUS_NEEDS_MORE_INPUT || srcSize != 0) { - return E_FAIL; + return SZ_ERROR_FAIL; } if (me->mtc.readWasFinished) @@ -2174,7 +2240,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, { size_t inPos; size_t inLim; - const Byte *inData; + // const Byte *inData; UInt64 inProgressPrev = me->mtc.inProcessed; // XzDecMt_Prepare_InBuf_ST(p); @@ -2184,9 +2250,8 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, inPos = 0; inLim = 0; - // outProcessed = 0; - inData = crossBuf; + // inData = crossBuf; for (;;) { @@ -2201,7 +2266,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, { inPos = 0; inLim = me->mtc.inBufSize; - me->mtc.readRes = ISeqInStream_Read(me->inStream, (void *)inData, &inLim); + me->mtc.readRes = ISeqInStream_Read(me->inStream, (void *)crossBuf, &inLim); me->mtc.readProcessed += inLim; if (inLim == 0 || me->mtc.readRes != SZ_OK) me->mtc.readWasFinished = True; @@ -2213,7 +2278,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, res = XzUnpacker_Code(dec, NULL, &outProcessed, - inData + inPos, &inProcessed, + crossBuf + inPos, &inProcessed, (inProcessed == 0), // srcFinished CODER_FINISH_END, &status); @@ -2225,7 +2290,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, if (res != SZ_OK) { - return S_OK; + return SZ_OK; // return res; } @@ -2240,14 +2305,14 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, } if (status != CODER_STATUS_NEEDS_MORE_INPUT) - return E_FAIL; + return SZ_ERROR_FAIL; if (me->mtc.progress) { UInt64 inDelta = me->mtc.inProcessed - inProgressPrev; if (inDelta >= (1 << 22)) { - RINOK(MtProgress_Progress_ST(&me->mtc.mtProgress)); + RINOK(MtProgress_Progress_ST(&me->mtc.mtProgress)) inProgressPrev = me->mtc.inProcessed; } } @@ -2276,13 +2341,6 @@ void XzStatInfo_Clear(CXzStatInfo *p) p->NumStreams_Defined = False; p->NumBlocks_Defined = False; - // p->IsArc = False; - // p->UnexpectedEnd = False; - // p->Unsupported = False; - // p->HeadersError = False; - // p->DataError = False; - // p->CrcError = False; - p->DataAfterEnd = False; p->DecodingTruncated = False; @@ -2296,9 +2354,19 @@ void XzStatInfo_Clear(CXzStatInfo *p) +/* + XzDecMt_Decode_ST() can return SZ_OK or the following errors + - SZ_ERROR_MEM for memory allocation error + - error from 
XzUnpacker_Code() function + - SZ_ERROR_WRITE for ISeqOutStream::Write(). stat->CombinedRes_Type = SZ_ERROR_WRITE in that case + - ICompressProgress::Progress() error, stat->CombinedRes_Type = SZ_ERROR_PROGRESS. + But XzDecMt_Decode_ST() doesn't return ISeqInStream::Read() errors. + ISeqInStream::Read() result is set to p->readRes. + also it can set stat->CombinedRes_Type to SZ_ERROR_WRITE or SZ_ERROR_PROGRESS. +*/ static SRes XzDecMt_Decode_ST(CXzDecMt *p - #ifndef _7ZIP_ST + #ifndef Z7_ST , BoolInt tMode #endif , CXzStatInfo *stat) @@ -2310,11 +2378,11 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p CXzUnpacker *dec; - #ifndef _7ZIP_ST + #ifndef Z7_ST if (tMode) { XzDecMt_FreeOutBufs(p); - tMode = MtDec_PrepareRead(&p->mtc); + tMode = (BoolInt)MtDec_PrepareRead(&p->mtc); } #endif @@ -2367,7 +2435,7 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p if (inPos == inLim) { - #ifndef _7ZIP_ST + #ifndef Z7_ST if (tMode) { inData = MtDec_Read(&p->mtc, &inLim); @@ -2384,7 +2452,7 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p inPos = 0; inLim = p->inBufSize; inData = p->inBuf; - p->readRes = ISeqInStream_Read(p->inStream, (void *)inData, &inLim); + p->readRes = ISeqInStream_Read(p->inStream, (void *)p->inBuf, &inLim); p->readProcessed += inLim; if (inLim == 0 || p->readRes != SZ_OK) p->readWasFinished = True; @@ -2426,8 +2494,8 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p if (finished || outProcessed >= outSize) if (outPos != 0) { - size_t written = ISeqOutStream_Write(p->outStream, p->outBuf, outPos); - p->outProcessed += written; + const size_t written = ISeqOutStream_Write(p->outStream, p->outBuf, outPos); + // p->outProcessed += written; // 21.01: BUG fixed if (written != outPos) { stat->CombinedRes_Type = SZ_ERROR_WRITE; @@ -2438,9 +2506,8 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p if (p->progress && res == SZ_OK) { - UInt64 inDelta = p->inProcessed - inPrev; - UInt64 outDelta = p->outProcessed - outPrev; - if (inDelta >= (1 << 22) || outDelta >= (1 << 22)) + if (p->inProcessed - inPrev >= (1 << 22) || + p->outProcessed - outPrev >= (1 << 22)) { res = ICompressProgress_Progress(p->progress, p->inProcessed, p->outProcessed); if (res != SZ_OK) @@ -2455,14 +2522,31 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p } if (finished) - return res; + { + // p->codeRes is preliminary error from XzUnpacker_Code. + // and it can be corrected later as final result + // so we return SZ_OK here instead of (res); + return SZ_OK; + // return res; + } } } -static SRes XzStatInfo_SetStat(const CXzUnpacker *dec, + + +/* +XzStatInfo_SetStat() transforms + CXzUnpacker return code and status to combined CXzStatInfo results. 
+ it can convert SZ_OK to SZ_ERROR_INPUT_EOF + it can convert SZ_ERROR_NO_ARCHIVE to SZ_OK and (DataAfterEnd = 1) +*/ + +static void XzStatInfo_SetStat(const CXzUnpacker *dec, int finishMode, - UInt64 readProcessed, UInt64 inProcessed, - SRes res, ECoderStatus status, + // UInt64 readProcessed, + UInt64 inProcessed, + SRes res, // it's result from CXzUnpacker unpacker + ECoderStatus status, BoolInt decodingTruncated, CXzStatInfo *stat) { @@ -2484,12 +2568,20 @@ static SRes XzStatInfo_SetStat(const CXzUnpacker *dec, if (status == CODER_STATUS_NEEDS_MORE_INPUT) { // CODER_STATUS_NEEDS_MORE_INPUT is expected status for correct xz streams + // any extra data is part of correct data extraSize = 0; + // if xz stream was not finished, then we need more data if (!XzUnpacker_IsStreamWasFinished(dec)) res = SZ_ERROR_INPUT_EOF; } - else if (!decodingTruncated || finishMode) // (status == CODER_STATUS_NOT_FINISHED) - res = SZ_ERROR_DATA; + else + { + // CODER_STATUS_FINISHED_WITH_MARK is not possible for multi stream xz decoding + // so he we have (status == CODER_STATUS_NOT_FINISHED) + // if (status != CODER_STATUS_FINISHED_WITH_MARK) + if (!decodingTruncated || finishMode) + res = SZ_ERROR_DATA; + } } else if (res == SZ_ERROR_NO_ARCHIVE) { @@ -2497,37 +2589,42 @@ static SRes XzStatInfo_SetStat(const CXzUnpacker *dec, SZ_ERROR_NO_ARCHIVE is possible for 2 states: XZ_STATE_STREAM_HEADER - if bad signature or bad CRC XZ_STATE_STREAM_PADDING - if non-zero padding data - extraSize / inProcessed don't include "bad" byte + extraSize and inProcessed don't include "bad" byte */ - if (inProcessed != extraSize) // if good streams before error - if (extraSize != 0 || readProcessed != inProcessed) + // if (inProcessed == extraSize), there was no any good xz stream header, and we keep error + if (inProcessed != extraSize) // if there were good xz streams before error + { + // if (extraSize != 0 || readProcessed != inProcessed) { + // he we suppose that all xz streams were finsihed OK, and we have + // some extra data after all streams stat->DataAfterEnd = True; - // there is some good xz stream before. 
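To make the result-combination rules described here concrete, a caller-side interpretation might look like the following sketch (XzStat_Describe_sketch is not an SDK function; the mapping simply follows the field comments on CXzStatInfo above):

static const char *XzStat_Describe_sketch(const CXzStatInfo *stat)
{
  if (stat->CombinedRes != SZ_OK)
  {
    if (stat->CombinedRes_Type == SZ_ERROR_READ)     return "input read error";
    if (stat->CombinedRes_Type == SZ_ERROR_WRITE)    return "output write error";
    if (stat->CombinedRes_Type == SZ_ERROR_PROGRESS) return "aborted by progress callback";
    return "xz decoding error";
  }
  if (stat->DataAfterEnd)
    return "ok, but unexpected data follows the last xz stream";
  return "ok";
}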
So we set SZ_OK res = SZ_OK; } + } } - stat->DecodeRes = res; + if (stat->DecodeRes == SZ_OK) + stat->DecodeRes = res; stat->InSize -= extraSize; - return res; } -SRes XzDecMt_Decode(CXzDecMtHandle pp, + +SRes XzDecMt_Decode(CXzDecMtHandle p, const CXzDecMtProps *props, const UInt64 *outDataSize, int finishMode, - ISeqOutStream *outStream, + ISeqOutStreamPtr outStream, // Byte *outBuf, size_t *outBufSize, - ISeqInStream *inStream, + ISeqInStreamPtr inStream, // const Byte *inData, size_t inDataSize, CXzStatInfo *stat, int *isMT, - ICompressProgress *progress) + ICompressProgressPtr progress) { - CXzDecMt *p = (CXzDecMt *)pp; - #ifndef _7ZIP_ST + // GET_CXzDecMt_p + #ifndef Z7_ST BoolInt tMode; #endif @@ -2548,7 +2645,7 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, p->outSize = *outDataSize; } - p->finishMode = finishMode; + p->finishMode = (BoolInt)finishMode; // p->outSize = 457; p->outSize_Defined = True; p->finishMode = False; // for test @@ -2557,8 +2654,9 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, p->inProcessed = 0; p->readProcessed = 0; p->readWasFinished = False; + p->readRes = SZ_OK; - p->codeRes = 0; + p->codeRes = SZ_OK; p->status = CODER_STATUS_NOT_SPECIFIED; XzUnpacker_Init(&p->dec); @@ -2577,7 +2675,7 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, */ - #ifndef _7ZIP_ST + #ifndef Z7_ST p->isBlockHeaderState_Parse = False; p->isBlockHeaderState_Write = False; @@ -2589,8 +2687,9 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, if (p->props.numThreads > 1) { - IMtDecCallback vt; - + IMtDecCallback2 vt; + BoolInt needContinue; + SRes res; // we just free ST buffers here // but we still keep state variables, that was set in XzUnpacker_Init() XzDecMt_FreeSt(p); @@ -2628,45 +2727,45 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, vt.Code = XzDecMt_Callback_Code; vt.Write = XzDecMt_Callback_Write; - { - BoolInt needContinue; - - SRes res = MtDec_Code(&p->mtc); - - stat->InSize = p->mtc.inProcessed; - p->inProcessed = p->mtc.inProcessed; - p->readRes = p->mtc.readRes; - p->readWasFinished = p->mtc.readWasFinished; - p->readProcessed = p->mtc.readProcessed; + res = MtDec_Code(&p->mtc); - tMode = True; - needContinue = False; - if (res == SZ_OK) + stat->InSize = p->mtc.inProcessed; + + p->inProcessed = p->mtc.inProcessed; + p->readRes = p->mtc.readRes; + p->readWasFinished = p->mtc.readWasFinished; + p->readProcessed = p->mtc.readProcessed; + + tMode = True; + needContinue = False; + + if (res == SZ_OK) + { + if (p->mtc.mtProgress.res != SZ_OK) { - if (p->mtc.mtProgress.res != SZ_OK) - { - res = p->mtc.mtProgress.res; - stat->ProgressRes = res; - stat->CombinedRes_Type = SZ_ERROR_PROGRESS; - } - else - needContinue = p->mtc.needContinue; + res = p->mtc.mtProgress.res; + stat->ProgressRes = res; + stat->CombinedRes_Type = SZ_ERROR_PROGRESS; } - - if (!needContinue) + else + needContinue = p->mtc.needContinue; + } + + if (!needContinue) + { { SRes codeRes; BoolInt truncated = False; ECoderStatus status; - CXzUnpacker *dec; + const CXzUnpacker *dec; stat->OutSize = p->outProcessed; if (p->finishedDecoderIndex >= 0) { - CXzDecMtThread *coder = &p->coders[(unsigned)p->finishedDecoderIndex]; + const CXzDecMtThread *coder = &p->coders[(unsigned)p->finishedDecoderIndex]; codeRes = coder->codeRes; dec = &coder->dec; status = coder->status; @@ -2679,41 +2778,46 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, truncated = p->parsing_Truncated; } else - return E_FAIL; + return SZ_ERROR_FAIL; + + if (p->mainErrorCode != SZ_OK) + stat->DecodeRes = p->mainErrorCode; XzStatInfo_SetStat(dec, p->finishMode, - p->mtc.readProcessed, 
p->mtc.inProcessed, + // p->mtc.readProcessed, + p->mtc.inProcessed, codeRes, status, truncated, stat); + } + + if (res == SZ_OK) + { + stat->ReadRes = p->mtc.readRes; - if (res == SZ_OK) + if (p->writeRes != SZ_OK) { - if (p->writeRes != SZ_OK) - { - res = p->writeRes; - stat->CombinedRes_Type = SZ_ERROR_WRITE; - } - else if (p->mtc.readRes != SZ_OK && p->mtc.inProcessed == p->mtc.readProcessed) - { - res = p->mtc.readRes; - stat->ReadRes = res; - stat->CombinedRes_Type = SZ_ERROR_READ; - } - else if (p->mainErrorCode != SZ_OK) - { - res = p->mainErrorCode; - } + res = p->writeRes; + stat->CombinedRes_Type = SZ_ERROR_WRITE; } - - stat->CombinedRes = res; - if (stat->CombinedRes_Type == SZ_OK) - stat->CombinedRes_Type = res; - return res; + else if (p->mtc.readRes != SZ_OK + // && p->mtc.inProcessed == p->mtc.readProcessed + && stat->DecodeRes == SZ_ERROR_INPUT_EOF) + { + res = p->mtc.readRes; + stat->CombinedRes_Type = SZ_ERROR_READ; + } + else if (stat->DecodeRes != SZ_OK) + res = stat->DecodeRes; } - - PRF_STR("----- decoding ST -----"); + + stat->CombinedRes = res; + if (stat->CombinedRes_Type == SZ_OK) + stat->CombinedRes_Type = res; + return res; } + + PRF_STR("----- decoding ST -----") } #endif @@ -2723,39 +2827,41 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, { SRes res = XzDecMt_Decode_ST(p - #ifndef _7ZIP_ST + #ifndef Z7_ST , tMode #endif , stat ); + #ifndef Z7_ST + // we must set error code from MT decoding at first + if (p->mainErrorCode != SZ_OK) + stat->DecodeRes = p->mainErrorCode; + #endif + XzStatInfo_SetStat(&p->dec, p->finishMode, - p->readProcessed, p->inProcessed, + // p->readProcessed, + p->inProcessed, p->codeRes, p->status, False, // truncated stat); + stat->ReadRes = p->readRes; + if (res == SZ_OK) { - /* - if (p->writeRes != SZ_OK) - { - res = p->writeRes; - stat->CombinedRes_Type = SZ_ERROR_WRITE; - } - else - */ - if (p->readRes != SZ_OK && p->inProcessed == p->readProcessed) + if (p->readRes != SZ_OK + // && p->inProcessed == p->readProcessed + && stat->DecodeRes == SZ_ERROR_INPUT_EOF) { + // we set read error as combined error, only if that error was the reason + // of decoding problem res = p->readRes; - stat->ReadRes = res; stat->CombinedRes_Type = SZ_ERROR_READ; } - #ifndef _7ZIP_ST - else if (p->mainErrorCode != SZ_OK) - res = p->mainErrorCode; - #endif + else if (stat->DecodeRes != SZ_OK) + res = stat->DecodeRes; } stat->CombinedRes = res; @@ -2764,3 +2870,7 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, return res; } } + +#undef PRF +#undef PRF_STR +#undef PRF_STR_INT_2 diff --git a/src/sdk/C/XzEnc.c b/src/sdk/C/XzEnc.c index d0a8b44..e40f0c8 100644 --- a/src/sdk/C/XzEnc.c +++ b/src/sdk/C/XzEnc.c @@ -1,5 +1,5 @@ /* XzEnc.c -- Xz Encode -2019-02-02 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -18,42 +18,43 @@ #include "XzEnc.h" -// #define _7ZIP_ST +// #define Z7_ST -#ifndef _7ZIP_ST +#ifndef Z7_ST #include "MtCoder.h" #else -#define MTCODER__THREADS_MAX 1 -#define MTCODER__BLOCKS_MAX 1 +#define MTCODER_THREADS_MAX 1 +#define MTCODER_BLOCKS_MAX 1 #endif #define XZ_GET_PAD_SIZE(dataSize) ((4 - ((unsigned)(dataSize) & 3)) & 3) -/* max pack size for LZMA2 block + check-64bytrs: */ -#define XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize) ((unpackSize) + ((unpackSize) >> 10) + 16 + 64) +#define XZ_CHECK_SIZE_MAX 64 +/* max pack size for LZMA2 block + pad4 + check_size: */ +#define XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize) ((unpackSize) + ((unpackSize) >> 10) + 16 + XZ_CHECK_SIZE_MAX) #define 
XZ_GET_ESTIMATED_BLOCK_TOTAL_PACK_SIZE(unpackSize) (XZ_BLOCK_HEADER_SIZE_MAX + XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize)) -#define XzBlock_ClearFlags(p) (p)->flags = 0; -#define XzBlock_SetNumFilters(p, n) (p)->flags |= ((n) - 1); +// #define XzBlock_ClearFlags(p) (p)->flags = 0; +#define XzBlock_ClearFlags_SetNumFilters(p, n) (p)->flags = (Byte)((n) - 1); #define XzBlock_SetHasPackSize(p) (p)->flags |= XZ_BF_PACK_SIZE; #define XzBlock_SetHasUnpackSize(p) (p)->flags |= XZ_BF_UNPACK_SIZE; -static SRes WriteBytes(ISeqOutStream *s, const void *buf, size_t size) +static SRes WriteBytes(ISeqOutStreamPtr s, const void *buf, size_t size) { return (ISeqOutStream_Write(s, buf, size) == size) ? SZ_OK : SZ_ERROR_WRITE; } -static SRes WriteBytesUpdateCrc(ISeqOutStream *s, const void *buf, size_t size, UInt32 *crc) +static SRes WriteBytes_UpdateCrc(ISeqOutStreamPtr s, const void *buf, size_t size, UInt32 *crc) { *crc = CrcUpdate(*crc, buf, size); return WriteBytes(s, buf, size); } -static SRes Xz_WriteHeader(CXzStreamFlags f, ISeqOutStream *s) +static SRes Xz_WriteHeader(CXzStreamFlags f, ISeqOutStreamPtr s) { UInt32 crc; Byte header[XZ_STREAM_HEADER_SIZE]; @@ -61,12 +62,12 @@ static SRes Xz_WriteHeader(CXzStreamFlags f, ISeqOutStream *s) header[XZ_SIG_SIZE] = (Byte)(f >> 8); header[XZ_SIG_SIZE + 1] = (Byte)(f & 0xFF); crc = CrcCalc(header + XZ_SIG_SIZE, XZ_STREAM_FLAGS_SIZE); - SetUi32(header + XZ_SIG_SIZE + XZ_STREAM_FLAGS_SIZE, crc); + SetUi32(header + XZ_SIG_SIZE + XZ_STREAM_FLAGS_SIZE, crc) return WriteBytes(s, header, XZ_STREAM_HEADER_SIZE); } -static SRes XzBlock_WriteHeader(const CXzBlock *p, ISeqOutStream *s) +static SRes XzBlock_WriteHeader(const CXzBlock *p, ISeqOutStreamPtr s) { Byte header[XZ_BLOCK_HEADER_SIZE_MAX]; @@ -91,7 +92,7 @@ static SRes XzBlock_WriteHeader(const CXzBlock *p, ISeqOutStream *s) header[pos++] = 0; header[0] = (Byte)(pos >> 2); - SetUi32(header + pos, CrcCalc(header, pos)); + SetUi32(header + pos, CrcCalc(header, pos)) return WriteBytes(s, header, pos + 4); } @@ -182,7 +183,7 @@ static SRes XzEncIndex_AddIndexRecord(CXzEncIndex *p, UInt64 unpackSize, UInt64 size_t newSize = p->allocated * 2 + 16 * 2; if (newSize < p->size + pos) return SZ_ERROR_MEM; - RINOK(XzEncIndex_ReAlloc(p, newSize, alloc)); + RINOK(XzEncIndex_ReAlloc(p, newSize, alloc)) } memcpy(p->blocks + p->size, buf, pos); p->size += pos; @@ -191,7 +192,7 @@ static SRes XzEncIndex_AddIndexRecord(CXzEncIndex *p, UInt64 unpackSize, UInt64 } -static SRes XzEncIndex_WriteFooter(const CXzEncIndex *p, CXzStreamFlags flags, ISeqOutStream *s) +static SRes XzEncIndex_WriteFooter(const CXzEncIndex *p, CXzStreamFlags flags, ISeqOutStreamPtr s) { Byte buf[32]; UInt64 globalPos; @@ -200,8 +201,8 @@ static SRes XzEncIndex_WriteFooter(const CXzEncIndex *p, CXzStreamFlags flags, I globalPos = pos; buf[0] = 0; - RINOK(WriteBytesUpdateCrc(s, buf, pos, &crc)); - RINOK(WriteBytesUpdateCrc(s, p->blocks, p->size, &crc)); + RINOK(WriteBytes_UpdateCrc(s, buf, pos, &crc)) + RINOK(WriteBytes_UpdateCrc(s, p->blocks, p->size, &crc)) globalPos += p->size; pos = XZ_GET_PAD_SIZE(globalPos); @@ -211,12 +212,12 @@ static SRes XzEncIndex_WriteFooter(const CXzEncIndex *p, CXzStreamFlags flags, I globalPos += pos; crc = CrcUpdate(crc, buf + 4 - pos, pos); - SetUi32(buf + 4, CRC_GET_DIGEST(crc)); + SetUi32(buf + 4, CRC_GET_DIGEST(crc)) - SetUi32(buf + 8 + 4, (UInt32)(globalPos >> 2)); + SetUi32(buf + 8 + 4, (UInt32)(globalPos >> 2)) buf[8 + 8] = (Byte)(flags >> 8); buf[8 + 9] = (Byte)(flags & 0xFF); - SetUi32(buf + 8, CrcCalc(buf + 8 + 4, 6)); + 
SetUi32(buf + 8, CrcCalc(buf + 8 + 4, 6)) buf[8 + 10] = XZ_FOOTER_SIG_0; buf[8 + 11] = XZ_FOOTER_SIG_1; @@ -230,7 +231,7 @@ static SRes XzEncIndex_WriteFooter(const CXzEncIndex *p, CXzStreamFlags flags, I typedef struct { ISeqInStream vt; - ISeqInStream *realStream; + ISeqInStreamPtr realStream; const Byte *data; UInt64 limit; UInt64 processed; @@ -251,9 +252,9 @@ static void SeqCheckInStream_GetDigest(CSeqCheckInStream *p, Byte *digest) XzCheck_Final(&p->check, digest); } -static SRes SeqCheckInStream_Read(const ISeqInStream *pp, void *data, size_t *size) +static SRes SeqCheckInStream_Read(ISeqInStreamPtr pp, void *data, size_t *size) { - CSeqCheckInStream *p = CONTAINER_FROM_VTBL(pp, CSeqCheckInStream, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CSeqCheckInStream) size_t size2 = *size; SRes res = SZ_OK; @@ -285,15 +286,15 @@ static SRes SeqCheckInStream_Read(const ISeqInStream *pp, void *data, size_t *si typedef struct { ISeqOutStream vt; - ISeqOutStream *realStream; + ISeqOutStreamPtr realStream; Byte *outBuf; size_t outBufLimit; UInt64 processed; } CSeqSizeOutStream; -static size_t SeqSizeOutStream_Write(const ISeqOutStream *pp, const void *data, size_t size) +static size_t SeqSizeOutStream_Write(ISeqOutStreamPtr pp, const void *data, size_t size) { - CSeqSizeOutStream *p = CONTAINER_FROM_VTBL(pp, CSeqSizeOutStream, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CSeqSizeOutStream) if (p->realStream) size = ISeqOutStream_Write(p->realStream, data, size); else @@ -313,8 +314,8 @@ static size_t SeqSizeOutStream_Write(const ISeqOutStream *pp, const void *data, typedef struct { - ISeqInStream p; - ISeqInStream *realStream; + ISeqInStream vt; + ISeqInStreamPtr realStream; IStateCoder StateCoder; Byte *buf; size_t curPos; @@ -323,7 +324,40 @@ typedef struct } CSeqInFilter; -SRes BraState_SetFromMethod(IStateCoder *p, UInt64 id, int encodeMode, ISzAllocPtr alloc); +static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Enc[] = +{ + Z7_BRANCH_CONV_ENC_2 (BranchConv_PPC), + Z7_BRANCH_CONV_ENC_2 (BranchConv_IA64), + Z7_BRANCH_CONV_ENC_2 (BranchConv_ARM), + Z7_BRANCH_CONV_ENC_2 (BranchConv_ARMT), + Z7_BRANCH_CONV_ENC_2 (BranchConv_SPARC), + Z7_BRANCH_CONV_ENC_2 (BranchConv_ARM64), + Z7_BRANCH_CONV_ENC_2 (BranchConv_RISCV) +}; + +static SizeT XzBcFilterStateBase_Filter_Enc(CXzBcFilterStateBase *p, Byte *data, SizeT size) +{ + switch (p->methodId) + { + case XZ_ID_Delta: + Delta_Encode(p->delta_State, p->delta, data, size); + break; + case XZ_ID_X86: + size = (SizeT)(z7_BranchConvSt_X86_Enc(data, size, p->ip, &p->X86_State) - data); + break; + default: + if (p->methodId >= XZ_ID_PPC) + { + const UInt32 i = p->methodId - XZ_ID_PPC; + if (i < Z7_ARRAY_SIZE(g_Funcs_BranchConv_RISC_Enc)) + size = (SizeT)(g_Funcs_BranchConv_RISC_Enc[i](data, size, p->ip) - data); + } + break; + } + p->ip += (UInt32)size; + return size; +} + static SRes SeqInFilter_Init(CSeqInFilter *p, const CXzFilter *props, ISzAllocPtr alloc) { @@ -335,17 +369,17 @@ static SRes SeqInFilter_Init(CSeqInFilter *p, const CXzFilter *props, ISzAllocPt } p->curPos = p->endPos = 0; p->srcWasFinished = 0; - RINOK(BraState_SetFromMethod(&p->StateCoder, props->id, 1, alloc)); - RINOK(p->StateCoder.SetProps(p->StateCoder.p, props->props, props->propsSize, alloc)); + RINOK(Xz_StateCoder_Bc_SetFromMethod_Func(&p->StateCoder, props->id, XzBcFilterStateBase_Filter_Enc, alloc)) + RINOK(p->StateCoder.SetProps(p->StateCoder.p, props->props, props->propsSize, alloc)) p->StateCoder.Init(p->StateCoder.p); return SZ_OK; } -static SRes 
SeqInFilter_Read(const ISeqInStream *pp, void *data, size_t *size) +static SRes SeqInFilter_Read(ISeqInStreamPtr pp, void *data, size_t *size) { - CSeqInFilter *p = CONTAINER_FROM_VTBL(pp, CSeqInFilter, p); - size_t sizeOriginal = *size; + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CSeqInFilter) + const size_t sizeOriginal = *size; if (sizeOriginal == 0) return SZ_OK; *size = 0; @@ -356,7 +390,7 @@ static SRes SeqInFilter_Read(const ISeqInStream *pp, void *data, size_t *size) { p->curPos = 0; p->endPos = FILTER_BUF_SIZE; - RINOK(ISeqInStream_Read(p->realStream, p->buf, &p->endPos)); + RINOK(ISeqInStream_Read(p->realStream, p->buf, &p->endPos)) if (p->endPos == 0) p->srcWasFinished = 1; } @@ -377,13 +411,15 @@ static SRes SeqInFilter_Read(const ISeqInStream *pp, void *data, size_t *size) } } +Z7_FORCE_INLINE static void SeqInFilter_Construct(CSeqInFilter *p) { p->buf = NULL; p->StateCoder.p = NULL; - p->p.Read = SeqInFilter_Read; + p->vt.Read = SeqInFilter_Read; } +Z7_FORCE_INLINE static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc) { if (p->StateCoder.p) @@ -406,13 +442,13 @@ static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc) typedef struct { ISeqInStream vt; - ISeqInStream *inStream; + ISeqInStreamPtr inStream; CSbEnc enc; } CSbEncInStream; -static SRes SbEncInStream_Read(const ISeqInStream *pp, void *data, size_t *size) +static SRes SbEncInStream_Read(ISeqInStreamPtr pp, void *data, size_t *size) { - CSbEncInStream *p = CONTAINER_FROM_VTBL(pp, CSbEncInStream, vt); + CSbEncInStream *p = Z7_CONTAINER_FROM_VTBL(pp, CSbEncInStream, vt); size_t sizeOriginal = *size; if (sizeOriginal == 0) return SZ_OK; @@ -422,7 +458,7 @@ static SRes SbEncInStream_Read(const ISeqInStream *pp, void *data, size_t *size) if (p->enc.needRead && !p->enc.readWasFinished) { size_t processed = p->enc.needReadSizeMax; - RINOK(p->inStream->Read(p->inStream, p->enc.buf + p->enc.readPos, &processed)); + RINOK(p->inStream->Read(p->inStream, p->enc.buf + p->enc.readPos, &processed)) p->enc.readPos += processed; if (processed == 0) { @@ -433,7 +469,7 @@ static SRes SbEncInStream_Read(const ISeqInStream *pp, void *data, size_t *size) } *size = sizeOriginal; - RINOK(SbEnc_Read(&p->enc, data, size)); + RINOK(SbEnc_Read(&p->enc, data, size)) if (*size != 0 || !p->enc.needRead) return SZ_OK; } @@ -473,7 +509,8 @@ void XzFilterProps_Init(CXzFilterProps *p) void XzProps_Init(CXzProps *p) { p->checkId = XZ_CHECK_CRC32; - p->blockSize = XZ_PROPS__BLOCK_SIZE__AUTO; + p->numThreadGroups = 0; + p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO; p->numBlockThreads_Reduced = -1; p->numBlockThreads_Max = -1; p->numTotalThreads = -1; @@ -502,8 +539,8 @@ static void XzEncProps_Normalize_Fixed(CXzProps *p) t2 = p->numBlockThreads_Max; t3 = p->numTotalThreads; - if (t2 > MTCODER__THREADS_MAX) - t2 = MTCODER__THREADS_MAX; + if (t2 > MTCODER_THREADS_MAX) + t2 = MTCODER_THREADS_MAX; if (t3 <= 0) { @@ -519,8 +556,8 @@ static void XzEncProps_Normalize_Fixed(CXzProps *p) t1 = 1; t2 = t3; } - if (t2 > MTCODER__THREADS_MAX) - t2 = MTCODER__THREADS_MAX; + if (t2 > MTCODER_THREADS_MAX) + t2 = MTCODER_THREADS_MAX; } else if (t1 <= 0) { @@ -552,7 +589,7 @@ static void XzEncProps_Normalize_Fixed(CXzProps *p) numBlocks++; if (numBlocks < (unsigned)t2) { - t2r = (unsigned)numBlocks; + t2r = (int)numBlocks; if (t2r == 0) t2r = 1; t3 = t1 * t2r; @@ -571,7 +608,7 @@ static void XzProps_Normalize(CXzProps *p) /* we normalize xzProps properties, but we normalize only some of CXzProps::lzma2Props properties. 
Lzma2Enc_SetProps() will normalize lzma2Props later. */ - if (p->blockSize == XZ_PROPS__BLOCK_SIZE__SOLID) + if (p->blockSize == XZ_PROPS_BLOCK_SIZE_SOLID) { p->lzma2Props.lzmaProps.reduceSize = p->reduceSize; p->numBlockThreads_Reduced = 1; @@ -583,15 +620,15 @@ static void XzProps_Normalize(CXzProps *p) else { CLzma2EncProps *lzma2 = &p->lzma2Props; - if (p->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO) + if (p->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO) { // xz-auto p->lzma2Props.lzmaProps.reduceSize = p->reduceSize; - if (lzma2->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID) + if (lzma2->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID) { // if (xz-auto && lzma2-solid) - we use solid for both - p->blockSize = XZ_PROPS__BLOCK_SIZE__SOLID; + p->blockSize = XZ_PROPS_BLOCK_SIZE_SOLID; p->numBlockThreads_Reduced = 1; p->numBlockThreads_Max = 1; if (p->lzma2Props.numTotalThreads <= 0) @@ -610,9 +647,9 @@ static void XzProps_Normalize(CXzProps *p) p->blockSize = tp.blockSize; // fixed or solid p->numBlockThreads_Reduced = tp.numBlockThreads_Reduced; p->numBlockThreads_Max = tp.numBlockThreads_Max; - if (lzma2->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO) - lzma2->blockSize = tp.blockSize; // fixed or solid, LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID - if (lzma2->lzmaProps.reduceSize > tp.blockSize && tp.blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID) + if (lzma2->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO) + lzma2->blockSize = tp.blockSize; // fixed or solid, LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID + if (lzma2->lzmaProps.reduceSize > tp.blockSize && tp.blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID) lzma2->lzmaProps.reduceSize = tp.blockSize; lzma2->numBlockThreads_Reduced = 1; lzma2->numBlockThreads_Max = 1; @@ -631,9 +668,9 @@ static void XzProps_Normalize(CXzProps *p) r = p->blockSize; lzma2->lzmaProps.reduceSize = r; } - if (lzma2->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO) - lzma2->blockSize = LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID; - else if (lzma2->blockSize > p->blockSize && lzma2->blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID) + if (lzma2->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO) + lzma2->blockSize = LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID; + else if (lzma2->blockSize > p->blockSize && lzma2->blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID) lzma2->blockSize = p->blockSize; XzEncProps_Normalize_Fixed(p); @@ -655,6 +692,7 @@ typedef struct } CLzma2WithFilters; +Z7_FORCE_INLINE static void Lzma2WithFilters_Construct(CLzma2WithFilters *p) { p->lzma2 = NULL; @@ -678,6 +716,7 @@ static SRes Lzma2WithFilters_Create(CLzma2WithFilters *p, ISzAllocPtr alloc, ISz } +Z7_FORCE_INLINE static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc) { #ifdef USE_SUBBLOCK @@ -704,17 +743,17 @@ typedef struct static SRes Xz_CompressBlock( CLzma2WithFilters *lzmaf, - ISeqOutStream *outStream, + ISeqOutStreamPtr outStream, Byte *outBufHeader, Byte *outBufData, size_t outBufDataLimit, - ISeqInStream *inStream, + ISeqInStreamPtr inStream, // UInt64 expectedSize, const Byte *inBuf, // used if (!inStream) size_t inBufSize, // used if (!inStream), it's block size, props->blockSize is ignored const CXzProps *props, - ICompressProgress *progress, + ICompressProgressPtr progress, int *inStreamFinished, /* only for inStream version */ CXzEncBlockInfo *blockSizes, ISzAllocPtr alloc, @@ -731,12 +770,12 @@ static SRes Xz_CompressBlock( *inStreamFinished = False; - RINOK(Lzma2WithFilters_Create(lzmaf, alloc, allocBig)); + RINOK(Lzma2WithFilters_Create(lzmaf, alloc, allocBig)) - 
RINOK(Lzma2Enc_SetProps(lzmaf->lzma2, &props->lzma2Props)); + RINOK(Lzma2Enc_SetProps(lzmaf->lzma2, &props->lzma2Props)) - XzBlock_ClearFlags(&block); - XzBlock_SetNumFilters(&block, 1 + (fp ? 1 : 0)); + // XzBlock_ClearFlags(&block) + XzBlock_ClearFlags_SetNumFilters(&block, 1 + (fp ? 1 : 0)) if (fp) { @@ -751,7 +790,8 @@ static SRes Xz_CompressBlock( } else if (fp->ipDefined) { - SetUi32(filter->props, fp->ip); + Byte *ptr = filter->props; + SetUi32(ptr, fp->ip) filter->propsSize = 4; } } @@ -776,13 +816,13 @@ static SRes Xz_CompressBlock( if (props->blockSize != (UInt64)(Int64)-1) if (expectedSize > props->blockSize) block.unpackSize = props->blockSize; - XzBlock_SetHasUnpackSize(&block); + XzBlock_SetHasUnpackSize(&block) } */ if (outStream) { - RINOK(XzBlock_WriteHeader(&block, &seqSizeOutStream.vt)); + RINOK(XzBlock_WriteHeader(&block, &seqSizeOutStream.vt)) } checkInStream.vt.Read = SeqCheckInStream_Read; @@ -800,13 +840,13 @@ static SRes Xz_CompressBlock( if (fp->id == XZ_ID_Subblock) { lzmaf->sb.inStream = &checkInStream.vt; - RINOK(SbEncInStream_Init(&lzmaf->sb)); + RINOK(SbEncInStream_Init(&lzmaf->sb)) } else #endif { lzmaf->filter.realStream = &checkInStream.vt; - RINOK(SeqInFilter_Init(&lzmaf->filter, filter, alloc)); + RINOK(SeqInFilter_Init(&lzmaf->filter, filter, alloc)) } } @@ -840,7 +880,7 @@ static SRes Xz_CompressBlock( #ifdef USE_SUBBLOCK (fp->id == XZ_ID_Subblock) ? &lzmaf->sb.vt: #endif - &lzmaf->filter.p) : + &lzmaf->filter.vt) : &checkInStream.vt) : NULL, useStream ? NULL : inBuf, @@ -851,13 +891,13 @@ static SRes Xz_CompressBlock( if (outBuf) seqSizeOutStream.processed += outSize; - RINOK(res); + RINOK(res) blockSizes->unpackSize = checkInStream.processed; } { - Byte buf[4 + 64]; - unsigned padSize = XZ_GET_PAD_SIZE(seqSizeOutStream.processed); - UInt64 packSize = seqSizeOutStream.processed; + Byte buf[4 + XZ_CHECK_SIZE_MAX]; + const unsigned padSize = XZ_GET_PAD_SIZE(seqSizeOutStream.processed); + const UInt64 packSize = seqSizeOutStream.processed; buf[0] = 0; buf[1] = 0; @@ -865,7 +905,8 @@ static SRes Xz_CompressBlock( buf[3] = 0; SeqCheckInStream_GetDigest(&checkInStream, buf + 4); - RINOK(WriteBytes(&seqSizeOutStream.vt, buf + (4 - padSize), padSize + XzFlags_GetCheckSize((CXzStreamFlags)props->checkId))); + RINOK(WriteBytes(&seqSizeOutStream.vt, buf + (4 - padSize), + padSize + XzFlags_GetCheckSize((CXzStreamFlags)props->checkId))) blockSizes->totalSize = seqSizeOutStream.processed - padSize; @@ -876,12 +917,12 @@ static SRes Xz_CompressBlock( seqSizeOutStream.processed = 0; block.unpackSize = blockSizes->unpackSize; - XzBlock_SetHasUnpackSize(&block); + XzBlock_SetHasUnpackSize(&block) block.packSize = packSize; - XzBlock_SetHasPackSize(&block); + XzBlock_SetHasPackSize(&block) - RINOK(XzBlock_WriteHeader(&block, &seqSizeOutStream.vt)); + RINOK(XzBlock_WriteHeader(&block, &seqSizeOutStream.vt)) blockSizes->headerSize = (size_t)seqSizeOutStream.processed; blockSizes->totalSize += seqSizeOutStream.processed; @@ -905,15 +946,15 @@ static SRes Xz_CompressBlock( typedef struct { ICompressProgress vt; - ICompressProgress *progress; + ICompressProgressPtr progress; UInt64 inOffset; UInt64 outOffset; } CCompressProgress_XzEncOffset; -static SRes CompressProgress_XzEncOffset_Progress(const ICompressProgress *pp, UInt64 inSize, UInt64 outSize) +static SRes CompressProgress_XzEncOffset_Progress(ICompressProgressPtr pp, UInt64 inSize, UInt64 outSize) { - const CCompressProgress_XzEncOffset *p = CONTAINER_FROM_VTBL(pp, CCompressProgress_XzEncOffset, vt); + const 
CCompressProgress_XzEncOffset *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CCompressProgress_XzEncOffset, vt); inSize += p->inOffset; outSize += p->outOffset; return ICompressProgress_Progress(p->progress, inSize, outSize); @@ -922,7 +963,7 @@ static SRes CompressProgress_XzEncOffset_Progress(const ICompressProgress *pp, U -typedef struct +struct CXzEnc { ISzAllocPtr alloc; ISzAllocPtr allocBig; @@ -932,20 +973,19 @@ typedef struct CXzEncIndex xzIndex; - CLzma2WithFilters lzmaf_Items[MTCODER__THREADS_MAX]; + CLzma2WithFilters lzmaf_Items[MTCODER_THREADS_MAX]; size_t outBufSize; /* size of allocated outBufs[i] */ - Byte *outBufs[MTCODER__BLOCKS_MAX]; + Byte *outBufs[MTCODER_BLOCKS_MAX]; - #ifndef _7ZIP_ST + #ifndef Z7_ST unsigned checkType; - ISeqOutStream *outStream; + ISeqOutStreamPtr outStream; BoolInt mtCoder_WasConstructed; CMtCoder mtCoder; - CXzEncBlockInfo EncBlocks[MTCODER__BLOCKS_MAX]; + CXzEncBlockInfo EncBlocks[MTCODER_BLOCKS_MAX]; #endif - -} CXzEnc; +}; static void XzEnc_Construct(CXzEnc *p) @@ -954,13 +994,13 @@ static void XzEnc_Construct(CXzEnc *p) XzEncIndex_Construct(&p->xzIndex); - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) Lzma2WithFilters_Construct(&p->lzmaf_Items[i]); - #ifndef _7ZIP_ST + #ifndef Z7_ST p->mtCoder_WasConstructed = False; { - for (i = 0; i < MTCODER__BLOCKS_MAX; i++) + for (i = 0; i < MTCODER_BLOCKS_MAX; i++) p->outBufs[i] = NULL; p->outBufSize = 0; } @@ -971,7 +1011,7 @@ static void XzEnc_Construct(CXzEnc *p) static void XzEnc_FreeOutBufs(CXzEnc *p) { unsigned i; - for (i = 0; i < MTCODER__BLOCKS_MAX; i++) + for (i = 0; i < MTCODER_BLOCKS_MAX; i++) if (p->outBufs[i]) { ISzAlloc_Free(p->alloc, p->outBufs[i]); @@ -987,10 +1027,10 @@ static void XzEnc_Free(CXzEnc *p, ISzAllocPtr alloc) XzEncIndex_Free(&p->xzIndex, alloc); - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) Lzma2WithFilters_Free(&p->lzmaf_Items[i], alloc); - #ifndef _7ZIP_ST + #ifndef Z7_ST if (p->mtCoder_WasConstructed) { MtCoder_Destruct(&p->mtCoder); @@ -1012,37 +1052,38 @@ CXzEncHandle XzEnc_Create(ISzAllocPtr alloc, ISzAllocPtr allocBig) p->expectedDataSize = (UInt64)(Int64)-1; p->alloc = alloc; p->allocBig = allocBig; - return p; + return (CXzEncHandle)p; } +// #define GET_CXzEnc_p CXzEnc *p = (CXzEnc *)(void *)pp; -void XzEnc_Destroy(CXzEncHandle pp) +void XzEnc_Destroy(CXzEncHandle p) { - CXzEnc *p = (CXzEnc *)pp; + // GET_CXzEnc_p XzEnc_Free(p, p->alloc); ISzAlloc_Free(p->alloc, p); } -SRes XzEnc_SetProps(CXzEncHandle pp, const CXzProps *props) +SRes XzEnc_SetProps(CXzEncHandle p, const CXzProps *props) { - CXzEnc *p = (CXzEnc *)pp; + // GET_CXzEnc_p p->xzProps = *props; XzProps_Normalize(&p->xzProps); return SZ_OK; } -void XzEnc_SetDataSize(CXzEncHandle pp, UInt64 expectedDataSiize) +void XzEnc_SetDataSize(CXzEncHandle p, UInt64 expectedDataSiize) { - CXzEnc *p = (CXzEnc *)pp; + // GET_CXzEnc_p p->expectedDataSize = expectedDataSiize; } -#ifndef _7ZIP_ST +#ifndef Z7_ST static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBufIndex, const Byte *src, size_t srcSize, int finished) @@ -1050,18 +1091,19 @@ static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBuf CXzEnc *me = (CXzEnc *)pp; SRes res; CMtProgressThunk progressThunk; - - Byte *dest = me->outBufs[outBufIndex]; - + Byte *dest; UNUSED_VAR(finished) - { CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; bInfo->totalSize = 0; bInfo->unpackSize = 0; bInfo->headerSize = 0; + // v23.02: we don't compress 
empty blocks + // also we must ignore that empty block in XzEnc_MtCallback_Write() + if (srcSize == 0) + return SZ_OK; } - + dest = me->outBufs[outBufIndex]; if (!dest) { dest = (Byte *)ISzAlloc_Alloc(me->alloc, me->outBufSize); @@ -1072,7 +1114,7 @@ static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBuf MtProgressThunk_CreateVTable(&progressThunk); progressThunk.mtProgress = &me->mtCoder.mtProgress; - MtProgressThunk_Init(&progressThunk); + MtProgressThunk_INIT(&progressThunk) { CXzEncBlockInfo blockSizes; @@ -1107,27 +1149,29 @@ static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBuf static SRes XzEnc_MtCallback_Write(void *pp, unsigned outBufIndex) { CXzEnc *me = (CXzEnc *)pp; - const CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; - const Byte *data = me->outBufs[outBufIndex]; - - RINOK(WriteBytes(me->outStream, data, bInfo->headerSize)); - + // v23.02: we don't write empty blocks + // note: if (bInfo->unpackSize == 0) then there is no compressed data of block + if (bInfo->unpackSize == 0) + return SZ_OK; { - UInt64 totalPackFull = bInfo->totalSize + XZ_GET_PAD_SIZE(bInfo->totalSize); - RINOK(WriteBytes(me->outStream, data + XZ_BLOCK_HEADER_SIZE_MAX, (size_t)totalPackFull - bInfo->headerSize)); + const Byte *data = me->outBufs[outBufIndex]; + RINOK(WriteBytes(me->outStream, data, bInfo->headerSize)) + { + const UInt64 totalPackFull = bInfo->totalSize + XZ_GET_PAD_SIZE(bInfo->totalSize); + RINOK(WriteBytes(me->outStream, data + XZ_BLOCK_HEADER_SIZE_MAX, (size_t)totalPackFull - bInfo->headerSize)) + } + return XzEncIndex_AddIndexRecord(&me->xzIndex, bInfo->unpackSize, bInfo->totalSize, me->alloc); } - - return XzEncIndex_AddIndexRecord(&me->xzIndex, bInfo->unpackSize, bInfo->totalSize, me->alloc); } #endif -SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, ICompressProgress *progress) +SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, ICompressProgressPtr progress) { - CXzEnc *p = (CXzEnc *)pp; + // GET_CXzEnc_p const CXzProps *props = &p->xzProps; @@ -1136,7 +1180,7 @@ SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr UInt64 numBlocks = 1; UInt64 blockSize = props->blockSize; - if (blockSize != XZ_PROPS__BLOCK_SIZE__SOLID + if (blockSize != XZ_PROPS_BLOCK_SIZE_SOLID && props->reduceSize != (UInt64)(Int64)-1) { numBlocks = props->reduceSize / blockSize; @@ -1146,13 +1190,13 @@ SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr else blockSize = (UInt64)1 << 62; - RINOK(XzEncIndex_PreAlloc(&p->xzIndex, numBlocks, blockSize, XZ_GET_ESTIMATED_BLOCK_TOTAL_PACK_SIZE(blockSize), p->alloc)); + RINOK(XzEncIndex_PreAlloc(&p->xzIndex, numBlocks, blockSize, XZ_GET_ESTIMATED_BLOCK_TOTAL_PACK_SIZE(blockSize), p->alloc)) } - RINOK(Xz_WriteHeader((CXzStreamFlags)props->checkId, outStream)); + RINOK(Xz_WriteHeader((CXzStreamFlags)props->checkId, outStream)) - #ifndef _7ZIP_ST + #ifndef Z7_ST if (props->numBlockThreads_Reduced > 1) { IMtCoderCallback2 vt; @@ -1179,8 +1223,8 @@ SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr p->mtCoder.mtCallback = &vt; p->mtCoder.mtCallbackObject = p; - if ( props->blockSize == XZ_PROPS__BLOCK_SIZE__SOLID - || props->blockSize == XZ_PROPS__BLOCK_SIZE__AUTO) + if ( props->blockSize == XZ_PROPS_BLOCK_SIZE_SOLID + || props->blockSize == XZ_PROPS_BLOCK_SIZE_AUTO) return SZ_ERROR_FAIL; p->mtCoder.blockSize = (size_t)props->blockSize; @@ -1196,10 +1240,11 @@ 
SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr p->outBufSize = destBlockSize; } - p->mtCoder.numThreadsMax = props->numBlockThreads_Max; + p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max; + p->mtCoder.numThreadGroups = props->numThreadGroups; p->mtCoder.expectedDataSize = p->expectedDataSize; - RINOK(MtCoder_Code(&p->mtCoder)); + RINOK(MtCoder_Code(&p->mtCoder)) } else #endif @@ -1216,7 +1261,7 @@ SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr writeStartSizes = 0; - if (props->blockSize != XZ_PROPS__BLOCK_SIZE__SOLID) + if (props->blockSize != XZ_PROPS_BLOCK_SIZE_SOLID) { writeStartSizes = (props->forceWriteSizesInHeader > 0); @@ -1273,18 +1318,18 @@ SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr &inStreamFinished, &blockSizes, p->alloc, - p->allocBig)); + p->allocBig)) { UInt64 totalPackFull = blockSizes.totalSize + XZ_GET_PAD_SIZE(blockSizes.totalSize); if (writeStartSizes) { - RINOK(WriteBytes(outStream, p->outBufs[0], blockSizes.headerSize)); - RINOK(WriteBytes(outStream, bufData, (size_t)totalPackFull - blockSizes.headerSize)); + RINOK(WriteBytes(outStream, p->outBufs[0], blockSizes.headerSize)) + RINOK(WriteBytes(outStream, bufData, (size_t)totalPackFull - blockSizes.headerSize)) } - RINOK(XzEncIndex_AddIndexRecord(&p->xzIndex, blockSizes.unpackSize, blockSizes.totalSize, p->alloc)); + RINOK(XzEncIndex_AddIndexRecord(&p->xzIndex, blockSizes.unpackSize, blockSizes.totalSize, p->alloc)) progress2.inOffset += blockSizes.unpackSize; progress2.outOffset += totalPackFull; @@ -1301,8 +1346,8 @@ SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr #include "Alloc.h" -SRes Xz_Encode(ISeqOutStream *outStream, ISeqInStream *inStream, - const CXzProps *props, ICompressProgress *progress) +SRes Xz_Encode(ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, + const CXzProps *props, ICompressProgressPtr progress) { SRes res; CXzEncHandle xz = XzEnc_Create(&g_Alloc, &g_BigAlloc); @@ -1316,7 +1361,7 @@ SRes Xz_Encode(ISeqOutStream *outStream, ISeqInStream *inStream, } -SRes Xz_EncodeEmpty(ISeqOutStream *outStream) +SRes Xz_EncodeEmpty(ISeqOutStreamPtr outStream) { SRes res; CXzEncIndex xzIndex; diff --git a/src/sdk/C/XzEnc.h b/src/sdk/C/XzEnc.h index 0c29e7e..ac6bbf7 100644 --- a/src/sdk/C/XzEnc.h +++ b/src/sdk/C/XzEnc.h @@ -1,8 +1,8 @@ /* XzEnc.h -- Xz Encode -2017-06-27 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ -#ifndef __XZ_ENC_H -#define __XZ_ENC_H +#ifndef ZIP7_INC_XZ_ENC_H +#define ZIP7_INC_XZ_ENC_H #include "Lzma2Enc.h" @@ -11,8 +11,8 @@ EXTERN_C_BEGIN -#define XZ_PROPS__BLOCK_SIZE__AUTO LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO -#define XZ_PROPS__BLOCK_SIZE__SOLID LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID +#define XZ_PROPS_BLOCK_SIZE_AUTO LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO +#define XZ_PROPS_BLOCK_SIZE_SOLID LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID typedef struct @@ -31,6 +31,7 @@ typedef struct CLzma2EncProps lzma2Props; CXzFilterProps filterProps; unsigned checkId; + unsigned numThreadGroups; // 0 : no groups UInt64 blockSize; int numBlockThreads_Reduced; int numBlockThreads_Max; @@ -41,19 +42,20 @@ typedef struct void XzProps_Init(CXzProps *p); - -typedef void * CXzEncHandle; +typedef struct CXzEnc CXzEnc; +typedef CXzEnc * CXzEncHandle; +// Z7_DECLARE_HANDLE(CXzEncHandle) CXzEncHandle XzEnc_Create(ISzAllocPtr alloc, ISzAllocPtr allocBig); void XzEnc_Destroy(CXzEncHandle p); SRes XzEnc_SetProps(CXzEncHandle p, const CXzProps *props); void 
XzEnc_SetDataSize(CXzEncHandle p, UInt64 expectedDataSiize); -SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStream *outStream, ISeqInStream *inStream, ICompressProgress *progress); +SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, ICompressProgressPtr progress); -SRes Xz_Encode(ISeqOutStream *outStream, ISeqInStream *inStream, - const CXzProps *props, ICompressProgress *progress); +SRes Xz_Encode(ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, + const CXzProps *props, ICompressProgressPtr progress); -SRes Xz_EncodeEmpty(ISeqOutStream *outStream); +SRes Xz_EncodeEmpty(ISeqOutStreamPtr outStream); EXTERN_C_END diff --git a/src/sdk/C/XzIn.c b/src/sdk/C/XzIn.c index ff48e2d..ba31636 100644 --- a/src/sdk/C/XzIn.c +++ b/src/sdk/C/XzIn.c @@ -1,40 +1,44 @@ /* XzIn.c - Xz input -2018-07-04 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include #include "7zCrc.h" -#include "CpuArch.h" #include "Xz.h" +#include "CpuArch.h" -/* -#define XZ_FOOTER_SIG_CHECK(p) (memcmp((p), XZ_FOOTER_SIG, XZ_FOOTER_SIG_SIZE) == 0) -*/ -#define XZ_FOOTER_SIG_CHECK(p) ((p)[0] == XZ_FOOTER_SIG_0 && (p)[1] == XZ_FOOTER_SIG_1) - +#define XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(p) \ + (GetUi16a((const Byte *)(const void *)(p) + 10) == \ + (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8))) -SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStream *inStream) +SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream) { - Byte sig[XZ_STREAM_HEADER_SIZE]; - RINOK(SeqInStream_Read2(inStream, sig, XZ_STREAM_HEADER_SIZE, SZ_ERROR_NO_ARCHIVE)); - if (memcmp(sig, XZ_SIG, XZ_SIG_SIZE) != 0) + UInt32 data32[XZ_STREAM_HEADER_SIZE / 4]; + size_t processedSize = XZ_STREAM_HEADER_SIZE; + RINOK(SeqInStream_ReadMax(inStream, data32, &processedSize)) + if (processedSize != XZ_STREAM_HEADER_SIZE + || memcmp(data32, XZ_SIG, XZ_SIG_SIZE) != 0) return SZ_ERROR_NO_ARCHIVE; - return Xz_ParseHeader(p, sig); + return Xz_ParseHeader(p, (const Byte *)(const void *)data32); } -#define READ_VARINT_AND_CHECK(buf, pos, size, res) \ - { unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ - if (s == 0) return SZ_ERROR_ARCHIVE; pos += s; } +#define READ_VARINT_AND_CHECK(buf, size, res) \ +{ const unsigned s = Xz_ReadVarInt(buf, size, res); \ + if (s == 0) return SZ_ERROR_ARCHIVE; \ + size -= s; \ + buf += s; \ +} -SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStream *inStream, BoolInt *isIndex, UInt32 *headerSizeRes) +SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes) { + MY_ALIGN(4) Byte header[XZ_BLOCK_HEADER_SIZE_MAX]; unsigned headerSize; *headerSizeRes = 0; - RINOK(SeqInStream_ReadByte(inStream, &header[0])); - headerSize = (unsigned)header[0]; + RINOK(SeqInStream_ReadByte(inStream, &header[0])) + headerSize = header[0]; if (headerSize == 0) { *headerSizeRes = 1; @@ -44,20 +48,31 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStream *inStream, BoolInt *isIndex, U *isIndex = False; headerSize = (headerSize << 2) + 4; - *headerSizeRes = headerSize; - RINOK(SeqInStream_Read(inStream, header + 1, headerSize - 1)); + *headerSizeRes = (UInt32)headerSize; + { + size_t processedSize = headerSize - 1; + RINOK(SeqInStream_ReadMax(inStream, header + 1, &processedSize)) + if (processedSize != headerSize - 1) + return SZ_ERROR_INPUT_EOF; + } return XzBlock_Parse(p, header); } + #define ADD_SIZE_CHECK(size, val) \ - { UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } +{ const UInt64 newSize = size + 
(val); \ + if (newSize < size) return XZ_SIZE_OVERFLOW; \ + size = newSize; \ +} UInt64 Xz_GetUnpackSize(const CXzStream *p) { UInt64 size = 0; size_t i; for (i = 0; i < p->numBlocks; i++) - ADD_SIZE_CHECK(size, p->blocks[i].unpackSize); + { + ADD_SIZE_CHECK(size, p->blocks[i].unpackSize) + } return size; } @@ -66,171 +81,204 @@ UInt64 Xz_GetPackSize(const CXzStream *p) UInt64 size = 0; size_t i; for (i = 0; i < p->numBlocks; i++) - ADD_SIZE_CHECK(size, (p->blocks[i].totalSize + 3) & ~(UInt64)3); + { + ADD_SIZE_CHECK(size, (p->blocks[i].totalSize + 3) & ~(UInt64)3) + } return size; } -/* -SRes XzBlock_ReadFooter(CXzBlock *p, CXzStreamFlags f, ISeqInStream *inStream) -{ - return SeqInStream_Read(inStream, p->check, XzFlags_GetCheckSize(f)); -} -*/ -static SRes Xz_ReadIndex2(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc) +// input; +// CXzStream (p) is empty object. +// size != 0 +// (size & 3) == 0 +// (buf) is aligned for at least 4 bytes. +// output: +// p->numBlocks is number of allocated items in p->blocks +// p->blocks[*] values must be ignored, if function returns error. +static SRes Xz_ParseIndex(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc) { - size_t numBlocks, pos = 1; - UInt32 crc; - + size_t numBlocks; if (size < 5 || buf[0] != 0) return SZ_ERROR_ARCHIVE; - size -= 4; - crc = CrcCalc(buf, size); - if (crc != GetUi32(buf + size)) - return SZ_ERROR_ARCHIVE; - + { + const UInt32 crc = CrcCalc(buf, size); + if (crc != GetUi32a(buf + size)) + return SZ_ERROR_ARCHIVE; + } + buf++; + size--; { UInt64 numBlocks64; - READ_VARINT_AND_CHECK(buf, pos, size, &numBlocks64); - numBlocks = (size_t)numBlocks64; - if (numBlocks != numBlocks64 || numBlocks * 2 > size) + READ_VARINT_AND_CHECK(buf, size, &numBlocks64) + // (numBlocks64) is 63-bit value, so we can calculate (numBlocks64 * 2): + if (numBlocks64 * 2 > size) return SZ_ERROR_ARCHIVE; + if (numBlocks64 >= ((size_t)1 << (sizeof(size_t) * 8 - 1)) / sizeof(CXzBlockSizes)) + return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE + numBlocks = (size_t)numBlocks64; } - - Xz_Free(p, alloc); - if (numBlocks != 0) + // Xz_Free(p, alloc); // it's optional, because (p) is empty already + if (numBlocks) { - size_t i; - p->numBlocks = numBlocks; - p->blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks); - if (!p->blocks) + CXzBlockSizes *blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks); + if (!blocks) return SZ_ERROR_MEM; - for (i = 0; i < numBlocks; i++) + p->blocks = blocks; + p->numBlocks = numBlocks; + // the caller will call Xz_Free() in case of error + do { - CXzBlockSizes *block = &p->blocks[i]; - READ_VARINT_AND_CHECK(buf, pos, size, &block->totalSize); - READ_VARINT_AND_CHECK(buf, pos, size, &block->unpackSize); - if (block->totalSize == 0) + READ_VARINT_AND_CHECK(buf, size, &blocks->totalSize) + READ_VARINT_AND_CHECK(buf, size, &blocks->unpackSize) + if (blocks->totalSize == 0) return SZ_ERROR_ARCHIVE; + blocks++; } + while (--numBlocks); } - while ((pos & 3) != 0) - if (buf[pos++] != 0) + if (size >= 4) + return SZ_ERROR_ARCHIVE; + while (size) + if (buf[--size]) return SZ_ERROR_ARCHIVE; - return (pos == size) ? 
SZ_OK : SZ_ERROR_ARCHIVE; + return SZ_OK; } -static SRes Xz_ReadIndex(CXzStream *p, ILookInStream *stream, UInt64 indexSize, ISzAllocPtr alloc) + +/* +static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc) { SRes res; size_t size; Byte *buf; - if (indexSize > ((UInt32)1 << 31)) - return SZ_ERROR_UNSUPPORTED; + if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1))) + return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE size = (size_t)indexSize; - if (size != indexSize) - return SZ_ERROR_UNSUPPORTED; buf = (Byte *)ISzAlloc_Alloc(alloc, size); if (!buf) return SZ_ERROR_MEM; res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); if (res == SZ_OK) - res = Xz_ReadIndex2(p, buf, size, alloc); + res = Xz_ParseIndex(p, buf, size, alloc); ISzAlloc_Free(alloc, buf); return res; } +*/ -static SRes LookInStream_SeekRead_ForArc(ILookInStream *stream, UInt64 offset, void *buf, size_t size) +static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size) { - RINOK(LookInStream_SeekTo(stream, offset)); + RINOK(LookInStream_SeekTo(stream, offset)) return LookInStream_Read(stream, buf, size); /* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */ } -static SRes Xz_ReadBackward(CXzStream *p, ILookInStream *stream, Int64 *startOffset, ISzAllocPtr alloc) + +/* +in: + (*startOffset) is position in (stream) where xz_stream must be finished. +out: + if returns SZ_OK, then (*startOffset) is position in stream that shows start of xz_stream. +*/ +static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc) { - UInt64 indexSize; - Byte buf[XZ_STREAM_FOOTER_SIZE]; - UInt64 pos = *startOffset; + #define TEMP_BUF_SIZE (1 << 10) + UInt32 buf32[TEMP_BUF_SIZE / 4]; + UInt64 pos = (UInt64)*startOffset; - if ((pos & 3) != 0 || pos < XZ_STREAM_FOOTER_SIZE) + if ((pos & 3) || pos < XZ_STREAM_FOOTER_SIZE) return SZ_ERROR_NO_ARCHIVE; - pos -= XZ_STREAM_FOOTER_SIZE; - RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)); + RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE)) - if (!XZ_FOOTER_SIG_CHECK(buf + 10)) + if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32)) { - UInt32 total = 0; pos += XZ_STREAM_FOOTER_SIZE; - for (;;) { - size_t i; - #define TEMP_BUF_SIZE (1 << 10) - Byte temp[TEMP_BUF_SIZE]; - - i = (pos > TEMP_BUF_SIZE) ? TEMP_BUF_SIZE : (size_t)pos; + // pos != 0 + // (pos & 3) == 0 + size_t i = pos >= TEMP_BUF_SIZE ? TEMP_BUF_SIZE : (size_t)pos; pos -= i; - RINOK(LookInStream_SeekRead_ForArc(stream, pos, temp, i)); - total += (UInt32)i; - for (; i != 0; i--) - if (temp[i - 1] != 0) + RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, i)) + i /= 4; + do + if (buf32[i - 1] != 0) break; - if (i != 0) - { - if ((i & 3) != 0) - return SZ_ERROR_NO_ARCHIVE; - pos += i; - break; - } - if (pos < XZ_STREAM_FOOTER_SIZE || total > (1 << 16)) + while (--i); + + pos += i * 4; + #define XZ_STREAM_BACKWARD_READING_PAD_MAX (1 << 16) + // here we don't support rare case with big padding for xz stream. + // so we have padding limit for backward reading. + if ((UInt64)*startOffset - pos > XZ_STREAM_BACKWARD_READING_PAD_MAX) return SZ_ERROR_NO_ARCHIVE; + if (i) + break; } - + // we try to open xz stream after skipping zero padding. + // ((UInt64)*startOffset == pos) is possible here! 
if (pos < XZ_STREAM_FOOTER_SIZE) return SZ_ERROR_NO_ARCHIVE; pos -= XZ_STREAM_FOOTER_SIZE; - RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)); - if (!XZ_FOOTER_SIG_CHECK(buf + 10)) + RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE)) + if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32)) return SZ_ERROR_NO_ARCHIVE; } - p->flags = (CXzStreamFlags)GetBe16(buf + 8); - + p->flags = (CXzStreamFlags)GetBe16a(buf32 + 2); if (!XzFlags_IsSupported(p->flags)) return SZ_ERROR_UNSUPPORTED; - - if (GetUi32(buf) != CrcCalc(buf + 4, 6)) - return SZ_ERROR_ARCHIVE; - - indexSize = ((UInt64)GetUi32(buf + 4) + 1) << 2; - - if (pos < indexSize) - return SZ_ERROR_ARCHIVE; - - pos -= indexSize; - RINOK(LookInStream_SeekTo(stream, pos)); - RINOK(Xz_ReadIndex(p, stream, indexSize, alloc)); - { - UInt64 totalSize = Xz_GetPackSize(p); - if (totalSize == XZ_SIZE_OVERFLOW - || totalSize >= ((UInt64)1 << 63) - || pos < totalSize + XZ_STREAM_HEADER_SIZE) + /* to eliminate GCC 6.3 warning: + dereferencing type-punned pointer will break strict-aliasing rules */ + const UInt32 *buf_ptr = buf32; + if (GetUi32a(buf_ptr) != CrcCalc(buf32 + 1, 6)) return SZ_ERROR_ARCHIVE; - pos -= (totalSize + XZ_STREAM_HEADER_SIZE); - RINOK(LookInStream_SeekTo(stream, pos)); - *startOffset = pos; + } + { + const UInt64 indexSize = ((UInt64)GetUi32a(buf32 + 1) + 1) << 2; + if (pos < indexSize) + return SZ_ERROR_ARCHIVE; + pos -= indexSize; + // v25.00: relaxed indexSize check. We allow big index table. + // if (indexSize > ((UInt32)1 << 31)) + if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1))) + return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE + RINOK(LookInStream_SeekTo(stream, pos)) + // RINOK(Xz_ReadIndex(p, stream, indexSize, alloc)) + { + SRes res; + const size_t size = (size_t)indexSize; + // if (size != indexSize) return SZ_ERROR_UNSUPPORTED; + Byte *buf = (Byte *)ISzAlloc_Alloc(alloc, size); + if (!buf) + return SZ_ERROR_MEM; + res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); + if (res == SZ_OK) + res = Xz_ParseIndex(p, buf, size, alloc); + ISzAlloc_Free(alloc, buf); + RINOK(res) + } + } + { + UInt64 total = Xz_GetPackSize(p); + if (total == XZ_SIZE_OVERFLOW || total >= ((UInt64)1 << 63)) + return SZ_ERROR_ARCHIVE; + total += XZ_STREAM_HEADER_SIZE; + if (pos < total) + return SZ_ERROR_ARCHIVE; + pos -= total; + RINOK(LookInStream_SeekTo(stream, pos)) + *startOffset = (Int64)pos; } { CXzStreamFlags headerFlags; CSecToRead secToRead; SecToRead_CreateVTable(&secToRead); secToRead.realStream = stream; - - RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt)); + RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt)) return (p->flags == headerFlags) ? 
SZ_OK : SZ_ERROR_ARCHIVE; } } @@ -240,8 +288,7 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStream *stream, Int64 *startOff void Xzs_Construct(CXzs *p) { - p->num = p->numAllocated = 0; - p->streams = 0; + Xzs_CONSTRUCT(p) } void Xzs_Free(CXzs *p, ISzAllocPtr alloc) @@ -251,7 +298,7 @@ void Xzs_Free(CXzs *p, ISzAllocPtr alloc) Xz_Free(&p->streams[i], alloc); ISzAlloc_Free(alloc, p->streams); p->num = p->numAllocated = 0; - p->streams = 0; + p->streams = NULL; } UInt64 Xzs_GetNumBlocks(const CXzs *p) @@ -268,7 +315,9 @@ UInt64 Xzs_GetUnpackSize(const CXzs *p) UInt64 size = 0; size_t i; for (i = 0; i < p->num; i++) - ADD_SIZE_CHECK(size, Xz_GetUnpackSize(&p->streams[i])); + { + ADD_SIZE_CHECK(size, Xz_GetUnpackSize(&p->streams[i])) + } return size; } @@ -278,42 +327,59 @@ UInt64 Xzs_GetPackSize(const CXzs *p) UInt64 size = 0; size_t i; for (i = 0; i < p->num; i++) - ADD_SIZE_CHECK(size, Xz_GetTotalSize(&p->streams[i])); + { + ADD_SIZE_CHECK(size, Xz_GetTotalSize(&p->streams[i])) + } return size; } */ -SRes Xzs_ReadBackward(CXzs *p, ILookInStream *stream, Int64 *startOffset, ICompressProgress *progress, ISzAllocPtr alloc) +SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc) { Int64 endOffset = 0; - RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END)); + // it's supposed that CXzs object is empty here. + // if CXzs object is not empty, it will add new streams to that non-empty object. + // Xzs_Free(p, alloc); // it's optional call to empty CXzs object. + RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END)) *startOffset = endOffset; for (;;) { CXzStream st; SRes res; - Xz_Construct(&st); + Xz_CONSTRUCT(&st) res = Xz_ReadBackward(&st, stream, startOffset, alloc); - st.startOffset = *startOffset; - RINOK(res); + // if (res == SZ_OK), then (*startOffset) is start offset of new stream + // if (res != SZ_OK), then (*startOffset) is unchanged or it's expected start offset of stream with error + st.startOffset = (UInt64)*startOffset; + // we must store (st) object to array, or we must free (st) local object. + if (res != SZ_OK) + { + Xz_Free(&st, alloc); + return res; + } if (p->num == p->numAllocated) { - size_t newNum = p->num + p->num / 4 + 1; - Byte *data = (Byte *)ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream)); + const size_t newNum = p->num + p->num / 4 + 1; + void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream)); if (!data) + { + Xz_Free(&st, alloc); return SZ_ERROR_MEM; + } p->numAllocated = newNum; if (p->num != 0) memcpy(data, p->streams, p->num * sizeof(CXzStream)); ISzAlloc_Free(alloc, p->streams); p->streams = (CXzStream *)data; } + // we use direct copying of raw data from local variable (st) to object in array. 
+ // so we don't need to call Xz_Free(&st, alloc) after copying and after p->num++ p->streams[p->num++] = st; if (*startOffset == 0) - break; - RINOK(LookInStream_SeekTo(stream, *startOffset)); - if (progress && ICompressProgress_Progress(progress, endOffset - *startOffset, (UInt64)(Int64)-1) != SZ_OK) + return SZ_OK; + // seek operation is optional: + // RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset)) + if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK) return SZ_ERROR_PROGRESS; } - return SZ_OK; } diff --git a/src/sdk/DOC/7zC.txt b/src/sdk/DOC/7zC.txt index 939b720..4927678 100644 --- a/src/sdk/DOC/7zC.txt +++ b/src/sdk/DOC/7zC.txt @@ -1,187 +1,187 @@ -7z ANSI-C Decoder 9.35 ----------------------- - -7z ANSI-C provides 7z/LZMA decoding. -7z ANSI-C version is simplified version ported from C++ code. - -LZMA is default and general compression method of 7z format -in 7-Zip compression program (www.7-zip.org). LZMA provides high -compression ratio and very fast decompression. - - -LICENSE -------- - -7z ANSI-C Decoder is part of the LZMA SDK. -LZMA SDK is written and placed in the public domain by Igor Pavlov. - -Files ---------------------- - -7zDecode.* - Low level 7z decoding -7zExtract.* - High level 7z decoding -7zHeader.* - .7z format constants -7zIn.* - .7z archive opening -7zItem.* - .7z structures -7zMain.c - Test application - - -How To Use ----------- - -You can create .7z archive with 7z.exe, 7za.exe or 7zr.exe: - - 7z.exe a archive.7z *.htm -r -mx -m0fb=255 - -If you have big number of files in archive, and you need fast extracting, -you can use partly-solid archives: - - 7za.exe a archive.7z *.htm -ms=512K -r -mx -m0fb=255 -m0d=512K - -In that example 7-Zip will use 512KB solid blocks. So it needs to decompress only -512KB for extracting one file from such archive. - - -Limitations of current version of 7z ANSI-C Decoder ---------------------------------------------------- - - - It reads only "FileName", "Size", "LastWriteTime" and "CRC" information for each file in archive. - - It supports only LZMA and Copy (no compression) methods with BCJ or BCJ2 filters. - - It converts original UTF-16 Unicode file names to UTF-8 Unicode file names. - -These limitations will be fixed in future versions. - - -Using 7z ANSI-C Decoder Test application: ------------------------------------------ - -Usage: 7zDec - -: - e: Extract files from archive - l: List contents of archive - t: Test integrity of archive - -Example: - - 7zDec l archive.7z - -lists contents of archive.7z - - 7zDec e archive.7z - -extracts files from archive.7z to current folder. - - -How to use .7z Decoder ----------------------- - -Memory allocation -~~~~~~~~~~~~~~~~~ - -7z Decoder uses two memory pools: -1) Temporary pool -2) Main pool -Such scheme can allow you to avoid fragmentation of allocated blocks. - - -Steps for using 7z decoder --------------------------- - -Use code at 7zMain.c as example. - -1) Declare variables: - inStream /* implements ILookInStream interface */ - CSzArEx db; /* 7z archive database structure */ - ISzAlloc allocImp; /* memory functions for main pool */ - ISzAlloc allocTempImp; /* memory functions for temporary pool */ - -2) call CrcGenerateTable(); function to initialize CRC structures. - -3) call SzArEx_Init(&db); function to initialize db structures. - -4) call SzArEx_Open(&db, inStream, &allocMain, &allocTemp) to open archive - -This function opens archive "inStream" and reads headers to "db". 
-All items in "db" will be allocated with "allocMain" functions. -SzArEx_Open function allocates and frees temporary structures by "allocTemp" functions. - -5) List items or Extract items - - Listing code: - ~~~~~~~~~~~~~ - - Use SzArEx_GetFileNameUtf16 function. Look example code in C\Util\7z\7zMain.c file. - - - Extracting code: - ~~~~~~~~~~~~~~~~ - - SZ_RESULT SzAr_Extract( - CArchiveDatabaseEx *db, - ILookInStream *inStream, - UInt32 fileIndex, /* index of file */ - UInt32 *blockIndex, /* index of solid block */ - Byte **outBuffer, /* pointer to pointer to output buffer (allocated with allocMain) */ - size_t *outBufferSize, /* buffer size for output buffer */ - size_t *offset, /* offset of stream for required file in *outBuffer */ - size_t *outSizeProcessed, /* size of file in *outBuffer */ - ISzAlloc *allocMain, - ISzAlloc *allocTemp); - - If you need to decompress more than one file, you can send these values from previous call: - blockIndex, - outBuffer, - outBufferSize, - You can consider "outBuffer" as cache of solid block. If your archive is solid, - it will increase decompression speed. - - After decompressing you must free "outBuffer": - allocImp.Free(outBuffer); - -6) call SzArEx_Free(&db, allocImp.Free) to free allocated items in "db". - - - - -Memory requirements for .7z decoding ------------------------------------- - -Memory usage for Archive opening: - - Temporary pool: - - Memory for uncompressed .7z headers - - some other temporary blocks - - Main pool: - - Memory for database: - Estimated size of one file structures in solid archive: - - Size (4 or 8 Bytes) - - CRC32 (4 bytes) - - LastWriteTime (8 bytes) - - Some file information (4 bytes) - - File Name (variable length) + pointer + allocation structures - -Memory usage for archive Decompressing: - - Temporary pool: - - Memory for LZMA decompressing structures - - Main pool: - - Memory for decompressed solid block - - Memory for temprorary buffers, if BCJ2 fileter is used. Usually these - temprorary buffers can be about 15% of solid block size. - - -7z Decoder doesn't allocate memory for compressed blocks. -Instead of this, you must allocate buffer with desired -size before calling 7z Decoder. Use 7zMain.c as example. - - -Defines -------- - -_SZ_ALLOC_DEBUG - define it if you want to debug alloc/free operations to stderr. - - ---- - -http://www.7-zip.org -http://www.7-zip.org/sdk.html -http://www.7-zip.org/support.html +7z ANSI-C Decoder 9.35 +---------------------- + +7z ANSI-C provides 7z/LZMA decoding. +7z ANSI-C version is simplified version ported from C++ code. + +LZMA is default and general compression method of 7z format +in 7-Zip compression program (www.7-zip.org). LZMA provides high +compression ratio and very fast decompression. + + +LICENSE +------- + +7z ANSI-C Decoder is part of the LZMA SDK. +LZMA SDK is written and placed in the public domain by Igor Pavlov. + +Files +--------------------- + +7zDecode.* - Low level 7z decoding +7zExtract.* - High level 7z decoding +7zHeader.* - .7z format constants +7zIn.* - .7z archive opening +7zItem.* - .7z structures +7zMain.c - Test application + + +How To Use +---------- + +You can create .7z archive with 7z.exe, 7za.exe or 7zr.exe: + + 7z.exe a archive.7z *.htm -r -mx -m0fb=255 + +If you have big number of files in archive, and you need fast extracting, +you can use partly-solid archives: + + 7za.exe a archive.7z *.htm -ms=512K -r -mx -m0fb=255 -m0d=512K + +In that example 7-Zip will use 512KB solid blocks. 
So it needs to decompress only +512KB for extracting one file from such archive. + + +Limitations of current version of 7z ANSI-C Decoder +--------------------------------------------------- + + - It reads only "FileName", "Size", "LastWriteTime" and "CRC" information for each file in archive. + - It supports only LZMA and Copy (no compression) methods with BCJ or BCJ2 filters. + - It converts original UTF-16 Unicode file names to UTF-8 Unicode file names. + +These limitations will be fixed in future versions. + + +Using 7z ANSI-C Decoder Test application: +----------------------------------------- + +Usage: 7zDec + +: + e: Extract files from archive + l: List contents of archive + t: Test integrity of archive + +Example: + + 7zDec l archive.7z + +lists contents of archive.7z + + 7zDec e archive.7z + +extracts files from archive.7z to current folder. + + +How to use .7z Decoder +---------------------- + +Memory allocation +~~~~~~~~~~~~~~~~~ + +7z Decoder uses two memory pools: +1) Temporary pool +2) Main pool +Such scheme can allow you to avoid fragmentation of allocated blocks. + + +Steps for using 7z decoder +-------------------------- + +Use code at 7zMain.c as example. + +1) Declare variables: + inStream /* implements ILookInStream interface */ + CSzArEx db; /* 7z archive database structure */ + ISzAlloc allocImp; /* memory functions for main pool */ + ISzAlloc allocTempImp; /* memory functions for temporary pool */ + +2) call CrcGenerateTable(); function to initialize CRC structures. + +3) call SzArEx_Init(&db); function to initialize db structures. + +4) call SzArEx_Open(&db, inStream, &allocMain, &allocTemp) to open archive + +This function opens archive "inStream" and reads headers to "db". +All items in "db" will be allocated with "allocMain" functions. +SzArEx_Open function allocates and frees temporary structures by "allocTemp" functions. + +5) List items or Extract items + + Listing code: + ~~~~~~~~~~~~~ + + Use SzArEx_GetFileNameUtf16 function. Look example code in C\Util\7z\7zMain.c file. + + + Extracting code: + ~~~~~~~~~~~~~~~~ + + SZ_RESULT SzAr_Extract( + CArchiveDatabaseEx *db, + ILookInStream *inStream, + UInt32 fileIndex, /* index of file */ + UInt32 *blockIndex, /* index of solid block */ + Byte **outBuffer, /* pointer to pointer to output buffer (allocated with allocMain) */ + size_t *outBufferSize, /* buffer size for output buffer */ + size_t *offset, /* offset of stream for required file in *outBuffer */ + size_t *outSizeProcessed, /* size of file in *outBuffer */ + ISzAlloc *allocMain, + ISzAlloc *allocTemp); + + If you need to decompress more than one file, you can send these values from previous call: + blockIndex, + outBuffer, + outBufferSize, + You can consider "outBuffer" as cache of solid block. If your archive is solid, + it will increase decompression speed. + + After decompressing you must free "outBuffer": + allocImp.Free(outBuffer); + +6) call SzArEx_Free(&db, allocImp.Free) to free allocated items in "db". 
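Putting steps 1) through 6) together, the open / extract / free sequence fits in a few dozen lines of C. The sketch below is only an outline modeled on the SDK's 7zMain.c: it uses the SzArEx_* entry points named in the steps above (the current sources spell the extraction call SzArEx_Extract with ISzAllocPtr arguments, which differs slightly from the older SzAr_Extract prototype quoted above), and the stream setup (CLookToRead2, 256 KB read buffer) and error handling are simplified assumptions, so treat 7zMain.c as the authoritative example.

/* Open archive, extract first file, free everything (sketch; see 7zMain.c). */
#include "7z.h"
#include "7zAlloc.h"
#include "7zCrc.h"
#include "7zFile.h"

static int ExtractFirstFile(const char *path)
{
  CFileInStream archiveStream;
  CLookToRead2 lookStream;
  CSzArEx db;                                          /* 7z archive database */
  ISzAlloc allocImp = { SzAlloc, SzFree };             /* main pool */
  ISzAlloc allocTempImp = { SzAllocTemp, SzFreeTemp }; /* temporary pool */
  SRes res;

  if (InFile_Open(&archiveStream.file, path) != 0)
    return 1;
  FileInStream_CreateVTable(&archiveStream);
  LookToRead2_CreateVTable(&lookStream, False);
  lookStream.buf = (Byte *)ISzAlloc_Alloc(&allocImp, (size_t)1 << 18);
  lookStream.bufSize = (size_t)1 << 18;
  lookStream.realStream = &archiveStream.vt;
  lookStream.pos = lookStream.size = 0;     /* same effect as the LookToRead2 init macro */
  if (!lookStream.buf)
  {
    File_Close(&archiveStream.file);
    return 1;
  }

  CrcGenerateTable();                                  /* step 2 */
  SzArEx_Init(&db);                                    /* step 3 */
  res = SzArEx_Open(&db, &lookStream.vt, &allocImp, &allocTempImp);  /* step 4 */

  if (res == SZ_OK && db.NumFiles != 0)
  {
    /* step 5: extract file 0; keep blockIndex / outBuffer / outBufferSize
       across calls so the solid-block cache is reused for further files */
    UInt32 blockIndex = 0xFFFFFFFF;
    Byte *outBuffer = NULL;
    size_t outBufferSize = 0, offset = 0, outSizeProcessed = 0;
    res = SzArEx_Extract(&db, &lookStream.vt, 0,
        &blockIndex, &outBuffer, &outBufferSize,
        &offset, &outSizeProcessed, &allocImp, &allocTempImp);
    /* on success the file data is at outBuffer + offset, outSizeProcessed bytes */
    ISzAlloc_Free(&allocImp, outBuffer);
  }

  SzArEx_Free(&db, &allocImp);                         /* step 6 */
  ISzAlloc_Free(&allocImp, lookStream.buf);
  File_Close(&archiveStream.file);
  return (res == SZ_OK) ? 0 : 1;
}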
+ + + + +Memory requirements for .7z decoding +------------------------------------ + +Memory usage for Archive opening: + - Temporary pool: + - Memory for uncompressed .7z headers + - some other temporary blocks + - Main pool: + - Memory for database: + Estimated size of one file structures in solid archive: + - Size (4 or 8 Bytes) + - CRC32 (4 bytes) + - LastWriteTime (8 bytes) + - Some file information (4 bytes) + - File Name (variable length) + pointer + allocation structures + +Memory usage for archive Decompressing: + - Temporary pool: + - Memory for LZMA decompressing structures + - Main pool: + - Memory for decompressed solid block + - Memory for temprorary buffers, if BCJ2 fileter is used. Usually these + temprorary buffers can be about 15% of solid block size. + + +7z Decoder doesn't allocate memory for compressed blocks. +Instead of this, you must allocate buffer with desired +size before calling 7z Decoder. Use 7zMain.c as example. + + +Defines +------- + +_SZ_ALLOC_DEBUG - define it if you want to debug alloc/free operations to stderr. + + +--- + +http://www.7-zip.org +http://www.7-zip.org/sdk.html +http://www.7-zip.org/support.html diff --git a/src/sdk/DOC/7zFormat.txt b/src/sdk/DOC/7zFormat.txt index 74cdfa4..9239e93 100644 --- a/src/sdk/DOC/7zFormat.txt +++ b/src/sdk/DOC/7zFormat.txt @@ -1,469 +1,469 @@ -7z Format description (18.06) ----------------------------- - -This file contains description of 7z archive format. -7z archive can contain files compressed with any method. -See "Methods.txt" for description for defined compressing methods. - - -Format structure Overview -------------------------- - -Some fields can be optional. - -Archive structure -~~~~~~~~~~~~~~~~~ -SignatureHeader -[PackedStreams] -[PackedStreamsForHeaders] -[ - Header - or - { - Packed Header - HeaderInfo - } -] - - - -Header structure -~~~~~~~~~~~~~~~~ -{ - ArchiveProperties - AdditionalStreams - { - PackInfo - { - PackPos - NumPackStreams - Sizes[NumPackStreams] - CRCs[NumPackStreams] - } - CodersInfo - { - NumFolders - Folders[NumFolders] - { - NumCoders - CodersInfo[NumCoders] - { - ID - NumInStreams; - NumOutStreams; - PropertiesSize - Properties[PropertiesSize] - } - NumBindPairs - BindPairsInfo[NumBindPairs] - { - InIndex; - OutIndex; - } - PackedIndices - } - UnPackSize[Folders][Folders.NumOutstreams] - CRCs[NumFolders] - } - SubStreamsInfo - { - NumUnPackStreamsInFolders[NumFolders]; - UnPackSizes[] - CRCs[] - } - } - MainStreamsInfo - { - (Same as in AdditionalStreams) - } - FilesInfo - { - NumFiles - Properties[] - { - ID - Size - Data - } - } -} - -HeaderInfo structure -~~~~~~~~~~~~~~~~~~~~ -{ - (Same as in AdditionalStreams) -} - - - -Notes about Notation and encoding ---------------------------------- - -7z uses little endian encoding. - -7z archive format has optional headers that are marked as -[] -Header -[] - -REAL_UINT64 means real UINT64. - -UINT64 means real UINT64 encoded with the following scheme: - - Size of encoding sequence depends from first byte: - First_Byte Extra_Bytes Value - (binary) - 0xxxxxxx : ( xxxxxxx ) - 10xxxxxx BYTE y[1] : ( xxxxxx << (8 * 1)) + y - 110xxxxx BYTE y[2] : ( xxxxx << (8 * 2)) + y - ... 
- 1111110x BYTE y[6] : ( x << (8 * 6)) + y - 11111110 BYTE y[7] : y - 11111111 BYTE y[8] : y - - - -Property IDs ------------- - -0x00 = kEnd - -0x01 = kHeader - -0x02 = kArchiveProperties - -0x03 = kAdditionalStreamsInfo -0x04 = kMainStreamsInfo -0x05 = kFilesInfo - -0x06 = kPackInfo -0x07 = kUnPackInfo -0x08 = kSubStreamsInfo - -0x09 = kSize -0x0A = kCRC - -0x0B = kFolder - -0x0C = kCodersUnPackSize -0x0D = kNumUnPackStream - -0x0E = kEmptyStream -0x0F = kEmptyFile -0x10 = kAnti - -0x11 = kName -0x12 = kCTime -0x13 = kATime -0x14 = kMTime -0x15 = kWinAttributes -0x16 = kComment - -0x17 = kEncodedHeader - -0x18 = kStartPos -0x19 = kDummy - - -7z format headers ------------------ - -SignatureHeader -~~~~~~~~~~~~~~~ - BYTE kSignature[6] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C}; - - ArchiveVersion - { - BYTE Major; // now = 0 - BYTE Minor; // now = 4 - }; - - UINT32 StartHeaderCRC; - - StartHeader - { - REAL_UINT64 NextHeaderOffset - REAL_UINT64 NextHeaderSize - UINT32 NextHeaderCRC - } - - -........................... - - -ArchiveProperties -~~~~~~~~~~~~~~~~~ -BYTE NID::kArchiveProperties (0x02) -for (;;) -{ - BYTE PropertyType; - if (aType == 0) - break; - UINT64 PropertySize; - BYTE PropertyData[PropertySize]; -} - - -Digests (NumStreams) -~~~~~~~~~~~~~~~~~~~~~ - BYTE AllAreDefined - if (AllAreDefined == 0) - { - for(NumStreams) - BIT Defined - } - UINT32 CRCs[NumDefined] - - -PackInfo -~~~~~~~~~~~~ - BYTE NID::kPackInfo (0x06) - UINT64 PackPos - UINT64 NumPackStreams - - [] - BYTE NID::kSize (0x09) - UINT64 PackSizes[NumPackStreams] - [] - - [] - BYTE NID::kCRC (0x0A) - PackStreamDigests[NumPackStreams] - [] - - BYTE NID::kEnd - - -Folder -~~~~~~ - UINT64 NumCoders; - for (NumCoders) - { - BYTE - { - 0:3 CodecIdSize - 4: Is Complex Coder - 5: There Are Attributes - 6: Reserved - 7: There are more alternative methods. (Not used anymore, must be 0). 
- } - BYTE CodecId[CodecIdSize] - if (Is Complex Coder) - { - UINT64 NumInStreams; - UINT64 NumOutStreams; - } - if (There Are Attributes) - { - UINT64 PropertiesSize - BYTE Properties[PropertiesSize] - } - } - - NumBindPairs = NumOutStreamsTotal - 1; - - for (NumBindPairs) - { - UINT64 InIndex; - UINT64 OutIndex; - } - - NumPackedStreams = NumInStreamsTotal - NumBindPairs; - if (NumPackedStreams > 1) - for(NumPackedStreams) - { - UINT64 Index; - }; - - - - -Coders Info -~~~~~~~~~~~ - - BYTE NID::kUnPackInfo (0x07) - - - BYTE NID::kFolder (0x0B) - UINT64 NumFolders - BYTE External - switch(External) - { - case 0: - Folders[NumFolders] - case 1: - UINT64 DataStreamIndex - } - - - BYTE ID::kCodersUnPackSize (0x0C) - for(Folders) - for(Folder.NumOutStreams) - UINT64 UnPackSize; - - - [] - BYTE NID::kCRC (0x0A) - UnPackDigests[NumFolders] - [] - - - - BYTE NID::kEnd - - - -SubStreams Info -~~~~~~~~~~~~~~ - BYTE NID::kSubStreamsInfo; (0x08) - - [] - BYTE NID::kNumUnPackStream; (0x0D) - UINT64 NumUnPackStreamsInFolders[NumFolders]; - [] - - - [] - BYTE NID::kSize (0x09) - UINT64 UnPackSizes[] - [] - - - [] - BYTE NID::kCRC (0x0A) - Digests[Number of streams with unknown CRC] - [] - - - BYTE NID::kEnd - - -Streams Info -~~~~~~~~~~~~ - - [] - PackInfo - [] - - - [] - CodersInfo - [] - - - [] - SubStreamsInfo - [] - - BYTE NID::kEnd - - -FilesInfo -~~~~~~~~~ - BYTE NID::kFilesInfo; (0x05) - UINT64 NumFiles - - for (;;) - { - BYTE PropertyType; - if (aType == 0) - break; - - UINT64 Size; - - switch(PropertyType) - { - kEmptyStream: (0x0E) - for(NumFiles) - BIT IsEmptyStream - - kEmptyFile: (0x0F) - for(EmptyStreams) - BIT IsEmptyFile - - kAnti: (0x10) - for(EmptyStreams) - BIT IsAntiFile - - case kCTime: (0x12) - case kATime: (0x13) - case kMTime: (0x14) - BYTE AllAreDefined - if (AllAreDefined == 0) - { - for(NumFiles) - BIT TimeDefined - } - BYTE External; - if(External != 0) - UINT64 DataIndex - [] - for(Definded Items) - REAL_UINT64 Time - [] - - kNames: (0x11) - BYTE External; - if(External != 0) - UINT64 DataIndex - [] - for(Files) - { - wchar_t Names[NameSize]; - wchar_t 0; - } - [] - - kAttributes: (0x15) - BYTE AllAreDefined - if (AllAreDefined == 0) - { - for(NumFiles) - BIT AttributesAreDefined - } - BYTE External; - if(External != 0) - UINT64 DataIndex - [] - for(Definded Attributes) - UINT32 Attributes - [] - } - } - - -Header -~~~~~~ - BYTE NID::kHeader (0x01) - - [] - ArchiveProperties - [] - - [] - BYTE NID::kAdditionalStreamsInfo; (0x03) - StreamsInfo - [] - - [] - BYTE NID::kMainStreamsInfo; (0x04) - StreamsInfo - [] - - [] - FilesInfo - [] - - BYTE NID::kEnd - - -HeaderInfo -~~~~~~~~~~ - [] - BYTE NID::kEncodedHeader; (0x17) - StreamsInfo for Encoded Header - [] - - ---- -End of document +7z Format description (18.06) +---------------------------- + +This file contains description of 7z archive format. +7z archive can contain files compressed with any method. +See "Methods.txt" for description for defined compressing methods. + + +Format structure Overview +------------------------- + +Some fields can be optional. 
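The "Notes about Notation and encoding" section of this description (it appears identically in the removed copy above and again below) defines the variable-length UINT64 number encoding: the run of leading 1-bits in the first byte tells how many extra little-endian bytes follow, and the remaining low bits of the first byte become the highest part of the value. A minimal reader for that encoding is sketched below; the CInBuf structure and the Read7zNumber/Read7zByte names are illustrative only and are not part of the SDK headers, while Byte and UInt64 are the usual 7zTypes.h typedefs.

    /* Illustrative decoder for the variable-length UINT64 described in this file. */
    typedef struct
    {
      const Byte *data;
      size_t size;
      size_t pos;
    } CInBuf;

    static int Read7zByte(CInBuf *b, Byte *out)
    {
      if (b->pos >= b->size)
        return 0;                        /* 0 means "ran out of input" */
      *out = b->data[b->pos++];
      return 1;
    }

    static int Read7zNumber(CInBuf *b, UInt64 *value)
    {
      Byte firstByte, mask = 0x80;
      unsigned i;
      if (!Read7zByte(b, &firstByte))
        return 0;
      *value = 0;
      for (i = 0; i < 8; i++)
      {
        Byte extra;
        if ((firstByte & mask) == 0)
        {
          /* the bits of the first byte below the stop bit form the high part */
          *value |= (UInt64)(firstByte & (Byte)(mask - 1)) << (8 * i);
          break;
        }
        if (!Read7zByte(b, &extra))
          return 0;
        *value |= (UInt64)extra << (8 * i);   /* extra bytes are little endian */
        mask >>= 1;
      }
      return 1;
    }

For example, the two bytes 0x81 0x23 decode to (0x01 << 8) + 0x23 = 0x123, matching the "10xxxxxx BYTE y[1]" row of the table.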
+ +Archive structure +~~~~~~~~~~~~~~~~~ +SignatureHeader +[PackedStreams] +[PackedStreamsForHeaders] +[ + Header + or + { + Packed Header + HeaderInfo + } +] + + + +Header structure +~~~~~~~~~~~~~~~~ +{ + ArchiveProperties + AdditionalStreams + { + PackInfo + { + PackPos + NumPackStreams + Sizes[NumPackStreams] + CRCs[NumPackStreams] + } + CodersInfo + { + NumFolders + Folders[NumFolders] + { + NumCoders + CodersInfo[NumCoders] + { + ID + NumInStreams; + NumOutStreams; + PropertiesSize + Properties[PropertiesSize] + } + NumBindPairs + BindPairsInfo[NumBindPairs] + { + InIndex; + OutIndex; + } + PackedIndices + } + UnPackSize[Folders][Folders.NumOutstreams] + CRCs[NumFolders] + } + SubStreamsInfo + { + NumUnPackStreamsInFolders[NumFolders]; + UnPackSizes[] + CRCs[] + } + } + MainStreamsInfo + { + (Same as in AdditionalStreams) + } + FilesInfo + { + NumFiles + Properties[] + { + ID + Size + Data + } + } +} + +HeaderInfo structure +~~~~~~~~~~~~~~~~~~~~ +{ + (Same as in AdditionalStreams) +} + + + +Notes about Notation and encoding +--------------------------------- + +7z uses little endian encoding. + +7z archive format has optional headers that are marked as +[] +Header +[] + +REAL_UINT64 means real UINT64. + +UINT64 means real UINT64 encoded with the following scheme: + + Size of encoding sequence depends from first byte: + First_Byte Extra_Bytes Value + (binary) + 0xxxxxxx : ( xxxxxxx ) + 10xxxxxx BYTE y[1] : ( xxxxxx << (8 * 1)) + y + 110xxxxx BYTE y[2] : ( xxxxx << (8 * 2)) + y + ... + 1111110x BYTE y[6] : ( x << (8 * 6)) + y + 11111110 BYTE y[7] : y + 11111111 BYTE y[8] : y + + + +Property IDs +------------ + +0x00 = kEnd + +0x01 = kHeader + +0x02 = kArchiveProperties + +0x03 = kAdditionalStreamsInfo +0x04 = kMainStreamsInfo +0x05 = kFilesInfo + +0x06 = kPackInfo +0x07 = kUnPackInfo +0x08 = kSubStreamsInfo + +0x09 = kSize +0x0A = kCRC + +0x0B = kFolder + +0x0C = kCodersUnPackSize +0x0D = kNumUnPackStream + +0x0E = kEmptyStream +0x0F = kEmptyFile +0x10 = kAnti + +0x11 = kName +0x12 = kCTime +0x13 = kATime +0x14 = kMTime +0x15 = kWinAttributes +0x16 = kComment + +0x17 = kEncodedHeader + +0x18 = kStartPos +0x19 = kDummy + + +7z format headers +----------------- + +SignatureHeader +~~~~~~~~~~~~~~~ + BYTE kSignature[6] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C}; + + ArchiveVersion + { + BYTE Major; // now = 0 + BYTE Minor; // now = 4 + }; + + UINT32 StartHeaderCRC; + + StartHeader + { + REAL_UINT64 NextHeaderOffset + REAL_UINT64 NextHeaderSize + UINT32 NextHeaderCRC + } + + +........................... + + +ArchiveProperties +~~~~~~~~~~~~~~~~~ +BYTE NID::kArchiveProperties (0x02) +for (;;) +{ + BYTE PropertyType; + if (aType == 0) + break; + UINT64 PropertySize; + BYTE PropertyData[PropertySize]; +} + + +Digests (NumStreams) +~~~~~~~~~~~~~~~~~~~~~ + BYTE AllAreDefined + if (AllAreDefined == 0) + { + for(NumStreams) + BIT Defined + } + UINT32 CRCs[NumDefined] + + +PackInfo +~~~~~~~~~~~~ + BYTE NID::kPackInfo (0x06) + UINT64 PackPos + UINT64 NumPackStreams + + [] + BYTE NID::kSize (0x09) + UINT64 PackSizes[NumPackStreams] + [] + + [] + BYTE NID::kCRC (0x0A) + PackStreamDigests[NumPackStreams] + [] + + BYTE NID::kEnd + + +Folder +~~~~~~ + UINT64 NumCoders; + for (NumCoders) + { + BYTE + { + 0:3 CodecIdSize + 4: Is Complex Coder + 5: There Are Attributes + 6: Reserved + 7: There are more alternative methods. (Not used anymore, must be 0). 
+ } + BYTE CodecId[CodecIdSize] + if (Is Complex Coder) + { + UINT64 NumInStreams; + UINT64 NumOutStreams; + } + if (There Are Attributes) + { + UINT64 PropertiesSize + BYTE Properties[PropertiesSize] + } + } + + NumBindPairs = NumOutStreamsTotal - 1; + + for (NumBindPairs) + { + UINT64 InIndex; + UINT64 OutIndex; + } + + NumPackedStreams = NumInStreamsTotal - NumBindPairs; + if (NumPackedStreams > 1) + for(NumPackedStreams) + { + UINT64 Index; + }; + + + + +Coders Info +~~~~~~~~~~~ + + BYTE NID::kUnPackInfo (0x07) + + + BYTE NID::kFolder (0x0B) + UINT64 NumFolders + BYTE External + switch(External) + { + case 0: + Folders[NumFolders] + case 1: + UINT64 DataStreamIndex + } + + + BYTE ID::kCodersUnPackSize (0x0C) + for(Folders) + for(Folder.NumOutStreams) + UINT64 UnPackSize; + + + [] + BYTE NID::kCRC (0x0A) + UnPackDigests[NumFolders] + [] + + + + BYTE NID::kEnd + + + +SubStreams Info +~~~~~~~~~~~~~~ + BYTE NID::kSubStreamsInfo; (0x08) + + [] + BYTE NID::kNumUnPackStream; (0x0D) + UINT64 NumUnPackStreamsInFolders[NumFolders]; + [] + + + [] + BYTE NID::kSize (0x09) + UINT64 UnPackSizes[] + [] + + + [] + BYTE NID::kCRC (0x0A) + Digests[Number of streams with unknown CRC] + [] + + + BYTE NID::kEnd + + +Streams Info +~~~~~~~~~~~~ + + [] + PackInfo + [] + + + [] + CodersInfo + [] + + + [] + SubStreamsInfo + [] + + BYTE NID::kEnd + + +FilesInfo +~~~~~~~~~ + BYTE NID::kFilesInfo; (0x05) + UINT64 NumFiles + + for (;;) + { + BYTE PropertyType; + if (aType == 0) + break; + + UINT64 Size; + + switch(PropertyType) + { + kEmptyStream: (0x0E) + for(NumFiles) + BIT IsEmptyStream + + kEmptyFile: (0x0F) + for(EmptyStreams) + BIT IsEmptyFile + + kAnti: (0x10) + for(EmptyStreams) + BIT IsAntiFile + + case kCTime: (0x12) + case kATime: (0x13) + case kMTime: (0x14) + BYTE AllAreDefined + if (AllAreDefined == 0) + { + for(NumFiles) + BIT TimeDefined + } + BYTE External; + if(External != 0) + UINT64 DataIndex + [] + for(Definded Items) + REAL_UINT64 Time + [] + + kNames: (0x11) + BYTE External; + if(External != 0) + UINT64 DataIndex + [] + for(Files) + { + wchar_t Names[NameSize]; + wchar_t 0; + } + [] + + kAttributes: (0x15) + BYTE AllAreDefined + if (AllAreDefined == 0) + { + for(NumFiles) + BIT AttributesAreDefined + } + BYTE External; + if(External != 0) + UINT64 DataIndex + [] + for(Definded Attributes) + UINT32 Attributes + [] + } + } + + +Header +~~~~~~ + BYTE NID::kHeader (0x01) + + [] + ArchiveProperties + [] + + [] + BYTE NID::kAdditionalStreamsInfo; (0x03) + StreamsInfo + [] + + [] + BYTE NID::kMainStreamsInfo; (0x04) + StreamsInfo + [] + + [] + FilesInfo + [] + + BYTE NID::kEnd + + +HeaderInfo +~~~~~~~~~~ + [] + BYTE NID::kEncodedHeader; (0x17) + StreamsInfo for Encoded Header + [] + + +--- +End of document diff --git a/src/sdk/DOC/Methods.txt b/src/sdk/DOC/Methods.txt index d4a1b1d..b840b55 100644 --- a/src/sdk/DOC/Methods.txt +++ b/src/sdk/DOC/Methods.txt @@ -1,173 +1,177 @@ -7-Zip method IDs for 7z and xz archives ---------------------------------------- - -Version: 18.06 -Date: 2018-06-30 - -Each compression or crypto method in 7z is associated with unique binary value (ID). -The length of ID in bytes is arbitrary but it can not exceed 63 bits (8 bytes). - -xz and 7z formats use same ID map. - -If you want to add some new ID, you have two ways: - 1) Write request for allocating IDs to 7-Zip developers. - 2) Generate 8-bytes ID: - - 3F ZZ ZZ ZZ ZZ ZZ MM MM - - 3F - Prefix for random IDs (1 byte) - ZZ ZZ ZZ ZZ ZZ - Developer ID (5 bytes). Use real random bytes. 
- - MM MM - Method ID (2 bytes) - - You can notify 7-Zip developers about your Developer ID / Method ID. - - Note: Use new ID, if old codec can not decode data encoded with new version. - - -List of defined IDs -------------------- - -00 - Copy - -03 - Delta -04 - BCJ (x86) -05 - PPC (big-endian) -06 - IA64 -07 - ARM (little-endian) -08 - ARMT (little-endian) -09 - SPARC - -21 - LZMA2 - -02.. - Common - 03 [Swap] - - 2 Swap2 - - 4 Swap4 - -03.. - 7z - 01 - - 01 - LZMA - - 03 - [Branch Codecs] - 01 - [x86 Codecs] - 03 - BCJ - 1B - BCJ2 (4 packed streams) - 02 - - 05 - PPC (big-endian) - 03 - - 01 - Alpha - 04 - - 01 - IA64 - 05 - - 01 - ARM (little-endian) - 06 - - 05 - M68 (big-endian) - 07 - - 01 - ARMT (little-endian) - 08 - - 05 - SPARC - - 04 - - 01 - PPMD - - 7F - - 01 - experimental method. - - -04.. - Misc codecs - - 00 - Reserved - - 01 - [Zip] - 00 - Copy (not used. Use {00} instead) - 01 - Shrink - 06 - Implode - 08 - Deflate - 09 - Deflate64 - 0A - Imploding - 0C - BZip2 (not used. Use {040202} instead) - 0E - LZMA (LZMA-zip) - 5F - xz - 60 - Jpeg - 61 - WavPack - 62 - PPMd (PPMd-zip) - 63 - wzAES - - 02 - - 02 - BZip2 - - 03 - [Rar] - 01 - Rar1 - 02 - Rar2 - 03 - Rar3 - 05 - Rar5 - - 04 - [Arj] - 01 - Arj(1,2,3) - 02 - Arj4 - - 05 - [Z] - - 06 - [Lzh] - - 07 - Reserved for 7z - - 08 - [Cab] - - 09 - [NSIS] - 01 - DeflateNSIS - 02 - BZip2NSIS - - F7 - External codecs (that are not included to 7-Zip) - - 0x xx - reserved - - 10 xx - reserved (LZHAM) - 01 - LZHAM - - 11 xx - reserved (Tino Reichardt) - 01 - ZSTD - 02 - BROTLI - 04 - LZ4 - 05 - LZ5 - 06 - LIZARD - - 12 xx - reserverd (Denis Anisimov) - - 01 - WavPack2 - FE - eSplitter - FF - RawSplitter - - -06.. - Crypto - - F0 - Ciphers without hashing algo - - 01 - [AES] - 0x - AES-128 - 4x - AES-192 - 8x - AES-256 - Cx - AES - - x0 - ECB - x1 - CBC - x2 - CFB - x3 - OFB - x4 - CTR - - F1 - Combine Ciphers - - 01 - [Zip] - 01 - ZipCrypto (Main Zip crypto algo) - - 03 - [RAR] - 02 - - 03 - Rar29AES (AES-128 + modified SHA-1) - - 07 - [7z] - 01 - 7zAES (AES-256 + SHA-256) - - ---- -End of document +7-Zip method IDs for 7z and xz archives +--------------------------------------- + +Version: 24.02 +Date: 2024-03-22 + +Each compression or crypto method in 7z is associated with unique binary value (ID). +The length of ID in bytes is arbitrary but it can not exceed 63 bits (8 bytes). + +xz and 7z formats use same ID map. + +If you want to add some new ID, you have two ways: + 1) Write request for allocating IDs to 7-Zip developers. + 2) Generate 8-bytes ID: + + 3F ZZ ZZ ZZ ZZ ZZ MM MM + + 3F - Prefix for random IDs (1 byte) + ZZ ZZ ZZ ZZ ZZ - Developer ID (5 bytes). Use real random bytes. + + MM MM - Method ID (2 bytes) + + You can notify 7-Zip developers about your Developer ID / Method ID. + + Note: Use new ID, if old codec can not decode data encoded with new version. + + +List of defined IDs +------------------- + +00 - Copy + +03 - Delta +04 - BCJ (x86) +05 - PPC (big-endian) +06 - IA64 +07 - ARM (little-endian) +08 - ARMT (little-endian) +09 - SPARC +0A - ARM64 +0B - RISCV + +21 - LZMA2 + +02.. - Common + 03 [Swap] + - 2 Swap2 + - 4 Swap4 + +03.. - 7z + 01 - + 01 - LZMA + + 03 - [Branch Codecs] + 01 - [x86 Codecs] + 03 - BCJ + 1B - BCJ2 (4 packed streams) + 02 - + 05 - PPC (big-endian) + 03 - + 01 - Alpha + 04 - + 01 - IA64 + 05 - + 01 - ARM (little-endian) + 06 - + 05 - M68 (big-endian) + 07 - + 01 - ARMT (little-endian) + 08 - + 05 - SPARC + + 04 - + 01 - PPMD + + 7F - + 01 - experimental method. + + +04.. 
- Misc codecs + + 00 - Reserved + + 01 - [Zip] + 00 - Copy (not used. Use {00} instead) + 01 - Shrink + 06 - Implode + 08 - Deflate + 09 - Deflate64 + 0A - Imploding + 0C - BZip2 (not used. Use {040202} instead) + 0E - LZMA (LZMA-zip) + + 5D - ZSTD + 5F - xz + 60 - Jpeg + 61 - WavPack + 62 - PPMd (PPMd-zip) + 63 - wzAES + + 02 - + 02 - BZip2 + + 03 - [Rar] + 01 - Rar1 + 02 - Rar2 + 03 - Rar3 + 05 - Rar5 + + 04 - [Arj] + 01 - Arj(1,2,3) + 02 - Arj4 + + 05 - [Z] + + 06 - [Lzh] + + 07 - Reserved for 7z + + 08 - [Cab] + + 09 - [NSIS] + 01 - DeflateNSIS + 02 - BZip2NSIS + + F7 - External codecs (that are not included to 7-Zip) + + 0x xx - reserved + + 10 xx - reserved (LZHAM) + 01 - LZHAM + + 11 xx - reserved (Tino Reichardt) + 01 - ZSTD + 02 - BROTLI + 04 - LZ4 + 05 - LZ5 + 06 - LIZARD + + 12 xx - reserverd (Denis Anisimov) + + 01 - WavPack2 + FE - eSplitter + FF - RawSplitter + + +06.. - Crypto + + F0 - Ciphers without hashing algo + + 01 - [AES] + 0x - AES-128 + 4x - AES-192 + 8x - AES-256 + Cx - AES + + x0 - ECB + x1 - CBC + x2 - CFB + x3 - OFB + x4 - CTR + + F1 - Combine Ciphers + + 01 - [Zip] + 01 - ZipCrypto (Main Zip crypto algo) + + 03 - [RAR] + 02 - + 03 - Rar29AES (AES-128 + modified SHA-1) + + 07 - [7z] + 01 - 7zAES (AES-256 + SHA-256) + + +--- +End of document diff --git a/src/sdk/DOC/installer.txt b/src/sdk/DOC/installer.txt index b99d21d..70ad7dc 100644 --- a/src/sdk/DOC/installer.txt +++ b/src/sdk/DOC/installer.txt @@ -1,166 +1,166 @@ -7-Zip for installers 9.38 -------------------------- - -7-Zip is a file archiver for Windows NT/2000/2003/2008/XP/Vista/7/8/10. - -7-Zip for installers is part of LZMA SDK. -LZMA SDK is written and placed in the public domain by Igor Pavlov. - -It's allowed to join 7-Zip SFX module with another software. -It's allowed to change resources of 7-Zip's SFX modules. - - -HOW to use ------------ - -7zr.exe is reduced version of 7za.exe of 7-Zip. -7zr.exe supports only format with these codecs: LZMA, LZMA2, BCJ, BCJ2, ARM, Copy. - -Example of compressing command for installation packages: - -7zr a archive.7z files - -7zSD.sfx is SFX module for installers. 7zSD.sfx uses msvcrt.dll. - -SFX modules for installers allow to create installation program. -Such module extracts archive to temp folder and then runs specified program and removes -temp files after program finishing. Self-extract archive for installers must be created -as joining 3 files: SFX_Module, Installer_Config, 7z_Archive. -Installer_Config is optional file. You can use the following command to create installer -self-extract archive: - -copy /b 7zSD.sfx + config.txt + archive.7z archive.exe - -The smallest installation package size can be achieved, if installation files was -uncompressed before including to 7z archive. - --y switch for installer module (at runtime) specifies quiet mode for extracting. - -Installer Config file format -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Config file contains commands for Installer. File begins from string -;!@Install@!UTF-8! and ends with ;!@InstallEnd@!. File must be written -in UTF-8 encoding. File contains string pairs: - -ID_String="Value" - -ID_String Description - -Title Title for messages -BeginPrompt Begin Prompt message -Progress Value can be "yes" or "no". Default value is "yes". -RunProgram Command for executing. Default value is "setup.exe". - Substring %%T will be replaced with path to temporary - folder, where files were extracted -Directory Directory prefix for "RunProgram". 
Default value is ".\\" -ExecuteFile Name of file for executing -ExecuteParameters Parameters for "ExecuteFile" - - -You can omit any string pair. - -There are two ways to run program: RunProgram and ExecuteFile. -Use RunProgram, if you want to run some program from .7z archive. -Use ExecuteFile, if you want to open some document from .7z archive or -if you want to execute some command from Windows. - -If you use RunProgram and if you specify empty directory prefix: Directory="", -the system searches for the executable file in the following sequence: - -1. The directory from which the application (installer) loaded. -2. The temporary folder, where files were extracted. -3. The Windows system directory. - - -Config file Examples -~~~~~~~~~~~~~~~~~~~~ - -;!@Install@!UTF-8! -Title="7-Zip 4.00" -BeginPrompt="Do you want to install the 7-Zip 4.00?" -RunProgram="setup.exe" -;!@InstallEnd@! - - - -;!@Install@!UTF-8! -Title="7-Zip 4.00" -BeginPrompt="Do you want to install the 7-Zip 4.00?" -ExecuteFile="7zip.msi" -;!@InstallEnd@! - - - -;!@Install@!UTF-8! -Title="7-Zip 4.01 Update" -BeginPrompt="Do you want to install the 7-Zip 4.01 Update?" -ExecuteFile="msiexec.exe" -ExecuteParameters="/i 7zip.msi REINSTALL=ALL REINSTALLMODE=vomus" -;!@InstallEnd@! - - - -Small SFX modules for installers --------------------------------- - -7zS2.sfx - small SFX module (GUI version) -7zS2con.sfx - small SFX module (Console version) - -Small SFX modules support this codecs: LZMA, LZMA2, BCJ, BCJ2, ARM, COPY - -Small SFX module is similar to common SFX module for installers. -The difference (what's new in small version): - - Smaller size (30 KB vs 100 KB) - - C source code instead of Ñ++ - - No installer Configuration file - - No extracting progress window - - It decompresses solid 7z blocks (it can be whole 7z archive) to RAM. - So user that calls SFX installer must have free RAM of size of largest - solid 7z block (size of 7z archive at simplest case). - -How to use ----------- - -copy /b 7zS2.sfx + archive.7z sfx.exe - -When you run installer sfx module (sfx.exe) -1) It creates "7zNNNNNNNN" temp folder in system temp folder. -2) It extracts .7z archive to that folder -3) It executes one file from "7zNNNNNNNN" temp folder. -4) It removes "7zNNNNNNNN" temp folder - -You can send parameters to installer, and installer will transfer them to extracted .exe file. - -Small SFX uses 3 levels of priorities to select file to execute: - - 1) Files in root folder have higher priority than files in subfolders. - 2) File extension priorities (from high to low priority order): - bat, cmd, exe, inf, msi, cab (under Windows CE), html, htm - 3) File name priorities (from high to low priority order): - setup, install, run, start - -Windows CE (ARM) version of 7zS2.sfx is included to 7-Zip for Windows Mobile package. - - -Examples --------- - -1) To create compressed console 7-Zip: - -7zr a c.7z 7z.exe 7z.dll -mx -copy /b 7zS2con.sfx + c.7z 7zCompr.exe -7zCompr.exe b -md22 - - -2) To create compressed GUI 7-Zip: - -7zr a g.7z 7zg.exe 7z.dll -mx -copy /b 7zS2.sfx + g.7z 7zgCompr.exe -7zgCompr.exe b -md22 - - -3) To open some file: - -7zr a h.7z readme.txt -mx -copy /b 7zS2.sfx + h.7z 7zTxt.exe -7zTxt.exe +7-Zip for installers 9.38 +------------------------- + +7-Zip is a file archiver for Windows NT/2000/2003/2008/XP/Vista/7/8/10. + +7-Zip for installers is part of LZMA SDK. +LZMA SDK is written and placed in the public domain by Igor Pavlov. + +It's allowed to join 7-Zip SFX module with another software. 
+It's allowed to change resources of 7-Zip's SFX modules. + + +HOW to use +----------- + +7zr.exe is reduced version of 7za.exe of 7-Zip. +7zr.exe supports only format with these codecs: LZMA, LZMA2, BCJ, BCJ2, ARM, Copy. + +Example of compressing command for installation packages: + +7zr a archive.7z files + +7zSD.sfx is SFX module for installers. 7zSD.sfx uses msvcrt.dll. + +SFX modules for installers allow to create installation program. +Such module extracts archive to temp folder and then runs specified program and removes +temp files after program finishing. Self-extract archive for installers must be created +as joining 3 files: SFX_Module, Installer_Config, 7z_Archive. +Installer_Config is optional file. You can use the following command to create installer +self-extract archive: + +copy /b 7zSD.sfx + config.txt + archive.7z archive.exe + +The smallest installation package size can be achieved, if installation files was +uncompressed before including to 7z archive. + +-y switch for installer module (at runtime) specifies quiet mode for extracting. + +Installer Config file format +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Config file contains commands for Installer. File begins from string +;!@Install@!UTF-8! and ends with ;!@InstallEnd@!. File must be written +in UTF-8 encoding. File contains string pairs: + +ID_String="Value" + +ID_String Description + +Title Title for messages +BeginPrompt Begin Prompt message +Progress Value can be "yes" or "no". Default value is "yes". +RunProgram Command for executing. Default value is "setup.exe". + Substring %%T will be replaced with path to temporary + folder, where files were extracted +Directory Directory prefix for "RunProgram". Default value is ".\\" +ExecuteFile Name of file for executing +ExecuteParameters Parameters for "ExecuteFile" + + +You can omit any string pair. + +There are two ways to run program: RunProgram and ExecuteFile. +Use RunProgram, if you want to run some program from .7z archive. +Use ExecuteFile, if you want to open some document from .7z archive or +if you want to execute some command from Windows. + +If you use RunProgram and if you specify empty directory prefix: Directory="", +the system searches for the executable file in the following sequence: + +1. The directory from which the application (installer) loaded. +2. The temporary folder, where files were extracted. +3. The Windows system directory. + + +Config file Examples +~~~~~~~~~~~~~~~~~~~~ + +;!@Install@!UTF-8! +Title="7-Zip 4.00" +BeginPrompt="Do you want to install the 7-Zip 4.00?" +RunProgram="setup.exe" +;!@InstallEnd@! + + + +;!@Install@!UTF-8! +Title="7-Zip 4.00" +BeginPrompt="Do you want to install the 7-Zip 4.00?" +ExecuteFile="7zip.msi" +;!@InstallEnd@! + + + +;!@Install@!UTF-8! +Title="7-Zip 4.01 Update" +BeginPrompt="Do you want to install the 7-Zip 4.01 Update?" +ExecuteFile="msiexec.exe" +ExecuteParameters="/i 7zip.msi REINSTALL=ALL REINSTALLMODE=vomus" +;!@InstallEnd@! + + + +Small SFX modules for installers +-------------------------------- + +7zS2.sfx - small SFX module (GUI version) +7zS2con.sfx - small SFX module (Console version) + +Small SFX modules support this codecs: LZMA, LZMA2, BCJ, BCJ2, ARM, COPY + +Small SFX module is similar to common SFX module for installers. +The difference (what's new in small version): + - Smaller size (30 KB vs 100 KB) + - C source code instead of Ñ++ + - No installer Configuration file + - No extracting progress window + - It decompresses solid 7z blocks (it can be whole 7z archive) to RAM. 
+ So user that calls SFX installer must have free RAM of size of largest + solid 7z block (size of 7z archive at simplest case). + +How to use +---------- + +copy /b 7zS2.sfx + archive.7z sfx.exe + +When you run installer sfx module (sfx.exe) +1) It creates "7zNNNNNNNN" temp folder in system temp folder. +2) It extracts .7z archive to that folder +3) It executes one file from "7zNNNNNNNN" temp folder. +4) It removes "7zNNNNNNNN" temp folder + +You can send parameters to installer, and installer will transfer them to extracted .exe file. + +Small SFX uses 3 levels of priorities to select file to execute: + + 1) Files in root folder have higher priority than files in subfolders. + 2) File extension priorities (from high to low priority order): + bat, cmd, exe, inf, msi, cab (under Windows CE), html, htm + 3) File name priorities (from high to low priority order): + setup, install, run, start + +Windows CE (ARM) version of 7zS2.sfx is included to 7-Zip for Windows Mobile package. + + +Examples +-------- + +1) To create compressed console 7-Zip: + +7zr a c.7z 7z.exe 7z.dll -mx +copy /b 7zS2con.sfx + c.7z 7zCompr.exe +7zCompr.exe b -md22 + + +2) To create compressed GUI 7-Zip: + +7zr a g.7z 7zg.exe 7z.dll -mx +copy /b 7zS2.sfx + g.7z 7zgCompr.exe +7zgCompr.exe b -md22 + + +3) To open some file: + +7zr a h.7z readme.txt -mx +copy /b 7zS2.sfx + h.7z 7zTxt.exe +7zTxt.exe diff --git a/src/sdk/DOC/lzma-history.txt b/src/sdk/DOC/lzma-history.txt index 48ee748..96da8bf 100644 --- a/src/sdk/DOC/lzma-history.txt +++ b/src/sdk/DOC/lzma-history.txt @@ -1,446 +1,651 @@ -HISTORY of the LZMA SDK ------------------------ - -19.00 2019-02-21 -------------------------- -- Encryption strength for 7z archives was increased: - the size of random initialization vector was increased from 64-bit to 128-bit, - and the pseudo-random number generator was improved. -- The bug in 7zIn.c code was fixed. - - -18.06 2018-12-30 -------------------------- -- The speed for LZMA/LZMA2 compressing was increased by 3-10%, - and there are minor changes in compression ratio. -- Some bugs were fixed. -- The bug in 7-Zip 18.02-18.05 was fixed: - There was memory leak in multithreading xz decoder - XzDecMt_Decode(), - if xz stream contains only one block. -- The changes for MSVS compiler makefiles: - - the makefiles now use "PLATFORM" macroname with values (x64, x86, arm64) - instead of "CPU" macroname with values (AMD64, ARM64). - - the makefiles by default now use static version of the run-time library. - - -18.05 2018-04-30 -------------------------- -- The speed for LZMA/LZMA2 compressing was increased - by 8% for fastest/fast compression levels and - by 3% for normal/maximum compression levels. -- Previous versions of 7-Zip could work incorrectly in "Large memory pages" mode in - Windows 10 because of some BUG with "Large Pages" in Windows 10. - Now 7-Zip doesn't use "Large Pages" on Windows 10 up to revision 1709 (16299). -- The BUG was fixed in Lzma2Enc.c - Lzma2Enc_Encode2() function worked incorretly, - if (inStream == NULL) and the number of block threads is more than 1. - - -18.03 beta 2018-03-04 -------------------------- -- Asm\x86\LzmaDecOpt.asm: new optimized LZMA decoder written in asm - for x64 with about 30% higher speed than main version of LZMA decoder written in C. -- The speed for single-thread LZMA/LZMA2 decoder written in C was increased by 3%. -- 7-Zip now can use multi-threading for 7z/LZMA2 decoding, - if there are multiple independent data chunks in LZMA2 stream. 
-- 7-Zip now can use multi-threading for xz decoding, - if there are multiple blocks in xz stream. - - -18.01 2019-01-28 -------------------------- -- The BUG in 17.01 - 18.00 beta was fixed: - XzDec.c : random block unpacking and XzUnpacker_IsBlockFinished() - didn't work correctly for xz archives without checksum (CRC). - - -18.00 beta 2019-01-10 -------------------------- -- The BUG in xz encoder was fixed: - There was memory leak of 16 KB for each file compressed with - xz compression method, if additional filter was used. - - -17.01 beta 2017-08-28 -------------------------- -- Minor speed optimization for LZMA2 (xz and 7z) multi-threading compression. - 7-Zip now uses additional memory buffers for multi-block LZMA2 compression. - CPU utilization was slightly improved. -- 7-zip now creates multi-block xz archives by default. Block size can be - specified with -ms[Size]{m|g} switch. -- xz decoder now can unpack random block from multi-block xz archives. -- 7-Zip command line: @listfile now doesn't work after -- switch. - Use -i@listfile before -- switch instead. -- The BUGs were fixed: - 7-Zip 17.00 beta crashed for commands that write anti-item to 7z archive. - - -17.00 beta 2017-04-29 -------------------------- -- NewHandler.h / NewHandler.cpp: - now it redefines operator new() only for old MSVC compilers (_MSC_VER < 1900). -- C/7zTypes.h : the names of variables in interface structures were changed (vt). -- Some bugs were fixed. 7-Zip could crash in some cases. -- Some internal changes in code. - - -16.04 2016-10-04 -------------------------- -- The bug was fixed in DllSecur.c. - - -16.03 2016-09-28 -------------------------- -- SFX modules now use some protection against DLL preloading attack. -- Some bugs in 7z code were fixed. - - -16.02 2016-05-21 -------------------------- -- The BUG in 16.00 - 16.01 was fixed: - Split Handler (SplitHandler.cpp) returned incorrect - total size value (kpidSize) for split archives. - - -16.01 2016-05-19 -------------------------- -- Some internal changes to reduce the number of compiler warnings. - - -16.00 2016-05-10 -------------------------- -- Some bugs were fixed. - - -15.12 2015-11-19 -------------------------- -- The BUG in C version of 7z decoder was fixed: - 7zDec.c : SzDecodeLzma2() - 7z decoder could mistakenly report about decoding error for some 7z archives - that use LZMA2 compression method. - The probability to get that mistaken decoding error report was about - one error per 16384 solid blocks for solid blocks larger than 16 KB (compressed size). -- The BUG (in 9.26-15.11) in C version of 7z decoder was fixed: - 7zArcIn.c : SzReadHeader2() - 7z decoder worked incorrectly for 7z archives that contain - empty solid blocks, that can be placed to 7z archive, if some file is - unavailable for reading during archive creation. - - -15.09 beta 2015-10-16 -------------------------- -- The BUG in LZMA / LZMA2 encoding code was fixed. - The BUG in LzFind.c::MatchFinder_ReadBlock() function. - If input data size is larger than (4 GiB - dictionary_size), - the following code worked incorrectly: - - LZMA : LzmaEnc_MemEncode(), LzmaEncode() : LZMA encoding functions - for compressing from memory to memory. - That BUG is not related to LZMA encoder version that works via streams. - - LZMA2 : multi-threaded version of LZMA2 encoder worked incorrectly, if - default value of chunk size (CLzma2EncProps::blockSize) is changed - to value larger than (4 GiB - dictionary_size). 
- - -9.38 beta 2015-01-03 -------------------------- -- The BUG in 9.31-9.37 was fixed: - IArchiveGetRawProps interface was disabled for 7z archives. -- The BUG in 9.26-9.36 was fixed: - Some code in CPP\7zip\Archive\7z\ worked correctly only under Windows. - - -9.36 beta 2014-12-26 -------------------------- -- The BUG in command line version was fixed: - 7-Zip created temporary archive in current folder during update archive - operation, if -w{Path} switch was not specified. - The fixed 7-Zip creates temporary archive in folder that contains updated archive. -- The BUG in 9.33-9.35 was fixed: - 7-Zip silently ignored file reading errors during 7z or gz archive creation, - and the created archive contained only part of file that was read before error. - The fixed 7-Zip stops archive creation and it reports about error. - - -9.35 beta 2014-12-07 -------------------------- -- 7zr.exe now support AES encryption. -- SFX mudules were added to LZMA SDK -- Some bugs were fixed. - - -9.21 beta 2011-04-11 -------------------------- -- New class FString for file names at file systems. -- Speed optimization in CRC code for big-endian CPUs. -- The BUG in Lzma2Dec.c was fixed: - Lzma2Decode function didn't work. - - -9.18 beta 2010-11-02 -------------------------- -- New small SFX module for installers (SfxSetup). - - -9.12 beta 2010-03-24 -------------------------- -- The BUG in LZMA SDK 9.* was fixed: LZMA2 codec didn't work, - if more than 10 threads were used (or more than 20 threads in some modes). - - -9.11 beta 2010-03-15 -------------------------- -- PPMd compression method support - - -9.09 2009-12-12 -------------------------- -- The bug was fixed: - Utf16_To_Utf8 funstions in UTFConvert.cpp and 7zMain.c - incorrectly converted surrogate characters (the code >= 0x10000) to UTF-8. -- Some bugs were fixed - - -9.06 2009-08-17 -------------------------- -- Some changes in ANSI-C 7z Decoder interfaces. - - -9.04 2009-05-30 -------------------------- -- LZMA2 compression method support -- xz format support - - -4.65 2009-02-03 -------------------------- -- Some minor fixes - - -4.63 2008-12-31 -------------------------- -- Some minor fixes - - -4.61 beta 2008-11-23 -------------------------- -- The bug in ANSI-C LZMA Decoder was fixed: - If encoded stream was corrupted, decoder could access memory - outside of allocated range. -- Some changes in ANSI-C 7z Decoder interfaces. -- LZMA SDK is placed in the public domain. - - -4.60 beta 2008-08-19 -------------------------- -- Some minor fixes. - - -4.59 beta 2008-08-13 -------------------------- -- The bug was fixed: - LZMA Encoder in fast compression mode could access memory outside of - allocated range in some rare cases. - - -4.58 beta 2008-05-05 -------------------------- -- ANSI-C LZMA Decoder was rewritten for speed optimizations. -- ANSI-C LZMA Encoder was included to LZMA SDK. -- C++ LZMA code now is just wrapper over ANSI-C code. - - -4.57 2007-12-12 -------------------------- -- Speed optimizations in Ñ++ LZMA Decoder. -- Small changes for more compatibility with some C/C++ compilers. - - -4.49 beta 2007-07-05 -------------------------- -- .7z ANSI-C Decoder: - - now it supports BCJ and BCJ2 filters - - now it supports files larger than 4 GB. - - now it supports "Last Write Time" field for files. -- C++ code for .7z archives compressing/decompressing from 7-zip - was included to LZMA SDK. - - -4.43 2006-06-04 -------------------------- -- Small changes for more compatibility with some C/C++ compilers. 
- - -4.42 2006-05-15 -------------------------- -- Small changes in .h files in ANSI-C version. - - -4.39 beta 2006-04-14 -------------------------- -- The bug in versions 4.33b:4.38b was fixed: - C++ version of LZMA encoder could not correctly compress - files larger than 2 GB with HC4 match finder (-mfhc4). - - -4.37 beta 2005-04-06 -------------------------- -- Fixes in C++ code: code could no be compiled if _NO_EXCEPTIONS was defined. - - -4.35 beta 2005-03-02 -------------------------- -- The bug was fixed in C++ version of LZMA Decoder: - If encoded stream was corrupted, decoder could access memory - outside of allocated range. - - -4.34 beta 2006-02-27 -------------------------- -- Compressing speed and memory requirements for compressing were increased -- LZMA now can use only these match finders: HC4, BT2, BT3, BT4 - - -4.32 2005-12-09 -------------------------- -- Java version of LZMA SDK was included - - -4.30 2005-11-20 -------------------------- -- Compression ratio was improved in -a2 mode -- Speed optimizations for compressing in -a2 mode -- -fb switch now supports values up to 273 -- The bug in 7z_C (7zIn.c) was fixed: - It used Alloc/Free functions from different memory pools. - So if program used two memory pools, it worked incorrectly. -- 7z_C: .7z format supporting was improved -- LZMA# SDK (C#.NET version) was included - - -4.27 (Updated) 2005-09-21 -------------------------- -- Some GUIDs/interfaces in C++ were changed. - IStream.h: - ISequentialInStream::Read now works as old ReadPart - ISequentialOutStream::Write now works as old WritePart - - -4.27 2005-08-07 -------------------------- -- The bug in LzmaDecodeSize.c was fixed: - if _LZMA_IN_CB and _LZMA_OUT_READ were defined, - decompressing worked incorrectly. - - -4.26 2005-08-05 -------------------------- -- Fixes in 7z_C code and LzmaTest.c: - previous versions could work incorrectly, - if malloc(0) returns 0 - - -4.23 2005-06-29 -------------------------- -- Small fixes in C++ code - - -4.22 2005-06-10 -------------------------- -- Small fixes - - -4.21 2005-06-08 -------------------------- -- Interfaces for ANSI-C LZMA Decoder (LzmaDecode.c) were changed -- New additional version of ANSI-C LZMA Decoder with zlib-like interface: - - LzmaStateDecode.h - - LzmaStateDecode.c - - LzmaStateTest.c -- ANSI-C LZMA Decoder now can decompress files larger than 4 GB - - -4.17 2005-04-18 -------------------------- -- New example for RAM->RAM compressing/decompressing: - LZMA + BCJ (filter for x86 code): - - LzmaRam.h - - LzmaRam.cpp - - LzmaRamDecode.h - - LzmaRamDecode.c - - -f86 switch for lzma.exe - - -4.16 2005-03-29 -------------------------- -- The bug was fixed in LzmaDecode.c (ANSI-C LZMA Decoder): - If _LZMA_OUT_READ was defined, and if encoded stream was corrupted, - decoder could access memory outside of allocated range. -- Speed optimization of ANSI-C LZMA Decoder (now it's about 20% faster). - Old version of LZMA Decoder now is in file LzmaDecodeSize.c. - LzmaDecodeSize.c can provide slightly smaller code than LzmaDecode.c -- Small speed optimization in LZMA C++ code -- filter for SPARC's code was added -- Simplified version of .7z ANSI-C Decoder was included - - -4.06 2004-09-05 -------------------------- -- The bug in v4.05 was fixed: - LZMA-Encoder didn't release output stream in some cases. 
- - -4.05 2004-08-25 -------------------------- -- Source code of filters for x86, IA-64, ARM, ARM-Thumb - and PowerPC code was included to SDK -- Some internal minor changes - - -4.04 2004-07-28 -------------------------- -- More compatibility with some C++ compilers - - -4.03 2004-06-18 -------------------------- -- "Benchmark" command was added. It measures compressing - and decompressing speed and shows rating values. - Also it checks hardware errors. - - -4.02 2004-06-10 -------------------------- -- C++ LZMA Encoder/Decoder code now is more portable - and it can be compiled by GCC on Linux. - - -4.01 2004-02-15 -------------------------- -- Some detection of data corruption was enabled. - LzmaDecode.c / RangeDecoderReadByte - ..... - { - rd->ExtraBytes = 1; - return 0xFF; - } - - -4.00 2004-02-13 -------------------------- -- Original version of LZMA SDK - - - -HISTORY of the LZMA -------------------- - 2001-2008: Improvements to LZMA compressing/decompressing code, - keeping compatibility with original LZMA format - 1996-2001: Development of LZMA compression format - - Some milestones: - - 2001-08-30: LZMA compression was added to 7-Zip - 1999-01-02: First version of 7-Zip was released - - -End of document +HISTORY of the LZMA SDK +----------------------- + +25.01 2025-08-03 +------------------------- +- The code for handling symbolic links has been changed + to provide greater security when extracting files from archives. + Command line switch -snld20 can be used to bypass default security + checks when creating symbolic links. + + +25.00 2025-07-05 +------------------------- +- 7-Zip for Windows can now use more than 64 CPU threads for compression + to zip/7z/xz archives and for the 7-Zip benchmark. + If there are more than one processor group in Windows (on systems with more than + 64 cpu threads), 7-Zip distributes running CPU threads across different processor groups. +- fixed some bugs and vulnerabilities. + + +24.09 2024-11-29 +------------------------- +- The default dictionary size values for LZMA/LZMA2 compression methods were increased: + dictionary size compression level + v24.08 v24.09 v24.09 + 32-bit 64-bit + 8 MB 16 MB 16 MB -mx4 + 16 MB 32 MB 32 MB -mx5 : Normal + 32 MB 64 MB 64 MB -mx6 + 32 MB 64 MB 128 MB -mx7 : Maximum + 64 MB 64 MB 256 MB -mx8 + 64 MB 64 MB 256 MB -mx9 : Ultra + The default dictionary size values for 32-bit versions of LZMA/LZMA2 don't exceed 64 MB. +- If an archive update operation uses a temporary archive folder and + the archive is moved to the destination folder, 7-Zip shows the progress of moving + the archive file, as this operation can take a long time if the archive is large. +- Some bugs were fixed. + + +24.07 2024-06-19 +------------------------- +- Changes in files: + Asm/x86/Sha256Opt.asm + Now it uses "READONLY" flag for constant array segment. + It fixes an issue where ".rodata" section in 7-Zip for x86/x64 Linux had a "WRITE" attribute. + + +24.05 2024-05-14 +------------------------- +- New switch -myv={MMNN} to set decoder compatibility version for 7z archive creating. + {MMNN} is 4-digit number that represents the version of 7-Zip without a dot. + If -myv={MMNN} switch is specified, 7-Zip will only use compression methods that can + be decoded by the specified version {MMNN} of 7-Zip and newer versions. + If -myv={MMNN} switch is not specified, -myv=2300 is used, and 7-Zip will only + use compression methods that can be decoded by 7-Zip 23.00 and newer versions. 
+- New switch -myfa={FilterID} to allow 7-Zip to use the specified filter method for 7z archive creating. +- New switch -myfd={FilterID} to disallow 7-Zip to use the specified filter method for 7z archive creating. + + +24.03 2024-03-23 +------------------------- +- 7-Zip now can use new RISCV filter for compression to 7z and xz archives. + RISCV filter can increase compression ratio for data containing executable + files compiled for RISC-V architecture. +- The speed for LZMA and LZMA2 decompression in ARM64 version for Windows + was increased by 20%-60%. + It uses arm64 assembler code, and clang-cl is required for arm64 assembler code compiling. +- -slmu switch : to show timestamps as UTC instead of LOCAL TIME. +- -slsl switch : in console 7-Zip for Windows : to show file paths with + linux path separator slash '/' instead of backslash separator '\'. +- 7-Zip supports .sha256 files that use backslash path separator '\'. +- Some bugs were fixed. + + +24.01 2024-01-31 +------------------------- +- 7-Zip uses file C/Precomp.h that is included to all c and c++ files. + CPP/Common/Common.h also includes C/Precomp.h. + C/Precomp.h defines the following macros (if _WIN32 is defined): + Z7_LARGE_PAGES 1 + Z7_LONG_PATH 1 + Z7_WIN32_WINNT_MIN 0x0500 (or higher) + _WIN32_WINNT 0x0500 (or higher) + WINVER _WIN32_WINNT + UNICODE 1 + _UNICODE 1 + if _WIN32_WINNT is defined already, C/Precomp.h doesn't redefine it. + +- Speed optimizations for hash caclulation: CRC-32, CRC-64. +- The bug was fixed: 7-Zip for Linux could fail for multivolume creation in some cases. +- 7zr.exe for arm64 is included to LZMA SDK package. +- Some bugs were fixed. + + +23.01 2023-06-20 +------------------------- +- 7-Zip now can use new ARM64 filter for compression to 7z and xz archives. + ARM64 filter can increase compression ratio for data containing executable + files compiled for ARM64 (AArch64) architecture. + Also 7-Zip now parses executable files (that have exe and dll filename extensions) + before compressing, and it selects appropriate filter for each parsed file: + - BCJ or BCJ2 filter for x86 executable files, + - ARM64 filter for ARM64 executable files. + Previous versions by default used x86 filter BCJ or BCJ2 for all exe/dll files. +- Default section size for BCJ2 filter was changed from 64 MiB to 240 MiB. + It can increase compression ratio for executable files larger than 64 MiB. +- Some optimizations in filters code: BCJ, BCJ2, Swap* and opthers. +- If 7-Zip uses BCJ2 filter for big datasets compressing, it can use additional temp + files in system's TEMP folder. 7-Zip uses temp file for additional compressed + data stream, if size of such compressed stream is larger than predefined limit: + 16 MiB in 32-bit version, 4 GiB in 64-bit version. +- When new 7-Zip creates multivolume archive, 7-Zip keeps in open state + only volumes that still can be changed. Previous versions kept all volumes + in open state until the end of the archive creation. +- 7-Zip for Linux and macOS now can reduce the number of simultaneously open files, + when 7-Zip opens, extracts or creates multivolume archive. It allows to avoid + the failures for cases with big number of volumes, bacause there is a limitation + for number of open files allowed for a single program in Linux and macOS. +- Some bugs were fixed. +- Source code changes: +- All external macros for compiling C/C++ code of 7-Zip now have Z7_ prefix. +- 7-Zip COM interfaces now use new macros that allow to declare and implement COM interface. 
+- The code has been modified to compile with the maximum diagnostic warning level: + -Wall in MSVC and -Weverything in CLANG. + And some warning types are disabled in 2 files: + - C/Compiler.h for C/C++ code warnings. + - CPP/Common/Common.h for C++ code warnings. +- Linux/macOS versions of 7-Zip: IUnknown interface in new code doesn't use + virtual destructor that was used in previous 7-Zip and p7zip: + // virtual ~IUnknown() {} + So 7-Zip's dynamically linked shared libraries (codecs) are not compatible + between new 7-Zip for Linux/macOS and old 7-Zip (and p7zip). + + +21.07 2021-12-26 +------------------------- +- New switches: -spm and -im!{file_path} to exclude directories from processing + for specified paths that don't contain path separator character at the end of path. +- The sorting order of files in archives was slightly changed to be more consistent + for cases where the name of some directory is the same as the prefix part of the name + of another directory or file. + + +21.06 2021-11-24 +------------------------- +- Bug in LZMA encoder in file LzmaEnc.c was fixed: + LzmaEnc_MemEncode(), LzmaEncode() and LzmaCompress() could work incorrectly, + if size value for output buffer is smaller than size required for all compressed data. + LzmaEnc_Encode() could work incorrectly, + if callback ISeqOutStream::Write() doesn't write all compressed data. + NCompress::NLzma::CEncoder::Code() could work incorrectly, + if callback ISequentialOutStream::Write() returns error code. +- Bug in versions 21.00-21.05 was fixed: + 7-Zip didn't set attributes of directories during archive extracting. + + +21.04 beta 2021-11-02 +------------------------- +- 7-Zip now reduces the number of working CPU threads for compression, + if RAM size is not enough for compression with big LZMA2 dictionary. +- 7-Zip now can create and check "file.sha256" text files that contain the list + of file names and SHA-256 checksums in format compatible with sha256sum program. + + +21.03 beta 2021-07-20 +------------------------- +- The maximum dictionary size for LZMA/LZMA2 compressing was increased to 4 GB (3840 MiB). +- Minor speed optimizations in LZMA/LZMA2 compressing. + + +21.02 alpha 2021-05-06 +------------------------- +- The command line version of 7-Zip for macOS was released. +- The speed for LZMA and LZMA2 decompression in arm64 versions for macOS and Linux + was increased by 20%-60%. + + +21.01 alpha 2021-03-09 +------------------------- +- The command line version of 7-Zip for Linux was released. +- The improvements for speed of ARM64 version using hardware CPU instructions + for AES, CRC-32, SHA-1 and SHA-256. +- Some bugs were fixed. + + +20.02 alpha 2020-08-08 +------------------------- +- The default number of LZMA2 chunks per solid block in 7z archive was increased to 64. + It allows to increase the compression speed for big 7z archives, if there is a big number + of CPU cores and threads. +- The speed of PPMd compressing/decompressing was increased for 7z archives. +- The new -ssp switch. If the switch -ssp is specified, 7-Zip doesn't allow the system + to modify "Last Access Time" property of source files for archiving and hashing operations. +- Some bugs were fixed. + + +20.00 alpha 2020-02-06 +------------------------- +- 7-Zip now supports new optional match finders for LZMA/LZMA2 compression: bt5 and hc5, + that can work faster than bt4 and hc4 match finders for the data with big redundancy. 
+- The compression ratio was improved for Fast and Fastest compression levels with the + following default settings: + - Fastest level (-mx1) : hc5 match finder with 256 KB dictionary. + - Fast level (-mx3) : hc5 match finder with 4 MB dictionary. +- Minor speed optimizations in multithreaded LZMA/LZMA2 compression for Normal/Maximum/Ultra + compression levels. + + +19.00 2019-02-21 +------------------------- +- Encryption strength for 7z archives was increased: + the size of random initialization vector was increased from 64-bit to 128-bit, + and the pseudo-random number generator was improved. +- The bug in 7zIn.c code was fixed. + + +18.06 2018-12-30 +------------------------- +- The speed for LZMA/LZMA2 compressing was increased by 3-10%, + and there are minor changes in compression ratio. +- Some bugs were fixed. +- The bug in 7-Zip 18.02-18.05 was fixed: + There was memory leak in multithreading xz decoder - XzDecMt_Decode(), + if xz stream contains only one block. +- The changes for MSVS compiler makefiles: + - the makefiles now use "PLATFORM" macroname with values (x64, x86, arm64) + instead of "CPU" macroname with values (AMD64, ARM64). + - the makefiles by default now use static version of the run-time library. + + +18.05 2018-04-30 +------------------------- +- The speed for LZMA/LZMA2 compressing was increased + by 8% for fastest/fast compression levels and + by 3% for normal/maximum compression levels. +- Previous versions of 7-Zip could work incorrectly in "Large memory pages" mode in + Windows 10 because of some BUG with "Large Pages" in Windows 10. + Now 7-Zip doesn't use "Large Pages" on Windows 10 up to revision 1709 (16299). +- The BUG was fixed in Lzma2Enc.c + Lzma2Enc_Encode2() function worked incorretly, + if (inStream == NULL) and the number of block threads is more than 1. + + +18.03 beta 2018-03-04 +------------------------- +- Asm\x86\LzmaDecOpt.asm: new optimized LZMA decoder written in asm + for x64 with about 30% higher speed than main version of LZMA decoder written in C. +- The speed for single-thread LZMA/LZMA2 decoder written in C was increased by 3%. +- 7-Zip now can use multi-threading for 7z/LZMA2 decoding, + if there are multiple independent data chunks in LZMA2 stream. +- 7-Zip now can use multi-threading for xz decoding, + if there are multiple blocks in xz stream. + + +18.01 2019-01-28 +------------------------- +- The BUG in 17.01 - 18.00 beta was fixed: + XzDec.c : random block unpacking and XzUnpacker_IsBlockFinished() + didn't work correctly for xz archives without checksum (CRC). + + +18.00 beta 2019-01-10 +------------------------- +- The BUG in xz encoder was fixed: + There was memory leak of 16 KB for each file compressed with + xz compression method, if additional filter was used. + + +17.01 beta 2017-08-28 +------------------------- +- Minor speed optimization for LZMA2 (xz and 7z) multi-threading compression. + 7-Zip now uses additional memory buffers for multi-block LZMA2 compression. + CPU utilization was slightly improved. +- 7-zip now creates multi-block xz archives by default. Block size can be + specified with -ms[Size]{m|g} switch. +- xz decoder now can unpack random block from multi-block xz archives. +- 7-Zip command line: @listfile now doesn't work after -- switch. + Use -i@listfile before -- switch instead. +- The BUGs were fixed: + 7-Zip 17.00 beta crashed for commands that write anti-item to 7z archive. 
+ + +17.00 beta 2017-04-29 +------------------------- +- NewHandler.h / NewHandler.cpp: + now it redefines operator new() only for old MSVC compilers (_MSC_VER < 1900). +- C/7zTypes.h : the names of variables in interface structures were changed (vt). +- Some bugs were fixed. 7-Zip could crash in some cases. +- Some internal changes in code. + + +16.04 2016-10-04 +------------------------- +- The bug was fixed in DllSecur.c. + + +16.03 2016-09-28 +------------------------- +- SFX modules now use some protection against DLL preloading attack. +- Some bugs in 7z code were fixed. + + +16.02 2016-05-21 +------------------------- +- The BUG in 16.00 - 16.01 was fixed: + Split Handler (SplitHandler.cpp) returned incorrect + total size value (kpidSize) for split archives. + + +16.01 2016-05-19 +------------------------- +- Some internal changes to reduce the number of compiler warnings. + + +16.00 2016-05-10 +------------------------- +- Some bugs were fixed. + + +15.12 2015-11-19 +------------------------- +- The BUG in C version of 7z decoder was fixed: + 7zDec.c : SzDecodeLzma2() + 7z decoder could mistakenly report about decoding error for some 7z archives + that use LZMA2 compression method. + The probability to get that mistaken decoding error report was about + one error per 16384 solid blocks for solid blocks larger than 16 KB (compressed size). +- The BUG (in 9.26-15.11) in C version of 7z decoder was fixed: + 7zArcIn.c : SzReadHeader2() + 7z decoder worked incorrectly for 7z archives that contain + empty solid blocks, that can be placed to 7z archive, if some file is + unavailable for reading during archive creation. + + +15.09 beta 2015-10-16 +------------------------- +- The BUG in LZMA / LZMA2 encoding code was fixed. + The BUG in LzFind.c::MatchFinder_ReadBlock() function. + If input data size is larger than (4 GiB - dictionary_size), + the following code worked incorrectly: + - LZMA : LzmaEnc_MemEncode(), LzmaEncode() : LZMA encoding functions + for compressing from memory to memory. + That BUG is not related to LZMA encoder version that works via streams. + - LZMA2 : multi-threaded version of LZMA2 encoder worked incorrectly, if + default value of chunk size (CLzma2EncProps::blockSize) is changed + to value larger than (4 GiB - dictionary_size). + + +9.38 beta 2015-01-03 +------------------------- +- The BUG in 9.31-9.37 was fixed: + IArchiveGetRawProps interface was disabled for 7z archives. +- The BUG in 9.26-9.36 was fixed: + Some code in CPP\7zip\Archive\7z\ worked correctly only under Windows. + + +9.36 beta 2014-12-26 +------------------------- +- The BUG in command line version was fixed: + 7-Zip created temporary archive in current folder during update archive + operation, if -w{Path} switch was not specified. + The fixed 7-Zip creates temporary archive in folder that contains updated archive. +- The BUG in 9.33-9.35 was fixed: + 7-Zip silently ignored file reading errors during 7z or gz archive creation, + and the created archive contained only part of file that was read before error. + The fixed 7-Zip stops archive creation and it reports about error. + + +9.35 beta 2014-12-07 +------------------------- +- 7zr.exe now support AES encryption. +- SFX mudules were added to LZMA SDK +- Some bugs were fixed. + + +9.21 beta 2011-04-11 +------------------------- +- New class FString for file names at file systems. +- Speed optimization in CRC code for big-endian CPUs. +- The BUG in Lzma2Dec.c was fixed: + Lzma2Decode function didn't work. 
+
+
+9.18 beta      2010-11-02
+-------------------------
+- New small SFX module for installers (SfxSetup).
+
+
+9.12 beta      2010-03-24
+-------------------------
+- The BUG in LZMA SDK 9.* was fixed: LZMA2 codec didn't work,
+  if more than 10 threads were used (or more than 20 threads in some modes).
+
+
+9.11 beta      2010-03-15
+-------------------------
+- PPMd compression method support
+
+
+9.09           2009-12-12
+-------------------------
+- The bug was fixed:
+  Utf16_To_Utf8 functions in UTFConvert.cpp and 7zMain.c
+  incorrectly converted surrogate characters (the code >= 0x10000) to UTF-8.
+- Some bugs were fixed
+
+
+9.06           2009-08-17
+-------------------------
+- Some changes in ANSI-C 7z Decoder interfaces.
+
+
+9.04           2009-05-30
+-------------------------
+- LZMA2 compression method support
+- xz format support
+
+
+4.65           2009-02-03
+-------------------------
+- Some minor fixes
+
+
+4.63           2008-12-31
+-------------------------
+- Some minor fixes
+
+
+4.61 beta      2008-11-23
+-------------------------
+- The bug in ANSI-C LZMA Decoder was fixed:
+  If encoded stream was corrupted, decoder could access memory
+  outside of allocated range.
+- Some changes in ANSI-C 7z Decoder interfaces.
+- LZMA SDK is placed in the public domain.
+
+
+4.60 beta      2008-08-19
+-------------------------
+- Some minor fixes.
+
+
+4.59 beta      2008-08-13
+-------------------------
+- The bug was fixed:
+  LZMA Encoder in fast compression mode could access memory outside of
+  allocated range in some rare cases.
+
+
+4.58 beta      2008-05-05
+-------------------------
+- ANSI-C LZMA Decoder was rewritten for speed optimizations.
+- ANSI-C LZMA Encoder was included to LZMA SDK.
+- C++ LZMA code now is just wrapper over ANSI-C code.
+
+
+4.57           2007-12-12
+-------------------------
+- Speed optimizations in C++ LZMA Decoder.
+- Small changes for more compatibility with some C/C++ compilers.
+
+
+4.49 beta      2007-07-05
+-------------------------
+- .7z ANSI-C Decoder:
+  - now it supports BCJ and BCJ2 filters
+  - now it supports files larger than 4 GB.
+  - now it supports "Last Write Time" field for files.
+- C++ code for .7z archives compressing/decompressing from 7-zip
+  was included to LZMA SDK.
+
+
+4.43           2006-06-04
+-------------------------
+- Small changes for more compatibility with some C/C++ compilers.
+
+
+4.42           2006-05-15
+-------------------------
+- Small changes in .h files in ANSI-C version.
+
+
+4.39 beta      2006-04-14
+-------------------------
+- The bug in versions 4.33b:4.38b was fixed:
+  C++ version of LZMA encoder could not correctly compress
+  files larger than 2 GB with HC4 match finder (-mfhc4).
+
+
+4.37 beta      2005-04-06
+-------------------------
+- Fixes in C++ code: code could not be compiled if _NO_EXCEPTIONS was defined.
+
+
+4.35 beta      2005-03-02
+-------------------------
+- The bug was fixed in C++ version of LZMA Decoder:
+  If encoded stream was corrupted, decoder could access memory
+  outside of allocated range.
+
+
+4.34 beta      2006-02-27
+-------------------------
+- Compressing speed and memory requirements for compressing were increased
+- LZMA now can use only these match finders: HC4, BT2, BT3, BT4
+
+
+4.32           2005-12-09
+-------------------------
+- Java version of LZMA SDK was included
+
+
+4.30           2005-11-20
+-------------------------
+- Compression ratio was improved in -a2 mode
+- Speed optimizations for compressing in -a2 mode
+- -fb switch now supports values up to 273
+- The bug in 7z_C (7zIn.c) was fixed:
+  It used Alloc/Free functions from different memory pools.
+ So if program used two memory pools, it worked incorrectly. +- 7z_C: .7z format supporting was improved +- LZMA# SDK (C#.NET version) was included + + +4.27 (Updated) 2005-09-21 +------------------------- +- Some GUIDs/interfaces in C++ were changed. + IStream.h: + ISequentialInStream::Read now works as old ReadPart + ISequentialOutStream::Write now works as old WritePart + + +4.27 2005-08-07 +------------------------- +- The bug in LzmaDecodeSize.c was fixed: + if _LZMA_IN_CB and _LZMA_OUT_READ were defined, + decompressing worked incorrectly. + + +4.26 2005-08-05 +------------------------- +- Fixes in 7z_C code and LzmaTest.c: + previous versions could work incorrectly, + if malloc(0) returns 0 + + +4.23 2005-06-29 +------------------------- +- Small fixes in C++ code + + +4.22 2005-06-10 +------------------------- +- Small fixes + + +4.21 2005-06-08 +------------------------- +- Interfaces for ANSI-C LZMA Decoder (LzmaDecode.c) were changed +- New additional version of ANSI-C LZMA Decoder with zlib-like interface: + - LzmaStateDecode.h + - LzmaStateDecode.c + - LzmaStateTest.c +- ANSI-C LZMA Decoder now can decompress files larger than 4 GB + + +4.17 2005-04-18 +------------------------- +- New example for RAM->RAM compressing/decompressing: + LZMA + BCJ (filter for x86 code): + - LzmaRam.h + - LzmaRam.cpp + - LzmaRamDecode.h + - LzmaRamDecode.c + - -f86 switch for lzma.exe + + +4.16 2005-03-29 +------------------------- +- The bug was fixed in LzmaDecode.c (ANSI-C LZMA Decoder): + If _LZMA_OUT_READ was defined, and if encoded stream was corrupted, + decoder could access memory outside of allocated range. +- Speed optimization of ANSI-C LZMA Decoder (now it's about 20% faster). + Old version of LZMA Decoder now is in file LzmaDecodeSize.c. + LzmaDecodeSize.c can provide slightly smaller code than LzmaDecode.c +- Small speed optimization in LZMA C++ code +- filter for SPARC's code was added +- Simplified version of .7z ANSI-C Decoder was included + + +4.06 2004-09-05 +------------------------- +- The bug in v4.05 was fixed: + LZMA-Encoder didn't release output stream in some cases. + + +4.05 2004-08-25 +------------------------- +- Source code of filters for x86, IA-64, ARM, ARM-Thumb + and PowerPC code was included to SDK +- Some internal minor changes + + +4.04 2004-07-28 +------------------------- +- More compatibility with some C++ compilers + + +4.03 2004-06-18 +------------------------- +- "Benchmark" command was added. It measures compressing + and decompressing speed and shows rating values. + Also it checks hardware errors. + + +4.02 2004-06-10 +------------------------- +- C++ LZMA Encoder/Decoder code now is more portable + and it can be compiled by GCC on Linux. + + +4.01 2004-02-15 +------------------------- +- Some detection of data corruption was enabled. + LzmaDecode.c / RangeDecoderReadByte + ..... 
+ { + rd->ExtraBytes = 1; + return 0xFF; + } + + +4.00 2004-02-13 +------------------------- +- Original version of LZMA SDK + + + +HISTORY of the LZMA +------------------- + 2001-2008: Improvements to LZMA compressing/decompressing code, + keeping compatibility with original LZMA format + 1996-2001: Development of LZMA compression format + + Some milestones: + + 2001-08-30: LZMA compression was added to 7-Zip + 1999-01-02: First version of 7-Zip was released + + +End of document diff --git a/src/sdk/DOC/lzma-sdk.txt b/src/sdk/DOC/lzma-sdk.txt index b0e14a2..57279bf 100644 --- a/src/sdk/DOC/lzma-sdk.txt +++ b/src/sdk/DOC/lzma-sdk.txt @@ -1,357 +1,437 @@ -LZMA SDK 19.00 --------------- - -LZMA SDK provides the documentation, samples, header files, -libraries, and tools you need to develop applications that -use 7z / LZMA / LZMA2 / XZ compression. - -LZMA is an improved version of famous LZ77 compression algorithm. -It was improved in way of maximum increasing of compression ratio, -keeping high decompression speed and low memory requirements for -decompressing. - -LZMA2 is a LZMA based compression method. LZMA2 provides better -multithreading support for compression than LZMA and some other improvements. - -7z is a file format for data compression and file archiving. -7z is a main file format for 7-Zip compression program (www.7-zip.org). -7z format supports different compression methods: LZMA, LZMA2 and others. -7z also supports AES-256 based encryption. - -XZ is a file format for data compression that uses LZMA2 compression. -XZ format provides additional features: SHA/CRC check, filters for -improved compression ratio, splitting to blocks and streams, - - - -LICENSE -------- - -LZMA SDK is written and placed in the public domain by Igor Pavlov. - -Some code in LZMA SDK is based on public domain code from another developers: - 1) PPMd var.H (2001): Dmitry Shkarin - 2) SHA-256: Wei Dai (Crypto++ library) - -Anyone is free to copy, modify, publish, use, compile, sell, or distribute the -original LZMA SDK code, either in source code form or as a compiled binary, for -any purpose, commercial or non-commercial, and by any means. - -LZMA SDK code is compatible with open source licenses, for example, you can -include it to GNU GPL or GNU LGPL code. - - -LZMA SDK Contents ------------------ - - Source code: - - - C / C++ / C# / Java - LZMA compression and decompression - - C / C++ - LZMA2 compression and decompression - - C / C++ - XZ compression and decompression - - C - 7z decompression - - C++ - 7z compression and decompression - - C - small SFXs for installers (7z decompression) - - C++ - SFXs and SFXs for installers (7z decompression) - - Precomiled binaries: - - - console programs for lzma / 7z / xz compression and decompression - - SFX modules for installers. - - -UNIX/Linux version ------------------- -To compile C++ version of file->file LZMA encoding, go to directory -CPP/7zip/Bundles/LzmaCon -and call make to recompile it: - make -f makefile.gcc clean all - -In some UNIX/Linux versions you must compile LZMA with static libraries. 
-To compile with static libraries, you can use -LIB = -lm -static - -Also you can use p7zip (port of 7-Zip for POSIX systems like Unix or Linux): - - http://p7zip.sourceforge.net/ - - -Files ------ - -DOC/7zC.txt - 7z ANSI-C Decoder description -DOC/7zFormat.txt - 7z Format description -DOC/installer.txt - information about 7-Zip for installers -DOC/lzma.txt - LZMA compression description -DOC/lzma-sdk.txt - LZMA SDK description (this file) -DOC/lzma-history.txt - history of LZMA SDK -DOC/lzma-specification.txt - Specification of LZMA -DOC/Methods.txt - Compression method IDs for .7z - -bin/installer/ - example script to create installer that uses SFX module, - -bin/7zdec.exe - simplified 7z archive decoder -bin/7zr.exe - 7-Zip console program (reduced version) -bin/x64/7zr.exe - 7-Zip console program (reduced version) (x64 version) -bin/lzma.exe - file->file LZMA encoder/decoder for Windows -bin/7zS2.sfx - small SFX module for installers (GUI version) -bin/7zS2con.sfx - small SFX module for installers (Console version) -bin/7zSD.sfx - SFX module for installers. - - -7zDec.exe ---------- -7zDec.exe is simplified 7z archive decoder. -It supports only LZMA, LZMA2, and PPMd methods. -7zDec decodes whole solid block from 7z archive to RAM. -The RAM consumption can be high. - - - - -Source code structure ---------------------- - - -Asm/ - asm files (optimized code for CRC calculation and Intel-AES encryption) - -C/ - C files (compression / decompression and other) - Util/ - 7z - 7z decoder program (decoding 7z files) - Lzma - LZMA program (file->file LZMA encoder/decoder). - LzmaLib - LZMA library (.DLL for Windows) - SfxSetup - small SFX module for installers - -CPP/ -- CPP files - - Common - common files for C++ projects - Windows - common files for Windows related code - - 7zip - files related to 7-Zip - - Archive - files related to archiving - - Common - common files for archive handling - 7z - 7z C++ Encoder/Decoder - - Bundles - Modules that are bundles of other modules (files) - - Alone7z - 7zr.exe: Standalone 7-Zip console program (reduced version) - Format7zExtractR - 7zxr.dll: Reduced version of 7z DLL: extracting from 7z/LZMA/BCJ/BCJ2. 
- Format7zR - 7zr.dll: Reduced version of 7z DLL: extracting/compressing to 7z/LZMA/BCJ/BCJ2 - LzmaCon - lzma.exe: LZMA compression/decompression - LzmaSpec - example code for LZMA Specification - SFXCon - 7zCon.sfx: Console 7z SFX module - SFXSetup - 7zS.sfx: 7z SFX module for installers - SFXWin - 7z.sfx: GUI 7z SFX module - - Common - common files for 7-Zip - - Compress - files for compression/decompression - - Crypto - files for encryption / decompression - - UI - User Interface files - - Client7z - Test application for 7za.dll, 7zr.dll, 7zxr.dll - Common - Common UI files - Console - Code for console program (7z.exe) - Explorer - Some code from 7-Zip Shell extension - FileManager - Some GUI code from 7-Zip File Manager - GUI - Some GUI code from 7-Zip - - -CS/ - C# files - 7zip - Common - some common files for 7-Zip - Compress - files related to compression/decompression - LZ - files related to LZ (Lempel-Ziv) compression algorithm - LZMA - LZMA compression/decompression - LzmaAlone - file->file LZMA compression/decompression - RangeCoder - Range Coder (special code of compression/decompression) - -Java/ - Java files - SevenZip - Compression - files related to compression/decompression - LZ - files related to LZ (Lempel-Ziv) compression algorithm - LZMA - LZMA compression/decompression - RangeCoder - Range Coder (special code of compression/decompression) - - -Note: - Asm / C / C++ source code of LZMA SDK is part of 7-Zip's source code. - 7-Zip's source code can be downloaded from 7-Zip's SourceForge page: - - http://sourceforge.net/projects/sevenzip/ - - - -LZMA features -------------- - - Variable dictionary size (up to 1 GB) - - Estimated compressing speed: about 2 MB/s on 2 GHz CPU - - Estimated decompressing speed: - - 20-30 MB/s on modern 2 GHz cpu - - 1-2 MB/s on 200 MHz simple RISC cpu: (ARM, MIPS, PowerPC) - - Small memory requirements for decompressing (16 KB + DictionarySize) - - Small code size for decompressing: 5-8 KB - -LZMA decoder uses only integer operations and can be -implemented in any modern 32-bit CPU (or on 16-bit CPU with some conditions). - -Some critical operations that affect the speed of LZMA decompression: - 1) 32*16 bit integer multiply - 2) Mispredicted branches (penalty mostly depends from pipeline length) - 3) 32-bit shift and arithmetic operations - -The speed of LZMA decompressing mostly depends from CPU speed. -Memory speed has no big meaning. But if your CPU has small data cache, -overall weight of memory speed will slightly increase. - - -How To Use ----------- - -Using LZMA encoder/decoder executable --------------------------------------- - -Usage: LZMA inputFile outputFile [...] - - e: encode file - - d: decode file - - b: Benchmark. There are two tests: compressing and decompressing - with LZMA method. Benchmark shows rating in MIPS (million - instructions per second). Rating value is calculated from - measured speed and it is normalized with Intel's Core 2 results. - Also Benchmark checks possible hardware errors (RAM - errors in most cases). Benchmark uses these settings: - (-a1, -d21, -fb32, -mfbt4). You can change only -d parameter. - Also you can change the number of iterations. Example for 30 iterations: - LZMA b 30 - Default number of iterations is 10. - - - - - -a{N}: set compression mode 0 = fast, 1 = normal - default: 1 (normal) - - d{N}: Sets Dictionary size - [0, 30], default: 23 (8MB) - The maximum value for dictionary size is 1 GB = 2^30 bytes. - Dictionary size is calculated as DictionarySize = 2^N bytes. 
- For decompressing file compressed by LZMA method with dictionary - size D = 2^N you need about D bytes of memory (RAM). - - -fb{N}: set number of fast bytes - [5, 273], default: 128 - Usually big number gives a little bit better compression ratio - and slower compression process. - - -lc{N}: set number of literal context bits - [0, 8], default: 3 - Sometimes lc=4 gives gain for big files. - - -lp{N}: set number of literal pos bits - [0, 4], default: 0 - lp switch is intended for periodical data when period is - equal 2^N. For example, for 32-bit (4 bytes) - periodical data you can use lp=2. Often it's better to set lc0, - if you change lp switch. - - -pb{N}: set number of pos bits - [0, 4], default: 2 - pb switch is intended for periodical data - when period is equal 2^N. - - -mf{MF_ID}: set Match Finder. Default: bt4. - Algorithms from hc* group doesn't provide good compression - ratio, but they often works pretty fast in combination with - fast mode (-a0). - - Memory requirements depend from dictionary size - (parameter "d" in table below). - - MF_ID Memory Description - - bt2 d * 9.5 + 4MB Binary Tree with 2 bytes hashing. - bt3 d * 11.5 + 4MB Binary Tree with 3 bytes hashing. - bt4 d * 11.5 + 4MB Binary Tree with 4 bytes hashing. - hc4 d * 7.5 + 4MB Hash Chain with 4 bytes hashing. - - -eos: write End Of Stream marker. By default LZMA doesn't write - eos marker, since LZMA decoder knows uncompressed size - stored in .lzma file header. - - -si: Read data from stdin (it will write End Of Stream marker). - -so: Write data to stdout - - -Examples: - -1) LZMA e file.bin file.lzma -d16 -lc0 - -compresses file.bin to file.lzma with 64 KB dictionary (2^16=64K) -and 0 literal context bits. -lc0 allows to reduce memory requirements -for decompression. - - -2) LZMA e file.bin file.lzma -lc0 -lp2 - -compresses file.bin to file.lzma with settings suitable -for 32-bit periodical data (for example, ARM or MIPS code). - -3) LZMA d file.lzma file.bin - -decompresses file.lzma to file.bin. - - -Compression ratio hints ------------------------ - -Recommendations ---------------- - -To increase the compression ratio for LZMA compressing it's desirable -to have aligned data (if it's possible) and also it's desirable to locate -data in such order, where code is grouped in one place and data is -grouped in other place (it's better than such mixing: code, data, code, -data, ...). - - -Filters -------- -You can increase the compression ratio for some data types, using -special filters before compressing. For example, it's possible to -increase the compression ratio on 5-10% for code for those CPU ISAs: -x86, IA-64, ARM, ARM-Thumb, PowerPC, SPARC. - -You can find C source code of such filters in C/Bra*.* files - -You can check the compression ratio gain of these filters with such -7-Zip commands (example for ARM code): -No filter: - 7z a a1.7z a.bin -m0=lzma - -With filter for little-endian ARM code: - 7z a a2.7z a.bin -m0=arm -m1=lzma - -It works in such manner: -Compressing = Filter_encoding + LZMA_encoding -Decompressing = LZMA_decoding + Filter_decoding - -Compressing and decompressing speed of such filters is very high, -so it will not increase decompressing time too much. -Moreover, it reduces decompression time for LZMA_decoding, -since compression ratio with filtering is higher. - -These filters convert CALL (calling procedure) instructions -from relative offsets to absolute addresses, so such data becomes more -compressible. 
-
-For some ISAs (for example, for MIPS) it's impossible to get gain from such filter.
-
-
-
----
-
-http://www.7-zip.org
-http://www.7-zip.org/sdk.html
-http://www.7-zip.org/support.html
+LZMA SDK 25.01
+--------------
+
+LZMA SDK provides the documentation, samples, header files,
+libraries, and tools you need to develop applications that
+use 7z / LZMA / LZMA2 / XZ compression.
+
+LZMA is an improved version of famous LZ77 compression algorithm.
+It was improved in way of maximum increasing of compression ratio,
+keeping high decompression speed and low memory requirements for
+decompressing.
+
+LZMA2 is a LZMA based compression method. LZMA2 provides better
+multithreading support for compression than LZMA and some other improvements.
+
+7z is a file format for data compression and file archiving.
+7z is a main file format for 7-Zip compression program (www.7-zip.org).
+7z format supports different compression methods: LZMA, LZMA2 and others.
+7z also supports AES-256 based encryption.
+
+XZ is a file format for data compression that uses LZMA2 compression.
+XZ format provides additional features: SHA/CRC check, filters for
+improved compression ratio, splitting to blocks and streams,
+
+
+
+LICENSE
+-------
+
+LZMA SDK is written and placed in the public domain by Igor Pavlov.
+
+Some code in LZMA SDK is based on public domain code from another developers:
+  1) PPMd var.H (2001): Dmitry Shkarin
+  2) SHA-256: Wei Dai (Crypto++ library)
+
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute the
+original LZMA SDK code, either in source code form or as a compiled binary, for
+any purpose, commercial or non-commercial, and by any means.
+
+LZMA SDK code is compatible with open source licenses, for example, you can
+include it to GNU GPL or GNU LGPL code.
+
+
+LZMA SDK Contents
+-----------------
+
+  Source code:
+
+    - C / C++ / C# / Java   - LZMA compression and decompression
+    - C / C++               - LZMA2 compression and decompression
+    - C / C++               - XZ compression and decompression
+    - C                     - 7z decompression
+    - C++                   - 7z compression and decompression
+    - C                     - small SFXs for installers (7z decompression)
+    - C++                   - SFXs and SFXs for installers (7z decompression)
+
+  Precompiled binaries:
+
+    - console programs for lzma / 7z / xz compression and decompression
+    - SFX modules for installers.
+
+How to compile with makefile in Windows
+---------------------------------------
+
+Some macronames can be defined for compiling with makefile:
+
+PLATFORM
+  with possible values: x64, x86, arm64, arm, ia64
+
+OLD_COMPILER
+  for old VC compiler, like MSVC 6.0.
+
+MY_DYNAMIC_LINK
+  for dynamic linking to the run-time library (msvcrt.dll).
+  The default makefile option is static linking to the run-time library.
+
+To compile 7zr.exe file for x64 with Visual Studio 2022,
+use the following command sequence:
+
+  cd SRC\CPP\7zip\Bundles\Alone7z\
+  %comspec% /k "C:\Program Files\VS2022\VC\Auxiliary\Build\vcvars64.bat"
+  nmake
+
+You can use other "vcvars*.bat" files from the "VS2022\VC\Auxiliary\Build" directory
+to compile for other platforms:
+  vcvars64.bat
+  vcvarsamd64_arm64.bat
+  vcvarsamd64_x86.bat
+
+
+UNIX/Linux version
+------------------
+There are several options to compile 7-Zip with different compilers: gcc and clang.
+Also 7-Zip code contains two versions for some critical parts of code: in C and in Assembler.
+So if you compile the version with Assembler code, you will get a faster 7-Zip binary.
+
+7-Zip's assembler code uses the following syntax for different platforms:
+
+1) x86 and x86-64 (AMD64): MASM syntax.
+   There are 2 programs that support MASM syntax in Linux:
+   Asmc Macro Assembler and JWasm. But JWasm now doesn't support some
+   cpu instructions used in 7-Zip.
+   So you must install Asmc Macro Assembler in Linux, if you want to compile the fastest version
+   of 7-Zip x86 and x86-64:
+   https://github.com/nidud/asmc
+
+2) arm64: GNU assembler for ARM64 with preprocessor.
+   The syntax of that arm64 assembler code in 7-Zip is supported by GCC and CLANG for ARM64.
+
+There are different binaries that can be compiled from 7-Zip source.
+There are 2 main files in folder for compiling:
+  makefile      - that can be used for compiling Windows version of 7-Zip with nmake command
+  makefile.gcc  - that can be used for compiling Linux/macOS versions of 7-Zip with make command
+
+At first you must change the current folder to folder that contains `makefile.gcc`:
+
+  cd CPP/7zip/Bundles/Alone7z
+
+Then you can compile `makefile.gcc` with the command:
+
+  make -j -f makefile.gcc
+
+Also there are additional "*.mak" files in folder "CPP/7zip/" that can be used to compile
+7-Zip binaries with optimized code and optimizing options.
+
+To compile with GCC without assembler:
+  cd CPP/7zip/Bundles/Alone7z
+  make -j -f ../../cmpl_gcc.mak
+
+To compile with CLANG without assembler:
+  make -j -f ../../cmpl_clang.mak
+
+To compile 7-Zip for x86-64 with asmc assembler:
+  make -j -f ../../cmpl_gcc_x64.mak
+
+To compile 7-Zip for arm64 with assembler:
+  make -j -f ../../cmpl_gcc_arm64.mak
+
+To compile 7-Zip for arm64 for macOS:
+  make -j -f ../../cmpl_mac_arm64.mak
+
+Also you can change some compiler options in the mak files:
+  cmpl_gcc.mak
+  var_gcc.mak
+  warn_gcc.mak
+
+
+
+Also you can use p7zip (port of 7-Zip for POSIX systems like Unix or Linux):
+
+  http://p7zip.sourceforge.net/
+
+
+Files
+-----
+
+DOC/7zC.txt          - 7z ANSI-C Decoder description
+DOC/7zFormat.txt     - 7z Format description
+DOC/installer.txt    - information about 7-Zip for installers
+DOC/lzma.txt         - LZMA compression description
+DOC/lzma-sdk.txt     - LZMA SDK description (this file)
+DOC/lzma-history.txt - history of LZMA SDK
+DOC/lzma-specification.txt - Specification of LZMA
+DOC/Methods.txt      - Compression method IDs for .7z
+
+bin/installer/       - example script to create installer that uses SFX module,
+
+bin/7zdec.exe        - simplified 7z archive decoder (x86 32-bit version)
+bin/7zr.exe          - 7-Zip console program (reduced version) (x86 32-bit version)
+bin/x64/7zr.exe      - 7-Zip console program (reduced version) (x64 version)
+bin/x64/7zdec.exe    - simplified 7z archive decoder (x64 version)
+bin/arm64/7zr.exe    - 7-Zip console program (reduced version) (arm64 version)
+bin/arm64/7zdec.exe  - simplified 7z archive decoder (arm64 version)
+bin/lzma.exe         - file->file LZMA encoder/decoder for Windows
+bin/7zS2.sfx         - small SFX module for installers (GUI version)
+bin/7zS2con.sfx      - small SFX module for installers (Console version)
+bin/7zSD.sfx         - SFX module for installers.
+
+
+7zDec.exe
+---------
+7zDec.exe is simplified 7z archive decoder.
+It supports only LZMA, LZMA2, and PPMd methods.
+7zDec decodes whole solid block from 7z archive to RAM.
+The RAM consumption can be high.
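+
+For illustration only (the command names below are taken from the usage text that
+7zDec prints when run without arguments and are not part of the original section;
+the exact command set can differ between SDK versions), typical invocations look like:
+
+  7zDec l archive.7z     (list the contents of archive.7z)
+  7zDec t archive.7z     (test the archive)
+  7zDec e archive.7z     (extract files, ignoring directory names)
+  7zDec x archive.7z     (extract files with full paths)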
+ + + + +Source code structure +--------------------- + + +Asm/ - asm files (optimized code for CRC calculation and Intel-AES encryption) + +C/ - C files (compression / decompression and other) + Util/ + 7z - 7z decoder program (decoding 7z files) + Lzma - LZMA program (file->file LZMA encoder/decoder). + LzmaLib - LZMA library (.DLL for Windows) + SfxSetup - small SFX module for installers + +CPP/ -- CPP files + + Common - common files for C++ projects + Windows - common files for Windows related code + + 7zip - files related to 7-Zip + + Archive - files related to archiving + + Common - common files for archive handling + 7z - 7z C++ Encoder/Decoder + + Bundles - Modules that are bundles of other modules (files) + + Alone7z - 7zr.exe: Standalone 7-Zip console program (reduced version) + Format7zExtractR - 7zxr.dll: Reduced version of 7z DLL: extracting from 7z/LZMA/BCJ/BCJ2. + Format7zR - 7zr.dll: Reduced version of 7z DLL: extracting/compressing to 7z/LZMA/BCJ/BCJ2 + LzmaCon - lzma.exe: LZMA compression/decompression + LzmaSpec - example code for LZMA Specification + SFXCon - 7zCon.sfx: Console 7z SFX module + SFXSetup - 7zS.sfx: 7z SFX module for installers + SFXWin - 7z.sfx: GUI 7z SFX module + + Common - common files for 7-Zip + + Compress - files for compression/decompression + + Crypto - files for encryption / decompression + + UI - User Interface files + + Client7z - Test application for 7za.dll, 7zr.dll, 7zxr.dll + Common - Common UI files + Console - Code for console program (7z.exe) + Explorer - Some code from 7-Zip Shell extension + FileManager - Some GUI code from 7-Zip File Manager + GUI - Some GUI code from 7-Zip + + +CS/ - C# files + 7zip + Common - some common files for 7-Zip + Compress - files related to compression/decompression + LZ - files related to LZ (Lempel-Ziv) compression algorithm + LZMA - LZMA compression/decompression + LzmaAlone - file->file LZMA compression/decompression + RangeCoder - Range Coder (special code of compression/decompression) + +Java/ - Java files + SevenZip + Compression - files related to compression/decompression + LZ - files related to LZ (Lempel-Ziv) compression algorithm + LZMA - LZMA compression/decompression + RangeCoder - Range Coder (special code of compression/decompression) + + +Note: + Asm / C / C++ source code of LZMA SDK is part of 7-Zip's source code. + 7-Zip's source code can be downloaded from 7-Zip's SourceForge page: + + http://sourceforge.net/projects/sevenzip/ + + + +LZMA features +------------- + - Variable dictionary size (up to 4 GB) + - Estimated compressing speed: about 2 MB/s on 2 GHz CPU + - Estimated decompressing speed: + - 20-30 MB/s on modern 2 GHz cpu + - 1-2 MB/s on 200 MHz simple RISC cpu: (ARM, MIPS, PowerPC) + - Small memory requirements for decompressing (16 KB + DictionarySize) + - Small code size for decompressing: 5-8 KB + +LZMA decoder uses only integer operations and can be +implemented in any modern 32-bit CPU (or on 16-bit CPU with some conditions). + +Some critical operations that affect the speed of LZMA decompression: + 1) 32*16 bit integer multiply + 2) Mispredicted branches (penalty mostly depends from pipeline length) + 3) 32-bit shift and arithmetic operations + +The speed of LZMA decompressing mostly depends from CPU speed. +Memory speed has no big meaning. But if your CPU has small data cache, +overall weight of memory speed will slightly increase. 
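+
+As a minimal sketch of how the ANSI-C decoder is typically driven from C (this
+example is not part of the original SDK text; it assumes the one-call LzmaDecode()
+interface from C/LzmaDec.h and the default allocator g_Alloc from C/Alloc.h, so
+check those headers for the exact prototypes in your SDK version):
+
+  #include "LzmaDec.h"
+  #include "Alloc.h"   /* declares g_Alloc, the default malloc/free allocator */
+
+  /* Decode a raw LZMA stream that is already in memory.
+     props points to the 5-byte LZMA properties (lc/lp/pb byte + dictionary size),
+     and destLen is the expected uncompressed size. */
+  static int DecodeBuffer(Byte *dest, SizeT destLen,
+                          const Byte *src, SizeT srcLen,
+                          const Byte *props)
+  {
+    ELzmaStatus status;
+    SizeT outLen = destLen;
+    SizeT inLen = srcLen;
+    SRes res = LzmaDecode(dest, &outLen, src, &inLen,
+        props, LZMA_PROPS_SIZE, LZMA_FINISH_END, &status, &g_Alloc);
+    return (res == SZ_OK && outLen == destLen) ? 0 : 1;
+  }
+
+When decoding buffer to buffer like this, the output buffer itself serves as the
+dictionary, so the extra allocation is mainly the probability model arrays
+(roughly the 16 KB part of the memory figure listed above).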
+ + +How To Use +---------- + +Using LZMA encoder/decoder executable +-------------------------------------- + +Usage: LZMA inputFile outputFile [...] + + e: encode file + + d: decode file + + b: Benchmark. There are two tests: compressing and decompressing + with LZMA method. Benchmark shows rating in MIPS (million + instructions per second). Rating value is calculated from + measured speed and it is normalized with Intel's Core 2 results. + Also Benchmark checks possible hardware errors (RAM + errors in most cases). Benchmark uses these settings: + (-a1, -d21, -fb32, -mfbt4). You can change only -d parameter. + Also you can change the number of iterations. Example for 30 iterations: + LZMA b 30 + Default number of iterations is 10. + + + + + -a{N}: set compression mode 0 = fast, 1 = normal + default: 1 (normal) + + d{N}: Sets Dictionary size - [0, 31], default: N=24 (32 MB) + The maximum value for dictionary size is N=31 (2 GB). + Dictionary size is calculated as DictionarySize = 2^N bytes. + For decompressing file compressed by LZMA method with dictionary + size D = 2^N you need about D bytes of memory (RAM). + + -fb{N}: set number of fast bytes - [5, 273], default: 128 + Usually big number gives a little bit better compression ratio + and slower compression process. + + -lc{N}: set number of literal context bits - [0, 8], default: 3 + Sometimes lc=4 gives gain for big files. + + -lp{N}: set number of literal pos bits - [0, 4], default: 0 + lp switch is intended for periodical data when period is + equal 2^N. For example, for 32-bit (4 bytes) + periodical data you can use lp=2. Often it's better to set lc0, + if you change lp switch. + + -pb{N}: set number of pos bits - [0, 4], default: 2 + pb switch is intended for periodical data + when period is equal 2^N. + + -mf{MF_ID}: set Match Finder. Default: bt4. + Algorithms from hc* group doesn't provide good compression + ratio, but they often works pretty fast in combination with + fast mode (-a0). + + Memory requirements depend from dictionary size + (parameter "d" in table below). + + MF_ID Memory Description + + bt2 d * 9.5 + 4MB Binary Tree with 2 bytes hashing. + bt3 d * 11.5 + 4MB Binary Tree with 3 bytes hashing. + bt4 d * 11.5 + 4MB Binary Tree with 4 bytes hashing. + bt5 d * 11.5 + 4MB Binary Tree with 5 bytes hashing. + hc4 d * 7.5 + 4MB Hash Chain with 4 bytes hashing. + hc5 d * 7.5 + 4MB Hash Chain with 5 bytes hashing. + + -eos: write End Of Stream marker. By default LZMA doesn't write + eos marker, since LZMA decoder knows uncompressed size + stored in .lzma file header. + + -si: Read data from stdin (it will write End Of Stream marker). + -so: Write data to stdout + + +Examples: + +1) LZMA e file.bin file.lzma -d16 -lc0 + +compresses file.bin to file.lzma with 64 KB dictionary (2^16=64K) +and 0 literal context bits. -lc0 allows to reduce memory requirements +for decompression. + + +2) LZMA e file.bin file.lzma -lc0 -lp2 + +compresses file.bin to file.lzma with settings suitable +for 32-bit periodical data (for example, ARM or MIPS code). + +3) LZMA d file.lzma file.bin + +decompresses file.lzma to file.bin. + + +Compression ratio hints +----------------------- + +Recommendations +--------------- + +To increase the compression ratio for LZMA compressing it's desirable +to have aligned data (if it's possible) and also it's desirable to locate +data in such order, where code is grouped in one place and data is +grouped in other place (it's better than such mixing: code, data, code, +data, ...). 
+ + +Filters +------- +You can increase the compression ratio for some data types, using +special filters before compressing. For example, it's possible to +increase the compression ratio on 5-10% for code for those CPU ISAs: +x86, IA-64, ARM, ARM-Thumb, PowerPC, SPARC. + +You can find C source code of such filters in C/Bra*.* files + +You can check the compression ratio gain of these filters with such +7-Zip commands (example for ARM code): +No filter: + 7z a a1.7z a.bin -m0=lzma + +With filter for little-endian ARM code: + 7z a a2.7z a.bin -m0=arm -m1=lzma + +It works in such manner: +Compressing = Filter_encoding + LZMA_encoding +Decompressing = LZMA_decoding + Filter_decoding + +Compressing and decompressing speed of such filters is very high, +so it will not increase decompressing time too much. +Moreover, it reduces decompression time for LZMA_decoding, +since compression ratio with filtering is higher. + +These filters convert CALL (calling procedure) instructions +from relative offsets to absolute addresses, so such data becomes more +compressible. + +For some ISAs (for example, for MIPS) it's impossible to get gain from such filter. + + + +--- + +http://www.7-zip.org +http://www.7-zip.org/sdk.html +http://www.7-zip.org/support.html diff --git a/src/sdk/DOC/lzma-specification.txt b/src/sdk/DOC/lzma-specification.txt index ac0cce7..b6796df 100644 --- a/src/sdk/DOC/lzma-specification.txt +++ b/src/sdk/DOC/lzma-specification.txt @@ -1,1176 +1,1176 @@ -LZMA specification (DRAFT version) ----------------------------------- - -Author: Igor Pavlov -Date: 2015-06-14 - -This specification defines the format of LZMA compressed data and lzma file format. - -Notation --------- - -We use the syntax of C++ programming language. -We use the following types in C++ code: - unsigned - unsigned integer, at least 16 bits in size - int - signed integer, at least 16 bits in size - UInt64 - 64-bit unsigned integer - UInt32 - 32-bit unsigned integer - UInt16 - 16-bit unsigned integer - Byte - 8-bit unsigned integer - bool - boolean type with two possible values: false, true - - -lzma file format -================ - -The lzma file contains the raw LZMA stream and the header with related properties. - -The files in that format use ".lzma" extension. - -The lzma file format layout: - -Offset Size Description - - 0 1 LZMA model properties (lc, lp, pb) in encoded form - 1 4 Dictionary size (32-bit unsigned integer, little-endian) - 5 8 Uncompressed size (64-bit unsigned integer, little-endian) - 13 Compressed data (LZMA stream) - -LZMA properties: - - name Range Description - - lc [0, 8] the number of "literal context" bits - lp [0, 4] the number of "literal pos" bits - pb [0, 4] the number of "pos" bits -dictSize [0, 2^32 - 1] the dictionary size - -The following code encodes LZMA properties: - -void EncodeProperties(Byte *properties) -{ - properties[0] = (Byte)((pb * 5 + lp) * 9 + lc); - Set_UInt32_LittleEndian(properties + 1, dictSize); -} - -If the value of dictionary size in properties is smaller than (1 << 12), -the LZMA decoder must set the dictionary size variable to (1 << 12). 
- -#define LZMA_DIC_MIN (1 << 12) - - unsigned lc, pb, lp; - UInt32 dictSize; - UInt32 dictSizeInProperties; - - void DecodeProperties(const Byte *properties) - { - unsigned d = properties[0]; - if (d >= (9 * 5 * 5)) - throw "Incorrect LZMA properties"; - lc = d % 9; - d /= 9; - pb = d / 5; - lp = d % 5; - dictSizeInProperties = 0; - for (int i = 0; i < 4; i++) - dictSizeInProperties |= (UInt32)properties[i + 1] << (8 * i); - dictSize = dictSizeInProperties; - if (dictSize < LZMA_DIC_MIN) - dictSize = LZMA_DIC_MIN; - } - -If "Uncompressed size" field contains ones in all 64 bits, it means that -uncompressed size is unknown and there is the "end marker" in stream, -that indicates the end of decoding point. -In opposite case, if the value from "Uncompressed size" field is not -equal to ((2^64) - 1), the LZMA stream decoding must be finished after -specified number of bytes (Uncompressed size) is decoded. And if there -is the "end marker", the LZMA decoder must read that marker also. - - -The new scheme to encode LZMA properties ----------------------------------------- - -If LZMA compression is used for some another format, it's recommended to -use a new improved scheme to encode LZMA properties. That new scheme was -used in xz format that uses the LZMA2 compression algorithm. -The LZMA2 is a new compression algorithm that is based on the LZMA algorithm. - -The dictionary size in LZMA2 is encoded with just one byte and LZMA2 supports -only reduced set of dictionary sizes: - (2 << 11), (3 << 11), - (2 << 12), (3 << 12), - ... - (2 << 30), (3 << 30), - (2 << 31) - 1 - -The dictionary size can be extracted from encoded value with the following code: - - dictSize = (p == 40) ? 0xFFFFFFFF : (((UInt32)2 | ((p) & 1)) << ((p) / 2 + 11)); - -Also there is additional limitation (lc + lp <= 4) in LZMA2 for values of -"lc" and "lp" properties: - - if (lc + lp > 4) - throw "Unsupported properties: (lc + lp) > 4"; - -There are some advantages for LZMA decoder with such (lc + lp) value -limitation. It reduces the maximum size of tables allocated by decoder. -And it reduces the complexity of initialization procedure, that can be -important to keep high speed of decoding of big number of small LZMA streams. - -It's recommended to use that limitation (lc + lp <= 4) for any new format -that uses LZMA compression. Note that the combinations of "lc" and "lp" -parameters, where (lc + lp > 4), can provide significant improvement in -compression ratio only in some rare cases. - -The LZMA properties can be encoded into two bytes in new scheme: - -Offset Size Description - - 0 1 The dictionary size encoded with LZMA2 scheme - 1 1 LZMA model properties (lc, lp, pb) in encoded form - - -The RAM usage -============= - -The RAM usage for LZMA decoder is determined by the following parts: - -1) The Sliding Window (from 4 KiB to 4 GiB). -2) The probability model counter arrays (arrays of 16-bit variables). -3) Some additional state variables (about 10 variables of 32-bit integers). - - -The RAM usage for Sliding Window --------------------------------- - -There are two main scenarios of decoding: - -1) The decoding of full stream to one RAM buffer. - - If we decode full LZMA stream to one output buffer in RAM, the decoder - can use that output buffer as sliding window. So the decoder doesn't - need additional buffer allocated for sliding window. - -2) The decoding to some external storage. - - If we decode LZMA stream to external storage, the decoder must allocate - the buffer for sliding window. 
The size of that buffer must be equal - or larger than the value of dictionary size from properties of LZMA stream. - -In this specification we describe the code for decoding to some external -storage. The optimized version of code for decoding of full stream to one -output RAM buffer can require some minor changes in code. - - -The RAM usage for the probability model counters ------------------------------------------------- - -The size of the probability model counter arrays is calculated with the -following formula: - -size_of_prob_arrays = 1846 + 768 * (1 << (lp + lc)) - -Each probability model counter is 11-bit unsigned integer. -If we use 16-bit integer variables (2-byte integers) for these probability -model counters, the RAM usage required by probability model counter arrays -can be estimated with the following formula: - - RAM = 4 KiB + 1.5 KiB * (1 << (lp + lc)) - -For example, for default LZMA parameters (lp = 0 and lc = 3), the RAM usage is - - RAM_lc3_lp0 = 4 KiB + 1.5 KiB * 8 = 16 KiB - -The maximum RAM state usage is required for decoding the stream with lp = 4 -and lc = 8: - - RAM_lc8_lp4 = 4 KiB + 1.5 KiB * 4096 = 6148 KiB - -If the decoder uses LZMA2's limited property condition -(lc + lp <= 4), the RAM usage will be not larger than - - RAM_lc_lp_4 = 4 KiB + 1.5 KiB * 16 = 28 KiB - - -The RAM usage for encoder -------------------------- - -There are many variants for LZMA encoding code. -These variants have different values for memory consumption. -Note that memory consumption for LZMA Encoder can not be -smaller than memory consumption of LZMA Decoder for same stream. - -The RAM usage required by modern effective implementation of -LZMA Encoder can be estimated with the following formula: - - Encoder_RAM_Usage = 4 MiB + 11 * dictionarySize. - -But there are some modes of the encoder that require less memory. - - -LZMA Decoding -============= - -The LZMA compression algorithm uses LZ-based compression with Sliding Window -and Range Encoding as entropy coding method. - - -Sliding Window --------------- - -LZMA uses Sliding Window compression similar to LZ77 algorithm. - -LZMA stream must be decoded to the sequence that consists -of MATCHES and LITERALS: - - - a LITERAL is a 8-bit character (one byte). - The decoder just puts that LITERAL to the uncompressed stream. - - - a MATCH is a pair of two numbers (DISTANCE-LENGTH pair). - The decoder takes one byte exactly "DISTANCE" characters behind - current position in the uncompressed stream and puts it to - uncompressed stream. The decoder must repeat it "LENGTH" times. - -The "DISTANCE" can not be larger than dictionary size. -And the "DISTANCE" can not be larger than the number of bytes in -the uncompressed stream that were decoded before that match. - -In this specification we use cyclic buffer to implement Sliding Window -for LZMA decoder: - -class COutWindow -{ - Byte *Buf; - UInt32 Pos; - UInt32 Size; - bool IsFull; - -public: - unsigned TotalPos; - COutStream OutStream; - - COutWindow(): Buf(NULL) {} - ~COutWindow() { delete []Buf; } - - void Create(UInt32 dictSize) - { - Buf = new Byte[dictSize]; - Pos = 0; - Size = dictSize; - IsFull = false; - TotalPos = 0; - } - - void PutByte(Byte b) - { - TotalPos++; - Buf[Pos++] = b; - if (Pos == Size) - { - Pos = 0; - IsFull = true; - } - OutStream.WriteByte(b); - } - - Byte GetByte(UInt32 dist) const - { - return Buf[dist <= Pos ? 
Pos - dist : Size - dist + Pos]; - } - - void CopyMatch(UInt32 dist, unsigned len) - { - for (; len > 0; len--) - PutByte(GetByte(dist)); - } - - bool CheckDistance(UInt32 dist) const - { - return dist <= Pos || IsFull; - } - - bool IsEmpty() const - { - return Pos == 0 && !IsFull; - } -}; - - -In another implementation it's possible to use one buffer that contains -Sliding Window and the whole data stream after uncompressing. - - -Range Decoder -------------- - -LZMA algorithm uses Range Encoding (1) as entropy coding method. - -LZMA stream contains just one very big number in big-endian encoding. -LZMA decoder uses the Range Decoder to extract a sequence of binary -symbols from that big number. - -The state of the Range Decoder: - -struct CRangeDecoder -{ - UInt32 Range; - UInt32 Code; - InputStream *InStream; - - bool Corrupted; -} - -The notes about UInt32 type for the "Range" and "Code" variables: - - It's possible to use 64-bit (unsigned or signed) integer type - for the "Range" and the "Code" variables instead of 32-bit unsigned, - but some additional code must be used to truncate the values to - low 32-bits after some operations. - - If the programming language does not support 32-bit unsigned integer type - (like in case of JAVA language), it's possible to use 32-bit signed integer, - but some code must be changed. For example, it's required to change the code - that uses comparison operations for UInt32 variables in this specification. - -The Range Decoder can be in some states that can be treated as -"Corruption" in LZMA stream. The Range Decoder uses the variable "Corrupted": - - (Corrupted == false), if the Range Decoder has not detected any corruption. - (Corrupted == true), if the Range Decoder has detected some corruption. - -The reference LZMA Decoder ignores the value of the "Corrupted" variable. -So it continues to decode the stream, even if the corruption can be detected -in the Range Decoder. To provide the full compatibility with output of the -reference LZMA Decoder, another LZMA Decoder implementations must also -ignore the value of the "Corrupted" variable. - -The LZMA Encoder is required to create only such LZMA streams, that will not -lead the Range Decoder to states, where the "Corrupted" variable is set to true. - -The Range Decoder reads first 5 bytes from input stream to initialize -the state: - -bool CRangeDecoder::Init() -{ - Corrupted = false; - Range = 0xFFFFFFFF; - Code = 0; - - Byte b = InStream->ReadByte(); - - for (int i = 0; i < 4; i++) - Code = (Code << 8) | InStream->ReadByte(); - - if (b != 0 || Code == Range) - Corrupted = true; - return b == 0; -} - -The LZMA Encoder always writes ZERO in initial byte of compressed stream. -That scheme allows to simplify the code of the Range Encoder in the -LZMA Encoder. If initial byte is not equal to ZERO, the LZMA Decoder must -stop decoding and report error. - -After the last bit of data was decoded by Range Decoder, the value of the -"Code" variable must be equal to 0. The LZMA Decoder must check it by -calling the IsFinishedOK() function: - - bool IsFinishedOK() const { return Code == 0; } - -If there is corruption in data stream, there is big probability that -the "Code" value will be not equal to 0 in the Finish() function. So that -check in the IsFinishedOK() function provides very good feature for -corruption detection. - -The value of the "Range" variable before each bit decoding can not be smaller -than ((UInt32)1 << 24). The Normalize() function keeps the "Range" value in -described range. 
- -#define kTopValue ((UInt32)1 << 24) - -void CRangeDecoder::Normalize() -{ - if (Range < kTopValue) - { - Range <<= 8; - Code = (Code << 8) | InStream->ReadByte(); - } -} - -Notes: if the size of the "Code" variable is larger than 32 bits, it's -required to keep only low 32 bits of the "Code" variable after the change -in Normalize() function. - -If the LZMA Stream is not corrupted, the value of the "Code" variable is -always smaller than value of the "Range" variable. -But the Range Decoder ignores some types of corruptions, so the value of -the "Code" variable can be equal or larger than value of the "Range" variable -for some "Corrupted" archives. - - -LZMA uses Range Encoding only with binary symbols of two types: - 1) binary symbols with fixed and equal probabilities (direct bits) - 2) binary symbols with predicted probabilities - -The DecodeDirectBits() function decodes the sequence of direct bits: - -UInt32 CRangeDecoder::DecodeDirectBits(unsigned numBits) -{ - UInt32 res = 0; - do - { - Range >>= 1; - Code -= Range; - UInt32 t = 0 - ((UInt32)Code >> 31); - Code += Range & t; - - if (Code == Range) - Corrupted = true; - - Normalize(); - res <<= 1; - res += t + 1; - } - while (--numBits); - return res; -} - - -The Bit Decoding with Probability Model ---------------------------------------- - -The task of Bit Probability Model is to estimate probabilities of binary -symbols. And then it provides the Range Decoder with that information. -The better prediction provides better compression ratio. -The Bit Probability Model uses statistical data of previous decoded -symbols. - -That estimated probability is presented as 11-bit unsigned integer value -that represents the probability of symbol "0". - -#define kNumBitModelTotalBits 11 - -Mathematical probabilities can be presented with the following formulas: - probability(symbol_0) = prob / 2048. - probability(symbol_1) = 1 - Probability(symbol_0) = - = 1 - prob / 2048 = - = (2048 - prob) / 2048 -where the "prob" variable contains 11-bit integer probability counter. - -It's recommended to use 16-bit unsigned integer type, to store these 11-bit -probability values: - -typedef UInt16 CProb; - -Each probability value must be initialized with value ((1 << 11) / 2), -that represents the state, where probabilities of symbols 0 and 1 -are equal to 0.5: - -#define PROB_INIT_VAL ((1 << kNumBitModelTotalBits) / 2) - -The INIT_PROBS macro is used to initialize the array of CProb variables: - -#define INIT_PROBS(p) \ - { for (unsigned i = 0; i < sizeof(p) / sizeof(p[0]); i++) p[i] = PROB_INIT_VAL; } - - -The DecodeBit() function decodes one bit. -The LZMA decoder provides the pointer to CProb variable that contains -information about estimated probability for symbol 0 and the Range Decoder -updates that CProb variable after decoding. The Range Decoder increases -estimated probability of the symbol that was decoded: - -#define kNumMoveBits 5 - -unsigned CRangeDecoder::DecodeBit(CProb *prob) -{ - unsigned v = *prob; - UInt32 bound = (Range >> kNumBitModelTotalBits) * v; - unsigned symbol; - if (Code < bound) - { - v += ((1 << kNumBitModelTotalBits) - v) >> kNumMoveBits; - Range = bound; - symbol = 0; - } - else - { - v -= v >> kNumMoveBits; - Code -= bound; - Range -= bound; - symbol = 1; - } - *prob = (CProb)v; - Normalize(); - return symbol; -} - - -The Binary Tree of bit model counters -------------------------------------- - -LZMA uses a tree of Bit model variables to decode symbol that needs -several bits for storing. 
There are two versions of such trees in LZMA: - 1) the tree that decodes bits from high bit to low bit (the normal scheme). - 2) the tree that decodes bits from low bit to high bit (the reverse scheme). - -Each binary tree structure supports different size of decoded symbol -(the size of binary sequence that contains value of symbol). -If that size of decoded symbol is "NumBits" bits, the tree structure -uses the array of (2 << NumBits) counters of CProb type. -But only ((2 << NumBits) - 1) items are used by encoder and decoder. -The first item (the item with index equal to 0) in array is unused. -That scheme with unused array's item allows to simplify the code. - -unsigned BitTreeReverseDecode(CProb *probs, unsigned numBits, CRangeDecoder *rc) -{ - unsigned m = 1; - unsigned symbol = 0; - for (unsigned i = 0; i < numBits; i++) - { - unsigned bit = rc->DecodeBit(&probs[m]); - m <<= 1; - m += bit; - symbol |= (bit << i); - } - return symbol; -} - -template -class CBitTreeDecoder -{ - CProb Probs[(unsigned)1 << NumBits]; - -public: - - void Init() - { - INIT_PROBS(Probs); - } - - unsigned Decode(CRangeDecoder *rc) - { - unsigned m = 1; - for (unsigned i = 0; i < NumBits; i++) - m = (m << 1) + rc->DecodeBit(&Probs[m]); - return m - ((unsigned)1 << NumBits); - } - - unsigned ReverseDecode(CRangeDecoder *rc) - { - return BitTreeReverseDecode(Probs, NumBits, rc); - } -}; - - -LZ part of LZMA ---------------- - -LZ part of LZMA describes details about the decoding of MATCHES and LITERALS. - - -The Literal Decoding --------------------- - -The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where -each table contains 0x300 CProb values: - - CProb *LitProbs; - - void CreateLiterals() - { - LitProbs = new CProb[(UInt32)0x300 << (lc + lp)]; - } - - void InitLiterals() - { - UInt32 num = (UInt32)0x300 << (lc + lp); - for (UInt32 i = 0; i < num; i++) - LitProbs[i] = PROB_INIT_VAL; - } - -To select the table for decoding it uses the context that consists of -(lc) high bits from previous literal and (lp) low bits from value that -represents current position in outputStream. - -If (State > 7), the Literal Decoder also uses "matchByte" that represents -the byte in OutputStream at position the is the DISTANCE bytes before -current position, where the DISTANCE is the distance in DISTANCE-LENGTH pair -of latest decoded match. - -The following code decodes one literal and puts it to Sliding Window buffer: - - void DecodeLiteral(unsigned state, UInt32 rep0) - { - unsigned prevByte = 0; - if (!OutWindow.IsEmpty()) - prevByte = OutWindow.GetByte(1); - - unsigned symbol = 1; - unsigned litState = ((OutWindow.TotalPos & ((1 << lp) - 1)) << lc) + (prevByte >> (8 - lc)); - CProb *probs = &LitProbs[(UInt32)0x300 * litState]; - - if (state >= 7) - { - unsigned matchByte = OutWindow.GetByte(rep0 + 1); - do - { - unsigned matchBit = (matchByte >> 7) & 1; - matchByte <<= 1; - unsigned bit = RangeDec.DecodeBit(&probs[((1 + matchBit) << 8) + symbol]); - symbol = (symbol << 1) | bit; - if (matchBit != bit) - break; - } - while (symbol < 0x100); - } - while (symbol < 0x100) - symbol = (symbol << 1) | RangeDec.DecodeBit(&probs[symbol]); - OutWindow.PutByte((Byte)(symbol - 0x100)); - } - - -The match length decoding -------------------------- - -The match length decoder returns normalized (zero-based value) -length of match. 
That value can be converted to real length of the match -with the following code: - -#define kMatchMinLen 2 - - matchLen = len + kMatchMinLen; - -The match length decoder can return the values from 0 to 271. -And the corresponded real match length values can be in the range -from 2 to 273. - -The following scheme is used for the match length encoding: - - Binary encoding Binary Tree structure Zero-based match length - sequence (binary + decimal): - - 0 xxx LowCoder[posState] xxx - 1 0 yyy MidCoder[posState] yyy + 8 - 1 1 zzzzzzzz HighCoder zzzzzzzz + 16 - -LZMA uses bit model variable "Choice" to decode the first selection bit. - -If the first selection bit is equal to 0, the decoder uses binary tree - LowCoder[posState] to decode 3-bit zero-based match length (xxx). - -If the first selection bit is equal to 1, the decoder uses bit model - variable "Choice2" to decode the second selection bit. - - If the second selection bit is equal to 0, the decoder uses binary tree - MidCoder[posState] to decode 3-bit "yyy" value, and zero-based match - length is equal to (yyy + 8). - - If the second selection bit is equal to 1, the decoder uses binary tree - HighCoder to decode 8-bit "zzzzzzzz" value, and zero-based - match length is equal to (zzzzzzzz + 16). - -LZMA uses "posState" value as context to select the binary tree -from LowCoder and MidCoder binary tree arrays: - - unsigned posState = OutWindow.TotalPos & ((1 << pb) - 1); - -The full code of the length decoder: - -class CLenDecoder -{ - CProb Choice; - CProb Choice2; - CBitTreeDecoder<3> LowCoder[1 << kNumPosBitsMax]; - CBitTreeDecoder<3> MidCoder[1 << kNumPosBitsMax]; - CBitTreeDecoder<8> HighCoder; - -public: - - void Init() - { - Choice = PROB_INIT_VAL; - Choice2 = PROB_INIT_VAL; - HighCoder.Init(); - for (unsigned i = 0; i < (1 << kNumPosBitsMax); i++) - { - LowCoder[i].Init(); - MidCoder[i].Init(); - } - } - - unsigned Decode(CRangeDecoder *rc, unsigned posState) - { - if (rc->DecodeBit(&Choice) == 0) - return LowCoder[posState].Decode(rc); - if (rc->DecodeBit(&Choice2) == 0) - return 8 + MidCoder[posState].Decode(rc); - return 16 + HighCoder.Decode(rc); - } -}; - -The LZMA decoder uses two instances of CLenDecoder class. -The first instance is for the matches of "Simple Match" type, -and the second instance is for the matches of "Rep Match" type: - - CLenDecoder LenDecoder; - CLenDecoder RepLenDecoder; - - -The match distance decoding ---------------------------- - -LZMA supports dictionary sizes up to 4 GiB minus 1. -The value of match distance (decoded by distance decoder) can be -from 1 to 2^32. But the distance value that is equal to 2^32 is used to -indicate the "End of stream" marker. So real largest match distance -that is used for LZ-window match is (2^32 - 1). - -LZMA uses normalized match length (zero-based length) -to calculate the context state "lenState" do decode the distance value: - -#define kNumLenToPosStates 4 - - unsigned lenState = len; - if (lenState > kNumLenToPosStates - 1) - lenState = kNumLenToPosStates - 1; - -The distance decoder returns the "dist" value that is zero-based value -of match distance. 
The real match distance can be calculated with the -following code: - - matchDistance = dist + 1; - -The state of the distance decoder and the initialization code: - - #define kEndPosModelIndex 14 - #define kNumFullDistances (1 << (kEndPosModelIndex >> 1)) - #define kNumAlignBits 4 - - CBitTreeDecoder<6> PosSlotDecoder[kNumLenToPosStates]; - CProb PosDecoders[1 + kNumFullDistances - kEndPosModelIndex]; - CBitTreeDecoder AlignDecoder; - - void InitDist() - { - for (unsigned i = 0; i < kNumLenToPosStates; i++) - PosSlotDecoder[i].Init(); - AlignDecoder.Init(); - INIT_PROBS(PosDecoders); - } - -At first stage the distance decoder decodes 6-bit "posSlot" value with bit -tree decoder from PosSlotDecoder array. It's possible to get 2^6=64 different -"posSlot" values. - - unsigned posSlot = PosSlotDecoder[lenState].Decode(&RangeDec); - -The encoding scheme for distance value is shown in the following table: - -posSlot (decimal) / - zero-based distance (binary) - 0 0 - 1 1 - 2 10 - 3 11 - - 4 10 x - 5 11 x - 6 10 xx - 7 11 xx - 8 10 xxx - 9 11 xxx -10 10 xxxx -11 11 xxxx -12 10 xxxxx -13 11 xxxxx - -14 10 yy zzzz -15 11 yy zzzz -16 10 yyy zzzz -17 11 yyy zzzz -... -62 10 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz -63 11 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz - -where - "x ... x" means the sequence of binary symbols encoded with binary tree and - "Reverse" scheme. It uses separated binary tree for each posSlot from 4 to 13. - "y" means direct bit encoded with range coder. - "zzzz" means the sequence of four binary symbols encoded with binary - tree with "Reverse" scheme, where one common binary tree "AlignDecoder" - is used for all posSlot values. - -If (posSlot < 4), the "dist" value is equal to posSlot value. - -If (posSlot >= 4), the decoder uses "posSlot" value to calculate the value of - the high bits of "dist" value and the number of the low bits. - - If (4 <= posSlot < kEndPosModelIndex), the decoder uses bit tree decoders. - (one separated bit tree decoder per one posSlot value) and "Reverse" scheme. - In this implementation we use one CProb array "PosDecoders" that contains - all CProb variables for all these bit decoders. - - if (posSlot >= kEndPosModelIndex), the middle bits are decoded as direct - bits from RangeDecoder and the low 4 bits are decoded with a bit tree - decoder "AlignDecoder" with "Reverse" scheme. - -The code to decode zero-based match distance: - - unsigned DecodeDistance(unsigned len) - { - unsigned lenState = len; - if (lenState > kNumLenToPosStates - 1) - lenState = kNumLenToPosStates - 1; - - unsigned posSlot = PosSlotDecoder[lenState].Decode(&RangeDec); - if (posSlot < 4) - return posSlot; - - unsigned numDirectBits = (unsigned)((posSlot >> 1) - 1); - UInt32 dist = ((2 | (posSlot & 1)) << numDirectBits); - if (posSlot < kEndPosModelIndex) - dist += BitTreeReverseDecode(PosDecoders + dist - posSlot, numDirectBits, &RangeDec); - else - { - dist += RangeDec.DecodeDirectBits(numDirectBits - kNumAlignBits) << kNumAlignBits; - dist += AlignDecoder.ReverseDecode(&RangeDec); - } - return dist; - } - - - -LZMA Decoding modes -------------------- - -There are 2 types of LZMA streams: - -1) The stream with "End of stream" marker. -2) The stream without "End of stream" marker. - -And the LZMA Decoder supports 3 modes of decoding: - -1) The unpack size is undefined. The LZMA decoder stops decoding after - getting "End of stream" marker. 
- The input variables for that case: - - markerIsMandatory = true - unpackSizeDefined = false - unpackSize contains any value - -2) The unpack size is defined and LZMA decoder supports both variants, - where the stream can contain "End of stream" marker or the stream is - finished without "End of stream" marker. The LZMA decoder must detect - any of these situations. - The input variables for that case: - - markerIsMandatory = false - unpackSizeDefined = true - unpackSize contains unpack size - -3) The unpack size is defined and the LZMA stream must contain - "End of stream" marker - The input variables for that case: - - markerIsMandatory = true - unpackSizeDefined = true - unpackSize contains unpack size - - -The main loop of decoder ------------------------- - -The main loop of LZMA decoder: - -Initialize the LZMA state. -loop -{ - // begin of loop - Check "end of stream" conditions. - Decode Type of MATCH / LITERAL. - If it's LITERAL, decode LITERAL value and put the LITERAL to Window. - If it's MATCH, decode the length of match and the match distance. - Check error conditions, check end of stream conditions and copy - the sequence of match bytes from sliding window to current position - in window. - Go to begin of loop -} - -The reference implementation of LZMA decoder uses "unpackSize" variable -to keep the number of remaining bytes in output stream. So it reduces -"unpackSize" value after each decoded LITERAL or MATCH. - -The following code contains the "end of stream" condition check at the start -of the loop: - - if (unpackSizeDefined && unpackSize == 0 && !markerIsMandatory) - if (RangeDec.IsFinishedOK()) - return LZMA_RES_FINISHED_WITHOUT_MARKER; - -LZMA uses three types of matches: - -1) "Simple Match" - the match with distance value encoded with bit models. - -2) "Rep Match" - the match that uses the distance from distance - history table. - -3) "Short Rep Match" - the match of single byte length, that uses the latest - distance from distance history table. - -The LZMA decoder keeps the history of latest 4 match distances that were used -by decoder. That set of 4 variables contains zero-based match distances and -these variables are initialized with zero values: - - UInt32 rep0 = 0, rep1 = 0, rep2 = 0, rep3 = 0; - -The LZMA decoder uses binary model variables to select type of MATCH or LITERAL: - -#define kNumStates 12 -#define kNumPosBitsMax 4 - - CProb IsMatch[kNumStates << kNumPosBitsMax]; - CProb IsRep[kNumStates]; - CProb IsRepG0[kNumStates]; - CProb IsRepG1[kNumStates]; - CProb IsRepG2[kNumStates]; - CProb IsRep0Long[kNumStates << kNumPosBitsMax]; - -The decoder uses "state" variable value to select exact variable -from "IsRep", "IsRepG0", "IsRepG1" and "IsRepG2" arrays. -The "state" variable can get the value from 0 to 11. -Initial value for "state" variable is zero: - - unsigned state = 0; - -The "state" variable is updated after each LITERAL or MATCH with one of the -following functions: - -unsigned UpdateState_Literal(unsigned state) -{ - if (state < 4) return 0; - else if (state < 10) return state - 3; - else return state - 6; -} -unsigned UpdateState_Match (unsigned state) { return state < 7 ? 7 : 10; } -unsigned UpdateState_Rep (unsigned state) { return state < 7 ? 8 : 11; } -unsigned UpdateState_ShortRep(unsigned state) { return state < 7 ? 
9 : 11; } - -The decoder calculates "state2" variable value to select exact variable from -"IsMatch" and "IsRep0Long" arrays: - -unsigned posState = OutWindow.TotalPos & ((1 << pb) - 1); -unsigned state2 = (state << kNumPosBitsMax) + posState; - -The decoder uses the following code flow scheme to select exact -type of LITERAL or MATCH: - -IsMatch[state2] decode - 0 - the Literal - 1 - the Match - IsRep[state] decode - 0 - Simple Match - 1 - Rep Match - IsRepG0[state] decode - 0 - the distance is rep0 - IsRep0Long[state2] decode - 0 - Short Rep Match - 1 - Rep Match 0 - 1 - - IsRepG1[state] decode - 0 - Rep Match 1 - 1 - - IsRepG2[state] decode - 0 - Rep Match 2 - 1 - Rep Match 3 - - -LITERAL symbol --------------- -If the value "0" was decoded with IsMatch[state2] decoding, we have "LITERAL" type. - -At first the LZMA decoder must check that it doesn't exceed -specified uncompressed size: - - if (unpackSizeDefined && unpackSize == 0) - return LZMA_RES_ERROR; - -Then it decodes literal value and puts it to sliding window: - - DecodeLiteral(state, rep0); - -Then the decoder must update the "state" value and "unpackSize" value; - - state = UpdateState_Literal(state); - unpackSize--; - -Then the decoder must go to the begin of main loop to decode next Match or Literal. - - -Simple Match ------------- - -If the value "1" was decoded with IsMatch[state2] decoding, -we have the "Simple Match" type. - -The distance history table is updated with the following scheme: - - rep3 = rep2; - rep2 = rep1; - rep1 = rep0; - -The zero-based length is decoded with "LenDecoder": - - len = LenDecoder.Decode(&RangeDec, posState); - -The state is update with UpdateState_Match function: - - state = UpdateState_Match(state); - -and the new "rep0" value is decoded with DecodeDistance: - - rep0 = DecodeDistance(len); - -That "rep0" will be used as zero-based distance for current match. - -If the value of "rep0" is equal to 0xFFFFFFFF, it means that we have -"End of stream" marker, so we can stop decoding and check finishing -condition in Range Decoder: - - if (rep0 == 0xFFFFFFFF) - return RangeDec.IsFinishedOK() ? - LZMA_RES_FINISHED_WITH_MARKER : - LZMA_RES_ERROR; - -If uncompressed size is defined, LZMA decoder must check that it doesn't -exceed that specified uncompressed size: - - if (unpackSizeDefined && unpackSize == 0) - return LZMA_RES_ERROR; - -Also the decoder must check that "rep0" value is not larger than dictionary size -and is not larger than the number of already decoded bytes: - - if (rep0 >= dictSize || !OutWindow.CheckDistance(rep0)) - return LZMA_RES_ERROR; - -Then the decoder must copy match bytes as described in -"The match symbols copying" section. - - -Rep Match ---------- - -If the LZMA decoder has decoded the value "1" with IsRep[state] variable, -we have "Rep Match" type. - -At first the LZMA decoder must check that it doesn't exceed -specified uncompressed size: - - if (unpackSizeDefined && unpackSize == 0) - return LZMA_RES_ERROR; - -Also the decoder must return error, if the LZ window is empty: - - if (OutWindow.IsEmpty()) - return LZMA_RES_ERROR; - -If the match type is "Rep Match", the decoder uses one of the 4 variables of -distance history table to get the value of distance for current match. -And there are 4 corresponding ways of decoding flow. 
- -The decoder updates the distance history with the following scheme -depending from type of match: - -- "Rep Match 0" or "Short Rep Match": - ; LZMA doesn't update the distance history - -- "Rep Match 1": - UInt32 dist = rep1; - rep1 = rep0; - rep0 = dist; - -- "Rep Match 2": - UInt32 dist = rep2; - rep2 = rep1; - rep1 = rep0; - rep0 = dist; - -- "Rep Match 3": - UInt32 dist = rep3; - rep3 = rep2; - rep2 = rep1; - rep1 = rep0; - rep0 = dist; - -Then the decoder decodes exact subtype of "Rep Match" using "IsRepG0", "IsRep0Long", -"IsRepG1", "IsRepG2". - -If the subtype is "Short Rep Match", the decoder updates the state, puts -the one byte from window to current position in window and goes to next -MATCH/LITERAL symbol (the begin of main loop): - - state = UpdateState_ShortRep(state); - OutWindow.PutByte(OutWindow.GetByte(rep0 + 1)); - unpackSize--; - continue; - -In other cases (Rep Match 0/1/2/3), it decodes the zero-based -length of match with "RepLenDecoder" decoder: - - len = RepLenDecoder.Decode(&RangeDec, posState); - -Then it updates the state: - - state = UpdateState_Rep(state); - -Then the decoder must copy match bytes as described in -"The Match symbols copying" section. - - -The match symbols copying -------------------------- - -If we have the match (Simple Match or Rep Match 0/1/2/3), the decoder must -copy the sequence of bytes with calculated match distance and match length. -If uncompressed size is defined, LZMA decoder must check that it doesn't -exceed that specified uncompressed size: - - len += kMatchMinLen; - bool isError = false; - if (unpackSizeDefined && unpackSize < len) - { - len = (unsigned)unpackSize; - isError = true; - } - OutWindow.CopyMatch(rep0 + 1, len); - unpackSize -= len; - if (isError) - return LZMA_RES_ERROR; - -Then the decoder must go to the begin of main loop to decode next MATCH or LITERAL. - - - -NOTES ------ - -This specification doesn't describe the variant of decoder implementation -that supports partial decoding. Such partial decoding case can require some -changes in "end of stream" condition checks code. Also such code -can use additional status codes, returned by decoder. - -This specification uses C++ code with templates to simplify describing. -The optimized version of LZMA decoder doesn't need templates. -Such optimized version can use just two arrays of CProb variables: - 1) The dynamic array of CProb variables allocated for the Literal Decoder. - 2) The one common array that contains all other CProb variables. - - -References: - -1. G. N. N. Martin, Range encoding: an algorithm for removing redundancy - from a digitized message, Video & Data Recording Conference, - Southampton, UK, July 24-27, 1979. +LZMA specification (DRAFT version) +---------------------------------- + +Author: Igor Pavlov +Date: 2015-06-14 + +This specification defines the format of LZMA compressed data and lzma file format. + +Notation +-------- + +We use the syntax of C++ programming language. +We use the following types in C++ code: + unsigned - unsigned integer, at least 16 bits in size + int - signed integer, at least 16 bits in size + UInt64 - 64-bit unsigned integer + UInt32 - 32-bit unsigned integer + UInt16 - 16-bit unsigned integer + Byte - 8-bit unsigned integer + bool - boolean type with two possible values: false, true + + +lzma file format +================ + +The lzma file contains the raw LZMA stream and the header with related properties. + +The files in that format use ".lzma" extension. 
+ +The lzma file format layout: + +Offset Size Description + + 0 1 LZMA model properties (lc, lp, pb) in encoded form + 1 4 Dictionary size (32-bit unsigned integer, little-endian) + 5 8 Uncompressed size (64-bit unsigned integer, little-endian) + 13 Compressed data (LZMA stream) + +LZMA properties: + + name Range Description + + lc [0, 8] the number of "literal context" bits + lp [0, 4] the number of "literal pos" bits + pb [0, 4] the number of "pos" bits +dictSize [0, 2^32 - 1] the dictionary size + +The following code encodes LZMA properties: + +void EncodeProperties(Byte *properties) +{ + properties[0] = (Byte)((pb * 5 + lp) * 9 + lc); + Set_UInt32_LittleEndian(properties + 1, dictSize); +} + +If the value of dictionary size in properties is smaller than (1 << 12), +the LZMA decoder must set the dictionary size variable to (1 << 12). + +#define LZMA_DIC_MIN (1 << 12) + + unsigned lc, pb, lp; + UInt32 dictSize; + UInt32 dictSizeInProperties; + + void DecodeProperties(const Byte *properties) + { + unsigned d = properties[0]; + if (d >= (9 * 5 * 5)) + throw "Incorrect LZMA properties"; + lc = d % 9; + d /= 9; + pb = d / 5; + lp = d % 5; + dictSizeInProperties = 0; + for (int i = 0; i < 4; i++) + dictSizeInProperties |= (UInt32)properties[i + 1] << (8 * i); + dictSize = dictSizeInProperties; + if (dictSize < LZMA_DIC_MIN) + dictSize = LZMA_DIC_MIN; + } + +If "Uncompressed size" field contains ones in all 64 bits, it means that +uncompressed size is unknown and there is the "end marker" in stream, +that indicates the end of decoding point. +In opposite case, if the value from "Uncompressed size" field is not +equal to ((2^64) - 1), the LZMA stream decoding must be finished after +specified number of bytes (Uncompressed size) is decoded. And if there +is the "end marker", the LZMA decoder must read that marker also. + + +The new scheme to encode LZMA properties +---------------------------------------- + +If LZMA compression is used for some another format, it's recommended to +use a new improved scheme to encode LZMA properties. That new scheme was +used in xz format that uses the LZMA2 compression algorithm. +The LZMA2 is a new compression algorithm that is based on the LZMA algorithm. + +The dictionary size in LZMA2 is encoded with just one byte and LZMA2 supports +only reduced set of dictionary sizes: + (2 << 11), (3 << 11), + (2 << 12), (3 << 12), + ... + (2 << 30), (3 << 30), + (2 << 31) - 1 + +The dictionary size can be extracted from encoded value with the following code: + + dictSize = (p == 40) ? 0xFFFFFFFF : (((UInt32)2 | ((p) & 1)) << ((p) / 2 + 11)); + +Also there is additional limitation (lc + lp <= 4) in LZMA2 for values of +"lc" and "lp" properties: + + if (lc + lp > 4) + throw "Unsupported properties: (lc + lp) > 4"; + +There are some advantages for LZMA decoder with such (lc + lp) value +limitation. It reduces the maximum size of tables allocated by decoder. +And it reduces the complexity of initialization procedure, that can be +important to keep high speed of decoding of big number of small LZMA streams. + +It's recommended to use that limitation (lc + lp <= 4) for any new format +that uses LZMA compression. Note that the combinations of "lc" and "lp" +parameters, where (lc + lp > 4), can provide significant improvement in +compression ratio only in some rare cases. 
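+
+As an illustration of the rules above, the dictionary size byte "p" can be
+checked and decoded with the following sketch (the function name here is
+chosen only for this example; the upper bound check for "p" is an additional
+assumption, since LZMA2 does not use values above 40):
+
+  UInt32 DecodeLzma2DictSize(unsigned p)
+  {
+    if (p > 40)
+      throw "Incorrect LZMA2 dictionary size byte";
+    return (p == 40) ? 0xFFFFFFFF : (((UInt32)2 | (p & 1)) << (p / 2 + 11));
+  }
+
+For example, p = 0 gives (2 << 11) = 4 KiB, p = 1 gives (3 << 11) = 6 KiB,
+and p = 24 gives (2 << 23) = 16 MiB.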
+ +The LZMA properties can be encoded into two bytes in new scheme: + +Offset Size Description + + 0 1 The dictionary size encoded with LZMA2 scheme + 1 1 LZMA model properties (lc, lp, pb) in encoded form + + +The RAM usage +============= + +The RAM usage for LZMA decoder is determined by the following parts: + +1) The Sliding Window (from 4 KiB to 4 GiB). +2) The probability model counter arrays (arrays of 16-bit variables). +3) Some additional state variables (about 10 variables of 32-bit integers). + + +The RAM usage for Sliding Window +-------------------------------- + +There are two main scenarios of decoding: + +1) The decoding of full stream to one RAM buffer. + + If we decode full LZMA stream to one output buffer in RAM, the decoder + can use that output buffer as sliding window. So the decoder doesn't + need additional buffer allocated for sliding window. + +2) The decoding to some external storage. + + If we decode LZMA stream to external storage, the decoder must allocate + the buffer for sliding window. The size of that buffer must be equal + or larger than the value of dictionary size from properties of LZMA stream. + +In this specification we describe the code for decoding to some external +storage. The optimized version of code for decoding of full stream to one +output RAM buffer can require some minor changes in code. + + +The RAM usage for the probability model counters +------------------------------------------------ + +The size of the probability model counter arrays is calculated with the +following formula: + +size_of_prob_arrays = 1846 + 768 * (1 << (lp + lc)) + +Each probability model counter is 11-bit unsigned integer. +If we use 16-bit integer variables (2-byte integers) for these probability +model counters, the RAM usage required by probability model counter arrays +can be estimated with the following formula: + + RAM = 4 KiB + 1.5 KiB * (1 << (lp + lc)) + +For example, for default LZMA parameters (lp = 0 and lc = 3), the RAM usage is + + RAM_lc3_lp0 = 4 KiB + 1.5 KiB * 8 = 16 KiB + +The maximum RAM state usage is required for decoding the stream with lp = 4 +and lc = 8: + + RAM_lc8_lp4 = 4 KiB + 1.5 KiB * 4096 = 6148 KiB + +If the decoder uses LZMA2's limited property condition +(lc + lp <= 4), the RAM usage will be not larger than + + RAM_lc_lp_4 = 4 KiB + 1.5 KiB * 16 = 28 KiB + + +The RAM usage for encoder +------------------------- + +There are many variants for LZMA encoding code. +These variants have different values for memory consumption. +Note that memory consumption for LZMA Encoder can not be +smaller than memory consumption of LZMA Decoder for same stream. + +The RAM usage required by modern effective implementation of +LZMA Encoder can be estimated with the following formula: + + Encoder_RAM_Usage = 4 MiB + 11 * dictionarySize. + +But there are some modes of the encoder that require less memory. + + +LZMA Decoding +============= + +The LZMA compression algorithm uses LZ-based compression with Sliding Window +and Range Encoding as entropy coding method. + + +Sliding Window +-------------- + +LZMA uses Sliding Window compression similar to LZ77 algorithm. + +LZMA stream must be decoded to the sequence that consists +of MATCHES and LITERALS: + + - a LITERAL is a 8-bit character (one byte). + The decoder just puts that LITERAL to the uncompressed stream. + + - a MATCH is a pair of two numbers (DISTANCE-LENGTH pair). 
+ The decoder takes one byte exactly "DISTANCE" characters behind + current position in the uncompressed stream and puts it to + uncompressed stream. The decoder must repeat it "LENGTH" times. + +The "DISTANCE" can not be larger than dictionary size. +And the "DISTANCE" can not be larger than the number of bytes in +the uncompressed stream that were decoded before that match. + +In this specification we use cyclic buffer to implement Sliding Window +for LZMA decoder: + +class COutWindow +{ + Byte *Buf; + UInt32 Pos; + UInt32 Size; + bool IsFull; + +public: + unsigned TotalPos; + COutStream OutStream; + + COutWindow(): Buf(NULL) {} + ~COutWindow() { delete []Buf; } + + void Create(UInt32 dictSize) + { + Buf = new Byte[dictSize]; + Pos = 0; + Size = dictSize; + IsFull = false; + TotalPos = 0; + } + + void PutByte(Byte b) + { + TotalPos++; + Buf[Pos++] = b; + if (Pos == Size) + { + Pos = 0; + IsFull = true; + } + OutStream.WriteByte(b); + } + + Byte GetByte(UInt32 dist) const + { + return Buf[dist <= Pos ? Pos - dist : Size - dist + Pos]; + } + + void CopyMatch(UInt32 dist, unsigned len) + { + for (; len > 0; len--) + PutByte(GetByte(dist)); + } + + bool CheckDistance(UInt32 dist) const + { + return dist <= Pos || IsFull; + } + + bool IsEmpty() const + { + return Pos == 0 && !IsFull; + } +}; + + +In another implementation it's possible to use one buffer that contains +Sliding Window and the whole data stream after uncompressing. + + +Range Decoder +------------- + +LZMA algorithm uses Range Encoding (1) as entropy coding method. + +LZMA stream contains just one very big number in big-endian encoding. +LZMA decoder uses the Range Decoder to extract a sequence of binary +symbols from that big number. + +The state of the Range Decoder: + +struct CRangeDecoder +{ + UInt32 Range; + UInt32 Code; + InputStream *InStream; + + bool Corrupted; +} + +The notes about UInt32 type for the "Range" and "Code" variables: + + It's possible to use 64-bit (unsigned or signed) integer type + for the "Range" and the "Code" variables instead of 32-bit unsigned, + but some additional code must be used to truncate the values to + low 32-bits after some operations. + + If the programming language does not support 32-bit unsigned integer type + (like in case of JAVA language), it's possible to use 32-bit signed integer, + but some code must be changed. For example, it's required to change the code + that uses comparison operations for UInt32 variables in this specification. + +The Range Decoder can be in some states that can be treated as +"Corruption" in LZMA stream. The Range Decoder uses the variable "Corrupted": + + (Corrupted == false), if the Range Decoder has not detected any corruption. + (Corrupted == true), if the Range Decoder has detected some corruption. + +The reference LZMA Decoder ignores the value of the "Corrupted" variable. +So it continues to decode the stream, even if the corruption can be detected +in the Range Decoder. To provide the full compatibility with output of the +reference LZMA Decoder, another LZMA Decoder implementations must also +ignore the value of the "Corrupted" variable. + +The LZMA Encoder is required to create only such LZMA streams, that will not +lead the Range Decoder to states, where the "Corrupted" variable is set to true. 
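+
+The "InputStream" and "COutStream" types used in this specification are not
+defined here; they only need to provide byte-oriented access to the compressed
+input and to the uncompressed output (for example, a file or a memory buffer).
+A minimal sketch of such interfaces (an assumption of this description,
+not part of the reference code) can look like this:
+
+  struct InputStream
+  {
+    // returns the next byte of the compressed stream
+    Byte ReadByte();
+  };
+
+  struct COutStream
+  {
+    // writes one byte of uncompressed data to the output
+    void WriteByte(Byte b);
+  };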
+ +The Range Decoder reads first 5 bytes from input stream to initialize +the state: + +bool CRangeDecoder::Init() +{ + Corrupted = false; + Range = 0xFFFFFFFF; + Code = 0; + + Byte b = InStream->ReadByte(); + + for (int i = 0; i < 4; i++) + Code = (Code << 8) | InStream->ReadByte(); + + if (b != 0 || Code == Range) + Corrupted = true; + return b == 0; +} + +The LZMA Encoder always writes ZERO in initial byte of compressed stream. +That scheme allows to simplify the code of the Range Encoder in the +LZMA Encoder. If initial byte is not equal to ZERO, the LZMA Decoder must +stop decoding and report error. + +After the last bit of data was decoded by Range Decoder, the value of the +"Code" variable must be equal to 0. The LZMA Decoder must check it by +calling the IsFinishedOK() function: + + bool IsFinishedOK() const { return Code == 0; } + +If there is corruption in data stream, there is big probability that +the "Code" value will be not equal to 0 in the Finish() function. So that +check in the IsFinishedOK() function provides very good feature for +corruption detection. + +The value of the "Range" variable before each bit decoding can not be smaller +than ((UInt32)1 << 24). The Normalize() function keeps the "Range" value in +described range. + +#define kTopValue ((UInt32)1 << 24) + +void CRangeDecoder::Normalize() +{ + if (Range < kTopValue) + { + Range <<= 8; + Code = (Code << 8) | InStream->ReadByte(); + } +} + +Notes: if the size of the "Code" variable is larger than 32 bits, it's +required to keep only low 32 bits of the "Code" variable after the change +in Normalize() function. + +If the LZMA Stream is not corrupted, the value of the "Code" variable is +always smaller than value of the "Range" variable. +But the Range Decoder ignores some types of corruptions, so the value of +the "Code" variable can be equal or larger than value of the "Range" variable +for some "Corrupted" archives. + + +LZMA uses Range Encoding only with binary symbols of two types: + 1) binary symbols with fixed and equal probabilities (direct bits) + 2) binary symbols with predicted probabilities + +The DecodeDirectBits() function decodes the sequence of direct bits: + +UInt32 CRangeDecoder::DecodeDirectBits(unsigned numBits) +{ + UInt32 res = 0; + do + { + Range >>= 1; + Code -= Range; + UInt32 t = 0 - ((UInt32)Code >> 31); + Code += Range & t; + + if (Code == Range) + Corrupted = true; + + Normalize(); + res <<= 1; + res += t + 1; + } + while (--numBits); + return res; +} + + +The Bit Decoding with Probability Model +--------------------------------------- + +The task of Bit Probability Model is to estimate probabilities of binary +symbols. And then it provides the Range Decoder with that information. +The better prediction provides better compression ratio. +The Bit Probability Model uses statistical data of previous decoded +symbols. + +That estimated probability is presented as 11-bit unsigned integer value +that represents the probability of symbol "0". + +#define kNumBitModelTotalBits 11 + +Mathematical probabilities can be presented with the following formulas: + probability(symbol_0) = prob / 2048. + probability(symbol_1) = 1 - Probability(symbol_0) = + = 1 - prob / 2048 = + = (2048 - prob) / 2048 +where the "prob" variable contains 11-bit integer probability counter. 
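+
+For example (a numeric illustration only): if prob == 1024, both symbols are
+predicted with probability 1024 / 2048 = 0.5; if prob == 1536, the model
+predicts probability(symbol_0) = 1536 / 2048 = 0.75 and
+probability(symbol_1) = 512 / 2048 = 0.25.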
+
+It's recommended to use 16-bit unsigned integer type, to store these 11-bit
+probability values:
+
+typedef UInt16 CProb;
+
+Each probability value must be initialized with value ((1 << 11) / 2),
+that represents the state, where probabilities of symbols 0 and 1
+are equal to 0.5:
+
+#define PROB_INIT_VAL ((1 << kNumBitModelTotalBits) / 2)
+
+The INIT_PROBS macro is used to initialize the array of CProb variables:
+
+#define INIT_PROBS(p) \
+ { for (unsigned i = 0; i < sizeof(p) / sizeof(p[0]); i++) p[i] = PROB_INIT_VAL; }
+
+
+The DecodeBit() function decodes one bit.
+The LZMA decoder provides the pointer to CProb variable that contains
+information about estimated probability for symbol 0 and the Range Decoder
+updates that CProb variable after decoding. The Range Decoder increases
+estimated probability of the symbol that was decoded:
+
+#define kNumMoveBits 5
+
+unsigned CRangeDecoder::DecodeBit(CProb *prob)
+{
+  unsigned v = *prob;
+  UInt32 bound = (Range >> kNumBitModelTotalBits) * v;
+  unsigned symbol;
+  if (Code < bound)
+  {
+    v += ((1 << kNumBitModelTotalBits) - v) >> kNumMoveBits;
+    Range = bound;
+    symbol = 0;
+  }
+  else
+  {
+    v -= v >> kNumMoveBits;
+    Code -= bound;
+    Range -= bound;
+    symbol = 1;
+  }
+  *prob = (CProb)v;
+  Normalize();
+  return symbol;
+}
+
+
+The Binary Tree of bit model counters
+-------------------------------------
+
+LZMA uses a tree of Bit model variables to decode symbol that needs
+several bits for storing. There are two versions of such trees in LZMA:
+  1) the tree that decodes bits from high bit to low bit (the normal scheme).
+  2) the tree that decodes bits from low bit to high bit (the reverse scheme).
+
+Each binary tree structure supports different size of decoded symbol
+(the size of binary sequence that contains value of symbol).
+If that size of decoded symbol is "NumBits" bits, the tree structure
+uses the array of (2 << NumBits) counters of CProb type.
+But only ((2 << NumBits) - 1) items are used by encoder and decoder.
+The first item (the item with index equal to 0) in array is unused.
+That scheme with unused array's item allows to simplify the code.
+
+unsigned BitTreeReverseDecode(CProb *probs, unsigned numBits, CRangeDecoder *rc)
+{
+  unsigned m = 1;
+  unsigned symbol = 0;
+  for (unsigned i = 0; i < numBits; i++)
+  {
+    unsigned bit = rc->DecodeBit(&probs[m]);
+    m <<= 1;
+    m += bit;
+    symbol |= (bit << i);
+  }
+  return symbol;
+}
+
+template <unsigned NumBits>
+class CBitTreeDecoder
+{
+  CProb Probs[(unsigned)1 << NumBits];
+
+public:
+
+  void Init()
+  {
+    INIT_PROBS(Probs);
+  }
+
+  unsigned Decode(CRangeDecoder *rc)
+  {
+    unsigned m = 1;
+    for (unsigned i = 0; i < NumBits; i++)
+      m = (m << 1) + rc->DecodeBit(&Probs[m]);
+    return m - ((unsigned)1 << NumBits);
+  }
+
+  unsigned ReverseDecode(CRangeDecoder *rc)
+  {
+    return BitTreeReverseDecode(Probs, NumBits, rc);
+  }
+};
+
+
+LZ part of LZMA
+---------------
+
+LZ part of LZMA describes details about the decoding of MATCHES and LITERALS.
+ + +The Literal Decoding +-------------------- + +The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where +each table contains 0x300 CProb values: + + CProb *LitProbs; + + void CreateLiterals() + { + LitProbs = new CProb[(UInt32)0x300 << (lc + lp)]; + } + + void InitLiterals() + { + UInt32 num = (UInt32)0x300 << (lc + lp); + for (UInt32 i = 0; i < num; i++) + LitProbs[i] = PROB_INIT_VAL; + } + +To select the table for decoding it uses the context that consists of +(lc) high bits from previous literal and (lp) low bits from value that +represents current position in outputStream. + +If (State > 7), the Literal Decoder also uses "matchByte" that represents +the byte in OutputStream at position the is the DISTANCE bytes before +current position, where the DISTANCE is the distance in DISTANCE-LENGTH pair +of latest decoded match. + +The following code decodes one literal and puts it to Sliding Window buffer: + + void DecodeLiteral(unsigned state, UInt32 rep0) + { + unsigned prevByte = 0; + if (!OutWindow.IsEmpty()) + prevByte = OutWindow.GetByte(1); + + unsigned symbol = 1; + unsigned litState = ((OutWindow.TotalPos & ((1 << lp) - 1)) << lc) + (prevByte >> (8 - lc)); + CProb *probs = &LitProbs[(UInt32)0x300 * litState]; + + if (state >= 7) + { + unsigned matchByte = OutWindow.GetByte(rep0 + 1); + do + { + unsigned matchBit = (matchByte >> 7) & 1; + matchByte <<= 1; + unsigned bit = RangeDec.DecodeBit(&probs[((1 + matchBit) << 8) + symbol]); + symbol = (symbol << 1) | bit; + if (matchBit != bit) + break; + } + while (symbol < 0x100); + } + while (symbol < 0x100) + symbol = (symbol << 1) | RangeDec.DecodeBit(&probs[symbol]); + OutWindow.PutByte((Byte)(symbol - 0x100)); + } + + +The match length decoding +------------------------- + +The match length decoder returns normalized (zero-based value) +length of match. That value can be converted to real length of the match +with the following code: + +#define kMatchMinLen 2 + + matchLen = len + kMatchMinLen; + +The match length decoder can return the values from 0 to 271. +And the corresponded real match length values can be in the range +from 2 to 273. + +The following scheme is used for the match length encoding: + + Binary encoding Binary Tree structure Zero-based match length + sequence (binary + decimal): + + 0 xxx LowCoder[posState] xxx + 1 0 yyy MidCoder[posState] yyy + 8 + 1 1 zzzzzzzz HighCoder zzzzzzzz + 16 + +LZMA uses bit model variable "Choice" to decode the first selection bit. + +If the first selection bit is equal to 0, the decoder uses binary tree + LowCoder[posState] to decode 3-bit zero-based match length (xxx). + +If the first selection bit is equal to 1, the decoder uses bit model + variable "Choice2" to decode the second selection bit. + + If the second selection bit is equal to 0, the decoder uses binary tree + MidCoder[posState] to decode 3-bit "yyy" value, and zero-based match + length is equal to (yyy + 8). + + If the second selection bit is equal to 1, the decoder uses binary tree + HighCoder to decode 8-bit "zzzzzzzz" value, and zero-based + match length is equal to (zzzzzzzz + 16). 
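+
+For example (an illustration only): the selection bits "1 0" followed by the
+3-bit value yyy = 101 (5 decimal) select MidCoder[posState], so the zero-based
+match length is (5 + 8) = 13 and the real match length is (13 + kMatchMinLen) = 15.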
+
+LZMA uses "posState" value as context to select the binary tree
+from LowCoder and MidCoder binary tree arrays:
+
+  unsigned posState = OutWindow.TotalPos & ((1 << pb) - 1);
+
+The full code of the length decoder:
+
+class CLenDecoder
+{
+  CProb Choice;
+  CProb Choice2;
+  CBitTreeDecoder<3> LowCoder[1 << kNumPosBitsMax];
+  CBitTreeDecoder<3> MidCoder[1 << kNumPosBitsMax];
+  CBitTreeDecoder<8> HighCoder;
+
+public:
+
+  void Init()
+  {
+    Choice = PROB_INIT_VAL;
+    Choice2 = PROB_INIT_VAL;
+    HighCoder.Init();
+    for (unsigned i = 0; i < (1 << kNumPosBitsMax); i++)
+    {
+      LowCoder[i].Init();
+      MidCoder[i].Init();
+    }
+  }
+
+  unsigned Decode(CRangeDecoder *rc, unsigned posState)
+  {
+    if (rc->DecodeBit(&Choice) == 0)
+      return LowCoder[posState].Decode(rc);
+    if (rc->DecodeBit(&Choice2) == 0)
+      return 8 + MidCoder[posState].Decode(rc);
+    return 16 + HighCoder.Decode(rc);
+  }
+};
+
+The LZMA decoder uses two instances of CLenDecoder class.
+The first instance is for the matches of "Simple Match" type,
+and the second instance is for the matches of "Rep Match" type:
+
+  CLenDecoder LenDecoder;
+  CLenDecoder RepLenDecoder;
+
+
+The match distance decoding
+---------------------------
+
+LZMA supports dictionary sizes up to 4 GiB minus 1.
+The value of match distance (decoded by distance decoder) can be
+from 1 to 2^32. But the distance value that is equal to 2^32 is used to
+indicate the "End of stream" marker. So real largest match distance
+that is used for LZ-window match is (2^32 - 1).
+
+LZMA uses normalized match length (zero-based length)
+to calculate the context state "lenState" to decode the distance value:
+
+#define kNumLenToPosStates 4
+
+  unsigned lenState = len;
+  if (lenState > kNumLenToPosStates - 1)
+    lenState = kNumLenToPosStates - 1;
+
+The distance decoder returns the "dist" value that is zero-based value
+of match distance. The real match distance can be calculated with the
+following code:
+
+  matchDistance = dist + 1;
+
+The state of the distance decoder and the initialization code:
+
+  #define kEndPosModelIndex 14
+  #define kNumFullDistances (1 << (kEndPosModelIndex >> 1))
+  #define kNumAlignBits 4
+
+  CBitTreeDecoder<6> PosSlotDecoder[kNumLenToPosStates];
+  CProb PosDecoders[1 + kNumFullDistances - kEndPosModelIndex];
+  CBitTreeDecoder<kNumAlignBits> AlignDecoder;
+
+  void InitDist()
+  {
+    for (unsigned i = 0; i < kNumLenToPosStates; i++)
+      PosSlotDecoder[i].Init();
+    AlignDecoder.Init();
+    INIT_PROBS(PosDecoders);
+  }
+
+At first stage the distance decoder decodes 6-bit "posSlot" value with bit
+tree decoder from PosSlotDecoder array. It's possible to get 2^6=64 different
+"posSlot" values.
+
+  unsigned posSlot = PosSlotDecoder[lenState].Decode(&RangeDec);
+
+The encoding scheme for distance value is shown in the following table:
+
+posSlot (decimal) /
+      zero-based distance (binary)
+ 0    0
+ 1    1
+ 2    10
+ 3    11
+
+ 4    10 x
+ 5    11 x
+ 6    10 xx
+ 7    11 xx
+ 8    10 xxx
+ 9    11 xxx
+10    10 xxxx
+11    11 xxxx
+12    10 xxxxx
+13    11 xxxxx
+
+14    10 yy zzzz
+15    11 yy zzzz
+16    10 yyy zzzz
+17    11 yyy zzzz
+...
+62    10 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz
+63    11 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz
+
+where
+  "x ... x" means the sequence of binary symbols encoded with binary tree and
+     "Reverse" scheme. It uses separated binary tree for each posSlot from 4 to 13.
+  "y" means direct bit encoded with range coder.
+  "zzzz" means the sequence of four binary symbols encoded with binary
+     tree with "Reverse" scheme, where one common binary tree "AlignDecoder"
+     is used for all posSlot values.
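+
+For example (an illustration of the table above): posSlot = 5 corresponds to
+the high bits "11" followed by one "x" bit, so the zero-based distance is in
+the range from 6 to 7 (real distance 7 or 8). posSlot = 14 corresponds to the
+high bits "10" followed by two direct "y" bits and four "z" bits decoded with
+AlignDecoder, so the zero-based distance is in the range from 128 to 191.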
+ +If (posSlot < 4), the "dist" value is equal to posSlot value. + +If (posSlot >= 4), the decoder uses "posSlot" value to calculate the value of + the high bits of "dist" value and the number of the low bits. + + If (4 <= posSlot < kEndPosModelIndex), the decoder uses bit tree decoders. + (one separated bit tree decoder per one posSlot value) and "Reverse" scheme. + In this implementation we use one CProb array "PosDecoders" that contains + all CProb variables for all these bit decoders. + + if (posSlot >= kEndPosModelIndex), the middle bits are decoded as direct + bits from RangeDecoder and the low 4 bits are decoded with a bit tree + decoder "AlignDecoder" with "Reverse" scheme. + +The code to decode zero-based match distance: + + unsigned DecodeDistance(unsigned len) + { + unsigned lenState = len; + if (lenState > kNumLenToPosStates - 1) + lenState = kNumLenToPosStates - 1; + + unsigned posSlot = PosSlotDecoder[lenState].Decode(&RangeDec); + if (posSlot < 4) + return posSlot; + + unsigned numDirectBits = (unsigned)((posSlot >> 1) - 1); + UInt32 dist = ((2 | (posSlot & 1)) << numDirectBits); + if (posSlot < kEndPosModelIndex) + dist += BitTreeReverseDecode(PosDecoders + dist - posSlot, numDirectBits, &RangeDec); + else + { + dist += RangeDec.DecodeDirectBits(numDirectBits - kNumAlignBits) << kNumAlignBits; + dist += AlignDecoder.ReverseDecode(&RangeDec); + } + return dist; + } + + + +LZMA Decoding modes +------------------- + +There are 2 types of LZMA streams: + +1) The stream with "End of stream" marker. +2) The stream without "End of stream" marker. + +And the LZMA Decoder supports 3 modes of decoding: + +1) The unpack size is undefined. The LZMA decoder stops decoding after + getting "End of stream" marker. + The input variables for that case: + + markerIsMandatory = true + unpackSizeDefined = false + unpackSize contains any value + +2) The unpack size is defined and LZMA decoder supports both variants, + where the stream can contain "End of stream" marker or the stream is + finished without "End of stream" marker. The LZMA decoder must detect + any of these situations. + The input variables for that case: + + markerIsMandatory = false + unpackSizeDefined = true + unpackSize contains unpack size + +3) The unpack size is defined and the LZMA stream must contain + "End of stream" marker + The input variables for that case: + + markerIsMandatory = true + unpackSizeDefined = true + unpackSize contains unpack size + + +The main loop of decoder +------------------------ + +The main loop of LZMA decoder: + +Initialize the LZMA state. +loop +{ + // begin of loop + Check "end of stream" conditions. + Decode Type of MATCH / LITERAL. + If it's LITERAL, decode LITERAL value and put the LITERAL to Window. + If it's MATCH, decode the length of match and the match distance. + Check error conditions, check end of stream conditions and copy + the sequence of match bytes from sliding window to current position + in window. + Go to begin of loop +} + +The reference implementation of LZMA decoder uses "unpackSize" variable +to keep the number of remaining bytes in output stream. So it reduces +"unpackSize" value after each decoded LITERAL or MATCH. + +The following code contains the "end of stream" condition check at the start +of the loop: + + if (unpackSizeDefined && unpackSize == 0 && !markerIsMandatory) + if (RangeDec.IsFinishedOK()) + return LZMA_RES_FINISHED_WITHOUT_MARKER; + +LZMA uses three types of matches: + +1) "Simple Match" - the match with distance value encoded with bit models. 
+ +2) "Rep Match" - the match that uses the distance from distance + history table. + +3) "Short Rep Match" - the match of single byte length, that uses the latest + distance from distance history table. + +The LZMA decoder keeps the history of latest 4 match distances that were used +by decoder. That set of 4 variables contains zero-based match distances and +these variables are initialized with zero values: + + UInt32 rep0 = 0, rep1 = 0, rep2 = 0, rep3 = 0; + +The LZMA decoder uses binary model variables to select type of MATCH or LITERAL: + +#define kNumStates 12 +#define kNumPosBitsMax 4 + + CProb IsMatch[kNumStates << kNumPosBitsMax]; + CProb IsRep[kNumStates]; + CProb IsRepG0[kNumStates]; + CProb IsRepG1[kNumStates]; + CProb IsRepG2[kNumStates]; + CProb IsRep0Long[kNumStates << kNumPosBitsMax]; + +The decoder uses "state" variable value to select exact variable +from "IsRep", "IsRepG0", "IsRepG1" and "IsRepG2" arrays. +The "state" variable can get the value from 0 to 11. +Initial value for "state" variable is zero: + + unsigned state = 0; + +The "state" variable is updated after each LITERAL or MATCH with one of the +following functions: + +unsigned UpdateState_Literal(unsigned state) +{ + if (state < 4) return 0; + else if (state < 10) return state - 3; + else return state - 6; +} +unsigned UpdateState_Match (unsigned state) { return state < 7 ? 7 : 10; } +unsigned UpdateState_Rep (unsigned state) { return state < 7 ? 8 : 11; } +unsigned UpdateState_ShortRep(unsigned state) { return state < 7 ? 9 : 11; } + +The decoder calculates "state2" variable value to select exact variable from +"IsMatch" and "IsRep0Long" arrays: + +unsigned posState = OutWindow.TotalPos & ((1 << pb) - 1); +unsigned state2 = (state << kNumPosBitsMax) + posState; + +The decoder uses the following code flow scheme to select exact +type of LITERAL or MATCH: + +IsMatch[state2] decode + 0 - the Literal + 1 - the Match + IsRep[state] decode + 0 - Simple Match + 1 - Rep Match + IsRepG0[state] decode + 0 - the distance is rep0 + IsRep0Long[state2] decode + 0 - Short Rep Match + 1 - Rep Match 0 + 1 - + IsRepG1[state] decode + 0 - Rep Match 1 + 1 - + IsRepG2[state] decode + 0 - Rep Match 2 + 1 - Rep Match 3 + + +LITERAL symbol +-------------- +If the value "0" was decoded with IsMatch[state2] decoding, we have "LITERAL" type. + +At first the LZMA decoder must check that it doesn't exceed +specified uncompressed size: + + if (unpackSizeDefined && unpackSize == 0) + return LZMA_RES_ERROR; + +Then it decodes literal value and puts it to sliding window: + + DecodeLiteral(state, rep0); + +Then the decoder must update the "state" value and "unpackSize" value; + + state = UpdateState_Literal(state); + unpackSize--; + +Then the decoder must go to the begin of main loop to decode next Match or Literal. + + +Simple Match +------------ + +If the value "1" was decoded with IsMatch[state2] decoding, +we have the "Simple Match" type. + +The distance history table is updated with the following scheme: + + rep3 = rep2; + rep2 = rep1; + rep1 = rep0; + +The zero-based length is decoded with "LenDecoder": + + len = LenDecoder.Decode(&RangeDec, posState); + +The state is update with UpdateState_Match function: + + state = UpdateState_Match(state); + +and the new "rep0" value is decoded with DecodeDistance: + + rep0 = DecodeDistance(len); + +That "rep0" will be used as zero-based distance for current match. 
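+
+Taken together, the "Simple Match" steps above can be written as the following
+fragment of the main loop (a summary sketch only; the checks described below
+must still follow it):
+
+  rep3 = rep2;
+  rep2 = rep1;
+  rep1 = rep0;
+  len = LenDecoder.Decode(&RangeDec, posState);
+  state = UpdateState_Match(state);
+  rep0 = DecodeDistance(len);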
+ +If the value of "rep0" is equal to 0xFFFFFFFF, it means that we have +"End of stream" marker, so we can stop decoding and check finishing +condition in Range Decoder: + + if (rep0 == 0xFFFFFFFF) + return RangeDec.IsFinishedOK() ? + LZMA_RES_FINISHED_WITH_MARKER : + LZMA_RES_ERROR; + +If uncompressed size is defined, LZMA decoder must check that it doesn't +exceed that specified uncompressed size: + + if (unpackSizeDefined && unpackSize == 0) + return LZMA_RES_ERROR; + +Also the decoder must check that "rep0" value is not larger than dictionary size +and is not larger than the number of already decoded bytes: + + if (rep0 >= dictSize || !OutWindow.CheckDistance(rep0)) + return LZMA_RES_ERROR; + +Then the decoder must copy match bytes as described in +"The match symbols copying" section. + + +Rep Match +--------- + +If the LZMA decoder has decoded the value "1" with IsRep[state] variable, +we have "Rep Match" type. + +At first the LZMA decoder must check that it doesn't exceed +specified uncompressed size: + + if (unpackSizeDefined && unpackSize == 0) + return LZMA_RES_ERROR; + +Also the decoder must return error, if the LZ window is empty: + + if (OutWindow.IsEmpty()) + return LZMA_RES_ERROR; + +If the match type is "Rep Match", the decoder uses one of the 4 variables of +distance history table to get the value of distance for current match. +And there are 4 corresponding ways of decoding flow. + +The decoder updates the distance history with the following scheme +depending from type of match: + +- "Rep Match 0" or "Short Rep Match": + ; LZMA doesn't update the distance history + +- "Rep Match 1": + UInt32 dist = rep1; + rep1 = rep0; + rep0 = dist; + +- "Rep Match 2": + UInt32 dist = rep2; + rep2 = rep1; + rep1 = rep0; + rep0 = dist; + +- "Rep Match 3": + UInt32 dist = rep3; + rep3 = rep2; + rep2 = rep1; + rep1 = rep0; + rep0 = dist; + +Then the decoder decodes exact subtype of "Rep Match" using "IsRepG0", "IsRep0Long", +"IsRepG1", "IsRepG2". + +If the subtype is "Short Rep Match", the decoder updates the state, puts +the one byte from window to current position in window and goes to next +MATCH/LITERAL symbol (the begin of main loop): + + state = UpdateState_ShortRep(state); + OutWindow.PutByte(OutWindow.GetByte(rep0 + 1)); + unpackSize--; + continue; + +In other cases (Rep Match 0/1/2/3), it decodes the zero-based +length of match with "RepLenDecoder" decoder: + + len = RepLenDecoder.Decode(&RangeDec, posState); + +Then it updates the state: + + state = UpdateState_Rep(state); + +Then the decoder must copy match bytes as described in +"The Match symbols copying" section. + + +The match symbols copying +------------------------- + +If we have the match (Simple Match or Rep Match 0/1/2/3), the decoder must +copy the sequence of bytes with calculated match distance and match length. +If uncompressed size is defined, LZMA decoder must check that it doesn't +exceed that specified uncompressed size: + + len += kMatchMinLen; + bool isError = false; + if (unpackSizeDefined && unpackSize < len) + { + len = (unsigned)unpackSize; + isError = true; + } + OutWindow.CopyMatch(rep0 + 1, len); + unpackSize -= len; + if (isError) + return LZMA_RES_ERROR; + +Then the decoder must go to the begin of main loop to decode next MATCH or LITERAL. + + + +NOTES +----- + +This specification doesn't describe the variant of decoder implementation +that supports partial decoding. Such partial decoding case can require some +changes in "end of stream" condition checks code. 
Also such code +can use additional status codes, returned by decoder. + +This specification uses C++ code with templates to simplify describing. +The optimized version of LZMA decoder doesn't need templates. +Such optimized version can use just two arrays of CProb variables: + 1) The dynamic array of CProb variables allocated for the Literal Decoder. + 2) The one common array that contains all other CProb variables. + + +References: + +1. G. N. N. Martin, Range encoding: an algorithm for removing redundancy + from a digitized message, Video & Data Recording Conference, + Southampton, UK, July 24-27, 1979. diff --git a/src/sdk/DOC/lzma.txt b/src/sdk/DOC/lzma.txt index a65988f..62cf094 100644 --- a/src/sdk/DOC/lzma.txt +++ b/src/sdk/DOC/lzma.txt @@ -1,328 +1,345 @@ -LZMA compression ----------------- -Version: 9.35 - -This file describes LZMA encoding and decoding functions written in C language. - -LZMA is an improved version of famous LZ77 compression algorithm. -It was improved in way of maximum increasing of compression ratio, -keeping high decompression speed and low memory requirements for -decompressing. - -Note: you can read also LZMA Specification (lzma-specification.txt from LZMA SDK) - -Also you can look source code for LZMA encoding and decoding: - C/Util/Lzma/LzmaUtil.c - - -LZMA compressed file format ---------------------------- -Offset Size Description - 0 1 Special LZMA properties (lc,lp, pb in encoded form) - 1 4 Dictionary size (little endian) - 5 8 Uncompressed size (little endian). -1 means unknown size - 13 Compressed data - - - -ANSI-C LZMA Decoder -~~~~~~~~~~~~~~~~~~~ - -Please note that interfaces for ANSI-C code were changed in LZMA SDK 4.58. -If you want to use old interfaces you can download previous version of LZMA SDK -from sourceforge.net site. - -To use ANSI-C LZMA Decoder you need the following files: -1) LzmaDec.h + LzmaDec.c + 7zTypes.h + Precomp.h + Compiler.h - -Look example code: - C/Util/Lzma/LzmaUtil.c - - -Memory requirements for LZMA decoding -------------------------------------- - -Stack usage of LZMA decoding function for local variables is not -larger than 200-400 bytes. - -LZMA Decoder uses dictionary buffer and internal state structure. -Internal state structure consumes - state_size = (4 + (1.5 << (lc + lp))) KB -by default (lc=3, lp=0), state_size = 16 KB. - - -How To decompress data ----------------------- - -LZMA Decoder (ANSI-C version) now supports 2 interfaces: -1) Single-call Decompressing -2) Multi-call State Decompressing (zlib-like interface) - -You must use external allocator: -Example: -void *SzAlloc(void *p, size_t size) { p = p; return malloc(size); } -void SzFree(void *p, void *address) { p = p; free(address); } -ISzAlloc alloc = { SzAlloc, SzFree }; - -You can use p = p; operator to disable compiler warnings. 
- - -Single-call Decompressing -------------------------- -When to use: RAM->RAM decompressing -Compile files: LzmaDec.h + LzmaDec.c + 7zTypes.h -Compile defines: no defines -Memory Requirements: - - Input buffer: compressed size - - Output buffer: uncompressed size - - LZMA Internal Structures: state_size (16 KB for default settings) - -Interface: - int LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, - const Byte *propData, unsigned propSize, ELzmaFinishMode finishMode, - ELzmaStatus *status, ISzAlloc *alloc); - In: - dest - output data - destLen - output data size - src - input data - srcLen - input data size - propData - LZMA properties (5 bytes) - propSize - size of propData buffer (5 bytes) - finishMode - It has meaning only if the decoding reaches output limit (*destLen). - LZMA_FINISH_ANY - Decode just destLen bytes. - LZMA_FINISH_END - Stream must be finished after (*destLen). - You can use LZMA_FINISH_END, when you know that - current output buffer covers last bytes of stream. - alloc - Memory allocator. - - Out: - destLen - processed output size - srcLen - processed input size - - Output: - SZ_OK - status: - LZMA_STATUS_FINISHED_WITH_MARK - LZMA_STATUS_NOT_FINISHED - LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK - SZ_ERROR_DATA - Data error - SZ_ERROR_MEM - Memory allocation error - SZ_ERROR_UNSUPPORTED - Unsupported properties - SZ_ERROR_INPUT_EOF - It needs more bytes in input buffer (src). - - If LZMA decoder sees end_marker before reaching output limit, it returns OK result, - and output value of destLen will be less than output buffer size limit. - - You can use multiple checks to test data integrity after full decompression: - 1) Check Result and "status" variable. - 2) Check that output(destLen) = uncompressedSize, if you know real uncompressedSize. - 3) Check that output(srcLen) = compressedSize, if you know real compressedSize. - You must use correct finish mode in that case. */ - - -Multi-call State Decompressing (zlib-like interface) ----------------------------------------------------- - -When to use: file->file decompressing -Compile files: LzmaDec.h + LzmaDec.c + 7zTypes.h - -Memory Requirements: - - Buffer for input stream: any size (for example, 16 KB) - - Buffer for output stream: any size (for example, 16 KB) - - LZMA Internal Structures: state_size (16 KB for default settings) - - LZMA dictionary (dictionary size is encoded in LZMA properties header) - -1) read LZMA properties (5 bytes) and uncompressed size (8 bytes, little-endian) to header: - unsigned char header[LZMA_PROPS_SIZE + 8]; - ReadFile(inFile, header, sizeof(header) - -2) Allocate CLzmaDec structures (state + dictionary) using LZMA properties - - CLzmaDec state; - LzmaDec_Constr(&state); - res = LzmaDec_Allocate(&state, header, LZMA_PROPS_SIZE, &g_Alloc); - if (res != SZ_OK) - return res; - -3) Init LzmaDec structure before any new LZMA stream. And call LzmaDec_DecodeToBuf in loop - - LzmaDec_Init(&state); - for (;;) - { - ... - int res = LzmaDec_DecodeToBuf(CLzmaDec *p, Byte *dest, SizeT *destLen, - const Byte *src, SizeT *srcLen, ELzmaFinishMode finishMode); - ... 
- } - - -4) Free all allocated structures - LzmaDec_Free(&state, &g_Alloc); - -Look example code: - C/Util/Lzma/LzmaUtil.c - - -How To compress data --------------------- - -Compile files: - 7zTypes.h - Threads.h - LzmaEnc.h - LzmaEnc.c - LzFind.h - LzFind.c - LzFindMt.h - LzFindMt.c - LzHash.h - -Memory Requirements: - - (dictSize * 11.5 + 6 MB) + state_size - -Lzma Encoder can use two memory allocators: -1) alloc - for small arrays. -2) allocBig - for big arrays. - -For example, you can use Large RAM Pages (2 MB) in allocBig allocator for -better compression speed. Note that Windows has bad implementation for -Large RAM Pages. -It's OK to use same allocator for alloc and allocBig. - - -Single-call Compression with callbacks --------------------------------------- - -Look example code: - C/Util/Lzma/LzmaUtil.c - -When to use: file->file compressing - -1) you must implement callback structures for interfaces: -ISeqInStream -ISeqOutStream -ICompressProgress -ISzAlloc - -static void *SzAlloc(void *p, size_t size) { p = p; return MyAlloc(size); } -static void SzFree(void *p, void *address) { p = p; MyFree(address); } -static ISzAlloc g_Alloc = { SzAlloc, SzFree }; - - CFileSeqInStream inStream; - CFileSeqOutStream outStream; - - inStream.funcTable.Read = MyRead; - inStream.file = inFile; - outStream.funcTable.Write = MyWrite; - outStream.file = outFile; - - -2) Create CLzmaEncHandle object; - - CLzmaEncHandle enc; - - enc = LzmaEnc_Create(&g_Alloc); - if (enc == 0) - return SZ_ERROR_MEM; - - -3) initialize CLzmaEncProps properties; - - LzmaEncProps_Init(&props); - - Then you can change some properties in that structure. - -4) Send LZMA properties to LZMA Encoder - - res = LzmaEnc_SetProps(enc, &props); - -5) Write encoded properties to header - - Byte header[LZMA_PROPS_SIZE + 8]; - size_t headerSize = LZMA_PROPS_SIZE; - UInt64 fileSize; - int i; - - res = LzmaEnc_WriteProperties(enc, header, &headerSize); - fileSize = MyGetFileLength(inFile); - for (i = 0; i < 8; i++) - header[headerSize++] = (Byte)(fileSize >> (8 * i)); - MyWriteFileAndCheck(outFile, header, headerSize) - -6) Call encoding function: - res = LzmaEnc_Encode(enc, &outStream.funcTable, &inStream.funcTable, - NULL, &g_Alloc, &g_Alloc); - -7) Destroy LZMA Encoder Object - LzmaEnc_Destroy(enc, &g_Alloc, &g_Alloc); - - -If callback function return some error code, LzmaEnc_Encode also returns that code -or it can return the code like SZ_ERROR_READ, SZ_ERROR_WRITE or SZ_ERROR_PROGRESS. - - -Single-call RAM->RAM Compression --------------------------------- - -Single-call RAM->RAM Compression is similar to Compression with callbacks, -but you provide pointers to buffers instead of pointers to stream callbacks: - -SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, - const CLzmaEncProps *props, Byte *propsEncoded, SizeT *propsSize, int writeEndMark, - ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig); - -Return code: - SZ_OK - OK - SZ_ERROR_MEM - Memory allocation error - SZ_ERROR_PARAM - Incorrect paramater - SZ_ERROR_OUTPUT_EOF - output buffer overflow - SZ_ERROR_THREAD - errors in multithreading functions (only for Mt version) - - - -Defines -------- - -_LZMA_SIZE_OPT - Enable some optimizations in LZMA Decoder to get smaller executable code. - -_LZMA_PROB32 - It can increase the speed on some 32-bit CPUs, but memory usage for - some structures will be doubled in that case. - -_LZMA_UINT32_IS_ULONG - Define it if int is 16-bit on your compiler and long is 32-bit. 
- -_LZMA_NO_SYSTEM_SIZE_T - Define it if you don't want to use size_t type. - - -_7ZIP_PPMD_SUPPPORT - Define it if you don't want to support PPMD method in AMSI-C .7z decoder. - - -C++ LZMA Encoder/Decoder -~~~~~~~~~~~~~~~~~~~~~~~~ -C++ LZMA code use COM-like interfaces. So if you want to use it, -you can study basics of COM/OLE. -C++ LZMA code is just wrapper over ANSI-C code. - - -C++ Notes -~~~~~~~~~~~~~~~~~~~~~~~~ -If you use some C++ code folders in 7-Zip (for example, C++ code for .7z handling), -you must check that you correctly work with "new" operator. -7-Zip can be compiled with MSVC 6.0 that doesn't throw "exception" from "new" operator. -So 7-Zip uses "CPP\Common\NewHandler.cpp" that redefines "new" operator: -operator new(size_t size) -{ - void *p = ::malloc(size); - if (p == 0) - throw CNewException(); - return p; -} -If you use MSCV that throws exception for "new" operator, you can compile without -"NewHandler.cpp". So standard exception will be used. Actually some code of -7-Zip catches any exception in internal code and converts it to HRESULT code. -So you don't need to catch CNewException, if you call COM interfaces of 7-Zip. - ---- - -http://www.7-zip.org -http://www.7-zip.org/sdk.html -http://www.7-zip.org/support.html +LZMA compression +---------------- +Version: 24.07 + +This file describes LZMA encoding and decoding functions written in C language. + +LZMA is an improved version of famous LZ77 compression algorithm. +It was improved in way of maximum increasing of compression ratio, +keeping high decompression speed and low memory requirements for +decompressing. + +Note: you can read also LZMA Specification (lzma-specification.txt from LZMA SDK) + +Also you can look source code for LZMA encoding and decoding: + C/Util/Lzma/LzmaUtil.c + + +LZMA compressed file format +--------------------------- +Offset Size Description + 0 1 Special LZMA properties (lc,lp, pb in encoded form) + 1 4 Dictionary size (little endian) + 5 8 Uncompressed size (little endian). -1 means unknown size + 13 Compressed data + + + +ANSI-C LZMA Decoder +~~~~~~~~~~~~~~~~~~~ + +Please note that interfaces for ANSI-C code were changed in LZMA SDK 4.58. +If you want to use old interfaces you can download previous version of LZMA SDK +from sourceforge.net site. + +To use ANSI-C LZMA Decoder you need the following files: +1) LzmaDec.h + LzmaDec.c + 7zTypes.h + Precomp.h + Compiler.h + +Look example code: + C/Util/Lzma/LzmaUtil.c + + +Memory requirements for LZMA decoding +------------------------------------- + +Stack usage of LZMA decoding function for local variables is not +larger than 200-400 bytes. + +LZMA Decoder uses dictionary buffer and internal state structure. +Internal state structure consumes + state_size = (4 + (1.5 << (lc + lp))) KB +by default (lc=3, lp=0), state_size = 16 KB. + + +How To decompress data +---------------------- + +LZMA Decoder (ANSI-C version) now supports 2 interfaces: +1) Single-call Decompressing +2) Multi-call State Decompressing (zlib-like interface) + +You must use external allocator: +Example: +void *SzAlloc(void *p, size_t size) { p = p; return malloc(size); } +void SzFree(void *p, void *address) { p = p; free(address); } +ISzAlloc alloc = { SzAlloc, SzFree }; + +You can use p = p; operator to disable compiler warnings. 
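+
+Note: the allocator example above uses the old callback prototypes. Current
+versions of 7zTypes.h pass an ISzAllocPtr as the first argument, so with the
+headers of this SDK version the same allocator is usually written as follows
+(a sketch; check 7zTypes.h for the exact prototypes):
+
+static void *SzAlloc(ISzAllocPtr p, size_t size) { (void)p; return malloc(size); }
+static void SzFree(ISzAllocPtr p, void *address) { (void)p; free(address); }
+static const ISzAlloc alloc = { SzAlloc, SzFree };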
+ + +Single-call Decompressing +------------------------- +When to use: RAM->RAM decompressing +Compile files: LzmaDec.h + LzmaDec.c + 7zTypes.h +Compile defines: no defines +Memory Requirements: + - Input buffer: compressed size + - Output buffer: uncompressed size + - LZMA Internal Structures: state_size (16 KB for default settings) + +Interface: + int LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, + const Byte *propData, unsigned propSize, ELzmaFinishMode finishMode, + ELzmaStatus *status, ISzAlloc *alloc); + In: + dest - output data + destLen - output data size + src - input data + srcLen - input data size + propData - LZMA properties (5 bytes) + propSize - size of propData buffer (5 bytes) + finishMode - It has meaning only if the decoding reaches output limit (*destLen). + LZMA_FINISH_ANY - Decode just destLen bytes. + LZMA_FINISH_END - Stream must be finished after (*destLen). + You can use LZMA_FINISH_END, when you know that + current output buffer covers last bytes of stream. + alloc - Memory allocator. + + Out: + destLen - processed output size + srcLen - processed input size + + Output: + SZ_OK + status: + LZMA_STATUS_FINISHED_WITH_MARK + LZMA_STATUS_NOT_FINISHED + LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK + SZ_ERROR_DATA - Data error + SZ_ERROR_MEM - Memory allocation error + SZ_ERROR_UNSUPPORTED - Unsupported properties + SZ_ERROR_INPUT_EOF - It needs more bytes in input buffer (src). + + If LZMA decoder sees end_marker before reaching output limit, it returns OK result, + and output value of destLen will be less than output buffer size limit. + + You can use multiple checks to test data integrity after full decompression: + 1) Check Result and "status" variable. + 2) Check that output(destLen) = uncompressedSize, if you know real uncompressedSize. + 3) Check that output(srcLen) = compressedSize, if you know real compressedSize. + You must use correct finish mode in that case. */ + + +Multi-call State Decompressing (zlib-like interface) +---------------------------------------------------- + +When to use: file->file decompressing +Compile files: LzmaDec.h + LzmaDec.c + 7zTypes.h + +Memory Requirements: + - Buffer for input stream: any size (for example, 16 KB) + - Buffer for output stream: any size (for example, 16 KB) + - LZMA Internal Structures: state_size (16 KB for default settings) + - LZMA dictionary (dictionary size is encoded in LZMA properties header) + +1) read LZMA properties (5 bytes) and uncompressed size (8 bytes, little-endian) to header: + unsigned char header[LZMA_PROPS_SIZE + 8]; + ReadFile(inFile, header, sizeof(header) + +2) Allocate CLzmaDec structures (state + dictionary) using LZMA properties + + CLzmaDec state; + LzmaDec_Constr(&state); + res = LzmaDec_Allocate(&state, header, LZMA_PROPS_SIZE, &g_Alloc); + if (res != SZ_OK) + return res; + +3) Init LzmaDec structure before any new LZMA stream. And call LzmaDec_DecodeToBuf in loop + + LzmaDec_Init(&state); + for (;;) + { + ... + int res = LzmaDec_DecodeToBuf(CLzmaDec *p, Byte *dest, SizeT *destLen, + const Byte *src, SizeT *srcLen, ELzmaFinishMode finishMode); + ... 
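+
+A minimal usage sketch (error handling is shortened; "dest", "src", the sizes
+and the 5 properties bytes in "propData" are assumed to be prepared by the
+caller, and "alloc" is an allocator as in the example above):
+
+  ELzmaStatus status;
+  SizeT destLen = unpackedSize;
+  SizeT srcLen = compressedSize;
+  SRes res = LzmaDecode(dest, &destLen, src, &srcLen,
+      propData, LZMA_PROPS_SIZE, LZMA_FINISH_END, &status, &alloc);
+  if (res != SZ_OK || destLen != unpackedSize)
+    return SZ_ERROR_DATA;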
+  }
+
+
+4) Free all allocated structures:
+  LzmaDec_Free(&state, &g_Alloc);
+
+See the example code:
+  C/Util/Lzma/LzmaUtil.c
+
+
+How To compress data
+--------------------
+
+Compile files:
+  7zTypes.h
+  Threads.h
+  Threads.c
+  LzmaEnc.h
+  LzmaEnc.c
+  LzFind.h
+  LzFind.c
+  LzFindMt.h
+  LzFindMt.c
+  LzFindOpt.c
+  LzHash.h
+
+Memory Requirements:
+  - (dictSize * 11.5 + 6 MB) + state_size
+
+The LZMA Encoder can use two memory allocators:
+1) alloc - for small arrays.
+2) allocBig - for big arrays.
+
+For example, you can use large RAM pages (2 MB) in the allocBig allocator for
+better compression speed. Note that Windows has a poor implementation of
+large RAM pages.
+It's OK to use the same allocator for alloc and allocBig.
+
+
+Single-call Compression with callbacks
+--------------------------------------
+
+See the example code:
+  C/Util/Lzma/LzmaUtil.c
+
+When to use: file->file compressing
+
+1) You must implement callback structures for the interfaces:
+ISeqInStream
+ISeqOutStream
+ICompressProgress
+ISzAlloc
+
+static void *SzAlloc(void *p, size_t size) { p = p; return MyAlloc(size); }
+static void SzFree(void *p, void *address) { p = p; MyFree(address); }
+static ISzAlloc g_Alloc = { SzAlloc, SzFree };
+
+  CFileSeqInStream inStream;
+  CFileSeqOutStream outStream;
+
+  inStream.funcTable.Read = MyRead;
+  inStream.file = inFile;
+  outStream.funcTable.Write = MyWrite;
+  outStream.file = outFile;
+
+
+2) Create the CLzmaEncHandle object:
+
+  CLzmaEncHandle enc;
+
+  enc = LzmaEnc_Create(&g_Alloc);
+  if (enc == 0)
+    return SZ_ERROR_MEM;
+
+
+3) Initialize the CLzmaEncProps properties:
+
+  CLzmaEncProps props;
+  LzmaEncProps_Init(&props);
+
+  Then you can change some properties in that structure.
+
+4) Send the LZMA properties to the LZMA Encoder:
+
+  res = LzmaEnc_SetProps(enc, &props);
+
+5) Write the encoded properties to the header:
+
+  Byte header[LZMA_PROPS_SIZE + 8];
+  size_t headerSize = LZMA_PROPS_SIZE;
+  UInt64 fileSize;
+  int i;
+
+  res = LzmaEnc_WriteProperties(enc, header, &headerSize);
+  fileSize = MyGetFileLength(inFile);
+  for (i = 0; i < 8; i++)
+    header[headerSize++] = (Byte)(fileSize >> (8 * i));
+  MyWriteFileAndCheck(outFile, header, headerSize);
+
+6) Call the encoding function:
+
+  res = LzmaEnc_Encode(enc, &outStream.funcTable, &inStream.funcTable,
+      NULL, &g_Alloc, &g_Alloc);
+
+7) Destroy the LZMA Encoder object:
+
+  LzmaEnc_Destroy(enc, &g_Alloc, &g_Alloc);
+
+
+If a callback function returns an error code, LzmaEnc_Encode also returns that code,
+or it can return a code like SZ_ERROR_READ, SZ_ERROR_WRITE or SZ_ERROR_PROGRESS.
+
+
+Single-call RAM->RAM Compression
+--------------------------------
+
+Single-call RAM->RAM compression is similar to compression with callbacks,
+but you provide pointers to buffers instead of pointers to stream callbacks:
+
+SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen,
+    const CLzmaEncProps *props, Byte *propsEncoded, SizeT *propsSize, int writeEndMark,
+    ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig);
+
+Return code:
+  SZ_OK               - OK
+  SZ_ERROR_MEM        - Memory allocation error
+  SZ_ERROR_PARAM      - Incorrect parameter
+  SZ_ERROR_OUTPUT_EOF - Output buffer overflow
+  SZ_ERROR_THREAD     - Errors in multithreading functions (only for the MT version)
+
+
+
+Defines
+-------
+
+Z7_LZMA_SIZE_OPT - Enable some code size optimizations in the LZMA Decoder to get a
+                   smaller executable.
+
+Z7_LZMA_PROB32 - It can increase the speed on some 32-bit CPUs, but memory usage for
+                 some structures will be doubled in that case.
+
+Z7_DECL_Int32_AS_long - Define it if int is 16-bit on your compiler and long is 32-bit.
+
+Z7_DECL_SizeT_AS_unsigned_int - Define it if you don't want to use the size_t type.
+
+
+Defines for 7z decoder written in C
+-----------------------------------
+These defines are for 7zDec.c only (the decoder in C).
+The C++ 7z decoder doesn't use these macros.
+
+Z7_PPMD_SUPPORT - Define it if you need PPMD method support.
+Z7_NO_METHODS_FILTERS - Do not use filters (except for the BCJ2 filter).
+Z7_USE_NATIVE_BRANCH_FILTER - Use the filter for the native ISA:
+        use the x86 filter if compiled to an x86 executable,
+        use the arm64 filter if compiled to an arm64 executable.
+
+
+C++ LZMA Encoder/Decoder
+~~~~~~~~~~~~~~~~~~~~~~~~
+The C++ LZMA code uses COM-like interfaces, so if you want to use it,
+you should study the basics of COM/OLE.
+The C++ LZMA code is just a wrapper over the ANSI-C code.
+
+
+C++ Notes
+~~~~~~~~~~~~~~~~~~~~~~~~
+If you use some of the C++ code folders in 7-Zip (for example, the C++ code for
+7z archive handling), you must check that you work correctly with the "new" operator.
+7-Zip can be compiled with MSVC 6.0, which doesn't throw an exception from the "new" operator.
+So 7-Zip uses "CPP\Common\NewHandler.cpp", which redefines the "new" operator
+when compiled with old MSVC compilers (before VS 2010):
+
+void *operator new(size_t size)
+{
+  void *p = ::malloc(size);
+  if (!p)
+    throw CNewException();
+  return p;
+}
+
+If the compiler is VS 2010 or newer, NewHandler.cpp doesn't redefine the "new" operator.
+So if you use a new compiler (VS 2010 or newer), you can still include "NewHandler.cpp"
+in the compilation, and it will not redefine operator new.
+You can also compile without "NewHandler.cpp" with new compilers.
+If 7-Zip doesn't redefine operator "new", the standard exception will be used instead
+of CNewException.
+Some 7-Zip code catches any exception in internal code and converts it to an HRESULT
+code, so you don't need to catch CNewException if you call the COM interfaces of 7-Zip.
+
+---
+
+http://www.7-zip.org
+http://www.7-zip.org/sdk.html
+http://www.7-zip.org/support.html