diff --git a/.gitignore b/.gitignore
index 9739bee..fe9b640 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ dist
 *.pyc
 *.egg-info
 *.so
+*.patch~
 *.pyd
 RELEASE-VERSION
 /.pc
diff --git a/README.md b/README.md
index 3e2bf36..3d1a913 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 # Python bindings for LZMA
-* **PyLZMA** Copyright (C) 2004-2015 Joachim Bauch
-* **7-Zip** Copyright (C) 1999-2010 Igor Pavlov
-* **LZMA SDK** Copyright (C) 1999-2010 Igor Pavlov
+* **PyLZMA** Copyright (C) 2004-2025 Joachim Bauch
+* **7-Zip** Copyright (C) 1999-2025 Igor Pavlov
+* **LZMA SDK** Copyright (C) 1999-2025 Igor Pavlov
 [![Linux Build Status](https://github.com/fancycode/pylzma/workflows/test/badge.svg)](https://github.com/fancycode/pylzma/actions)
 [![Windows Build Status](https://ci.appveyor.com/api/projects/status/5a7k7v9k2a0eiuom/branch/master?svg=true
diff --git a/patches/series b/patches/series
index 1502ea8..6bcb25c 100644
--- a/patches/series
+++ b/patches/series
@@ -1,2 +1 @@
 streaming_encoder.patch
-strict_prototype.patch
diff --git a/patches/streaming_encoder.patch b/patches/streaming_encoder.patch
index 5dbeec8..2e1cedf 100644
--- a/patches/streaming_encoder.patch
+++ b/patches/streaming_encoder.patch
@@ -2,10 +2,11 @@ Index: pylzma/src/sdk/C/LzmaEnc.c
 ===================================================================
 --- pylzma.orig/src/sdk/C/LzmaEnc.c
 +++ pylzma/src/sdk/C/LzmaEnc.c
-@@ -2259,8 +2259,9 @@ void LzmaEnc_Destroy(CLzmaEncHandle p, I
+@@ -2385,9 +2385,9 @@ void LzmaEnc_Destroy(CLzmaEncHandle p, I
  }
+-Z7_NO_INLINE
 -static SRes LzmaEnc_CodeOneBlock(CLzmaEnc *p, UInt32 maxPackSize, UInt32 maxUnpackSize)
 +SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpackSize)
  {
@@ -13,36 +14,39 @@ Index: pylzma/src/sdk/C/LzmaEnc.c
   UInt32 nowPos32, startPos32;
   if (p->needInit)
   {
-@@ -2715,7 +2716,7 @@ static SRes LzmaEnc_AllocAndInit(CLzmaEn
+@@ -2872,7 +2872,7 @@ static SRes LzmaEnc_AllocAndInit(CLzmaEn
   return SZ_OK;
  }
--static SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream,
-+SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream,
+-static SRes LzmaEnc_Prepare(CLzmaEncHandle p,
++SRes LzmaEnc_Prepare(CLzmaEncHandle p,
+   ISeqOutStreamPtr outStream,
+   ISeqInStreamPtr inStream,
   ISzAllocPtr alloc, ISzAllocPtr allocBig)
- {
-   CLzmaEnc *p = (CLzmaEnc *)pp;
-@@ -2974,3 +2975,9 @@ SRes LzmaEncode(Byte *dest, SizeT *destL
-   LzmaEnc_Destroy(p, alloc, allocBig);
-   return res;
+@@ -2883,6 +2883,12 @@ static SRes LzmaEnc_Prepare(CLzmaEncHand
+   return LzmaEnc_AllocAndInit(p, 0, alloc, allocBig);
  }
 +BoolInt LzmaEnc_IsFinished(CLzmaEncHandle pp)
 +{
 +  CLzmaEnc *p = (CLzmaEnc *)pp;
 +  return p->finished;
 +}
++
+ SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle p,
+   ISeqInStreamPtr inStream, UInt32 keepWindowSize,
+   ISzAllocPtr alloc, ISzAllocPtr allocBig)
 Index: pylzma/src/sdk/C/LzmaEnc.h
 ===================================================================
 --- pylzma.orig/src/sdk/C/LzmaEnc.h
 +++ pylzma/src/sdk/C/LzmaEnc.h
-@@ -73,4 +73,11 @@ SRes LzmaEncode(Byte *dest, SizeT *destL
+@@ -82,4 +82,11 @@ SRes LzmaEncode(Byte *dest, SizeT *destL
 EXTERN_C_END
 +/* ---------- Streaming Interface ---------- */
 +
-+SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, ISzAllocPtr alloc, ISzAllocPtr allocBig);
++SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, ISzAllocPtr alloc, ISzAllocPtr allocBig);
 +SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpackSize);
 +BoolInt LzmaEnc_IsFinished(CLzmaEncHandle pp);
 +void LzmaEnc_Finish(CLzmaEncHandle pp);
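The streaming hooks this patch keeps exporting from the SDK (`LzmaEnc_Prepare`, `LzmaEnc_CodeOneBlock`, `LzmaEnc_IsFinished`, `LzmaEnc_Finish`) are what pylzma's incremental compression is built on. Below is a minimal sketch of how a caller could drive them against the updated headers; it is not part of this diff. The `compress_stream` helper, the example dictionary size and the caller-supplied `ISeqInStreamPtr`/`ISeqOutStreamPtr` implementations are assumptions for illustration, and error handling is reduced to early exits.

```c
/* Illustrative sketch only (not part of this diff): drives the functions
 * exported by streaming_encoder.patch. The caller is assumed to supply
 * working ISeqInStreamPtr/ISeqOutStreamPtr implementations. */
#include "LzmaEnc.h"

static SRes compress_stream(ISeqInStreamPtr inStream, ISeqOutStreamPtr outStream,
                            ISzAllocPtr alloc, ISzAllocPtr allocBig)
{
    CLzmaEncProps props;
    SRes res;
    CLzmaEncHandle enc = LzmaEnc_Create(alloc);
    if (!enc)
        return SZ_ERROR_MEM;

    LzmaEncProps_Init(&props);
    props.dictSize = 1 << 20;               /* example: 1 MiB dictionary */
    res = LzmaEnc_SetProps(enc, &props);

    if (res == SZ_OK)
        /* bind the encoder to the streams without encoding anything yet */
        res = LzmaEnc_Prepare(enc, outStream, inStream, alloc, allocBig);

    /* 0, 0 = no per-block pack/unpack limits, as in the SDK's own encode loop */
    while (res == SZ_OK && !LzmaEnc_IsFinished(enc))
        res = LzmaEnc_CodeOneBlock(enc, 0, 0);

    LzmaEnc_Destroy(enc, alloc, allocBig);
    return res;
}
```

In real use the 5-byte LZMA properties header would normally also be written out via `LzmaEnc_WriteProperties()` before the first coded block.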
diff --git a/patches/strict_prototype.patch b/patches/strict_prototype.patch
deleted file mode 100644
index b31c5a7..0000000
--- a/patches/strict_prototype.patch
+++ /dev/null
@@ -1,17 +0,0 @@
-Index: pylzma/src/sdk/C/CpuArch.h
-===================================================================
---- pylzma.orig/src/sdk/C/CpuArch.h
-+++ pylzma/src/sdk/C/CpuArch.h
-@@ -325,9 +325,9 @@ int x86cpuid_GetFirm(const Cx86cpuid *p)
- #define x86cpuid_GetModel(ver) (((ver >> 12) & 0xF0) | ((ver >> 4) & 0xF))
- #define x86cpuid_GetStepping(ver) (ver & 0xF)
-
--BoolInt CPU_Is_InOrder();
--BoolInt CPU_Is_Aes_Supported();
--BoolInt CPU_IsSupported_PageGB();
-+BoolInt CPU_Is_InOrder(void);
-+BoolInt CPU_Is_Aes_Supported(void);
-+BoolInt CPU_IsSupported_PageGB(void);
-
- #endif
-
diff --git a/setup.py b/setup.py
index 4cc4d10..549efc2 100644
--- a/setup.py
+++ b/setup.py
@@ -85,11 +85,6 @@ def __new__(cls, s):
 library_dirs = []
-# platforms that multithreaded compression is supported on
-mt_platforms = (
-    'win32',
-)
-
 if IS_WINDOWS:
     # don't try to import MSVC compiler on non-windows platforms
     # as this triggers unnecessary warnings
@@ -107,14 +102,7 @@ class MSVCCompiler(object):
 class build_ext(_build_ext):
     def build_extension(self, ext):
-        self.with_mt = ENABLE_MULTITHREADING
-        if self.with_mt and not sys.platform in mt_platforms:
-            warn("""\
-Multithreading is not supported on the platform "%s",
-please contact mail@joachim-bauch.de for more informations.""" % (sys.platform), UnsupportedPlatformWarning)
-            self.with_mt = False
-
-        if self.with_mt:
+        if ENABLE_MULTITHREADING:
             log.info('adding support for multithreaded compression')
             ext.define_macros.append(('COMPRESS_MF_MT', 1))
             ext.sources += (
@@ -124,7 +112,7 @@ def build_extension(self, ext):
                 'src/sdk/C/Threads.c',
             )
         else:
-            ext.define_macros.append(('_7ZIP_ST', 1))
+            ext.define_macros.append(('Z7_ST', 1))
         if isinstance(self.compiler, MSVCCompiler) or getattr(self.compiler, 'compiler_type', '') == 'msvc':
             # set flags only available when using MSVC
@@ -163,6 +151,7 @@ def build_extension(self, ext):
             ('PY_SSIZE_T_CLEAN', 1),
         ]
         lzma_files = (
+            'src/sdk/C/7zStream.c',
             'src/sdk/C/Aes.c',
             'src/sdk/C/AesOpt.c',
             'src/sdk/C/Bcj2.c',
@@ -172,11 +161,14 @@ def build_extension(self, ext):
             'src/sdk/C/CpuArch.c',
             'src/sdk/C/Delta.c',
             'src/sdk/C/LzFind.c',
+            'src/sdk/C/LzFindOpt.c',
             'src/sdk/C/LzmaDec.c',
             'src/sdk/C/LzmaEnc.c',
             'src/sdk/C/Lzma2Dec.c',
             'src/sdk/C/Lzma2Enc.c',
             'src/sdk/C/Sha256.c',
+            'src/sdk/C/Sha256Opt.c',
+            'src/sdk/C/SwapBytes.c',
             'src/sdk/C/Ppmd7.c',
             'src/sdk/C/Ppmd7Dec.c',
         )
diff --git a/src/pylzma/pylzma.c b/src/pylzma/pylzma.c
index 0a00c55..978044b 100644
--- a/src/pylzma/pylzma.c
+++ b/src/pylzma/pylzma.c
@@ -130,7 +130,7 @@ pylzma_calculate_key(PyObject *self, PyObject *args, PyObject *kwargs)
 const char
 doc_bcj_x86_convert[] = \
-    "bcj_x86_convert(data) -- Perform BCJ x86 conversion.";
+    "bcj_x86_convert(data, [encoding]) -- Perform BCJ x86 conversion.";
 static PyObject *
 pylzma_bcj_x86_convert(PyObject *self, PyObject *args)
@@ -150,10 +150,13 @@ pylzma_bcj_x86_convert(PyObject *self, PyObject *args)
     result = PyBytes_FromStringAndSize(data, length);
     if (result != NULL) {
-        UInt32 state;
+        UInt32 state = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL;
         Py_BEGIN_ALLOW_THREADS
-        x86_Convert_Init(state);
-        x86_Convert((Byte *) PyBytes_AS_STRING(result), length, 0, &state,
encoding); + if (encoding) { + Z7_BRANCH_CONV_ST_ENC(X86)((Byte *) PyBytes_AS_STRING(result), length, 0, &state); + } else { + Z7_BRANCH_CONV_ST_DEC(X86)((Byte *) PyBytes_AS_STRING(result), length, 0, &state); + } Py_END_ALLOW_THREADS } @@ -163,7 +166,7 @@ pylzma_bcj_x86_convert(PyObject *self, PyObject *args) #define DEFINE_BCJ_CONVERTER(id, name) \ const char \ doc_bcj_##id##_convert[] = \ - "bcj_" #id "_convert(data) -- Perform BCJ " #name " conversion."; \ + "bcj_" #id "_convert(data, [encoding]) -- Perform BCJ " #name " conversion."; \ \ static PyObject * \ pylzma_bcj_##id##_convert(PyObject *self, PyObject *args) \ @@ -184,7 +187,11 @@ pylzma_bcj_##id##_convert(PyObject *self, PyObject *args) \ result = PyBytes_FromStringAndSize(data, length); \ if (result != NULL) { \ Py_BEGIN_ALLOW_THREADS \ - name##_Convert((Byte *) PyBytes_AS_STRING(result), length, 0, encoding); \ + if (encoding) { \ + Z7_BRANCH_CONV_ENC(name)((Byte *) PyBytes_AS_STRING(result), length, 0); \ + } else { \ + Z7_BRANCH_CONV_DEC(name)((Byte *) PyBytes_AS_STRING(result), length, 0); \ + } \ Py_END_ALLOW_THREADS \ } \ \ @@ -193,7 +200,9 @@ pylzma_bcj_##id##_convert(PyObject *self, PyObject *args) \ DEFINE_BCJ_CONVERTER(arm, ARM); DEFINE_BCJ_CONVERTER(armt, ARMT); +DEFINE_BCJ_CONVERTER(arm64, ARM64); DEFINE_BCJ_CONVERTER(ppc, PPC); +DEFINE_BCJ_CONVERTER(riscv, RISCV); DEFINE_BCJ_CONVERTER(sparc, SPARC); DEFINE_BCJ_CONVERTER(ia64, IA64); @@ -259,7 +268,7 @@ pylzma_bcj2_decode(PyObject *self, PyObject *args) } // Conversion must be finished and the output buffer filled completely. - if (!Bcj2Dec_IsFinished(&dec)) { + if (!Bcj2Dec_IsMaybeFinished(&dec)) { goto error; } else if (dec.dest != dec.destLim || dec.state != BCJ2_STREAM_MAIN) { goto error; @@ -370,7 +379,7 @@ typedef struct static Byte ReadByte(const IByteIn *pp) { - CByteInToLook *p = CONTAINER_FROM_VTBL(pp, CByteInToLook, vt); + CByteInToLook *p = Z7_CONTAINER_FROM_VTBL(pp, CByteInToLook, vt); if (p->cur != p->end) { return *p->cur++; } @@ -404,7 +413,6 @@ pylzma_ppmd_decompress(PyObject *self, PyObject *args) unsigned order; UInt32 memSize; CPpmd7 ppmd; - CPpmd7z_RangeDec rc; CByteInToLook s; SRes res = SZ_OK; CMemoryLookInStream stream; @@ -446,22 +454,21 @@ pylzma_ppmd_decompress(PyObject *self, PyObject *args) CreateMemoryLookInStream(&stream, (Byte*) data, length); tmp = (Byte *) PyBytes_AS_STRING(result); Py_BEGIN_ALLOW_THREADS - Ppmd7z_RangeDec_CreateVTable(&rc); s.vt.Read = ReadByte; s.inStream = &stream.s; s.begin = s.end = s.cur = NULL; s.extra = False; s.res = SZ_OK; s.processed = 0; - rc.Stream = &s.vt; - if (!Ppmd7z_RangeDec_Init(&rc)) { + ppmd.rc.dec.Stream = &s.vt; + if (!Ppmd7z_RangeDec_Init(&ppmd.rc.dec)) { res = SZ_ERROR_DATA; } else if (s.extra) { res = (s.res != SZ_OK ? s.res : SZ_ERROR_DATA); } else { SizeT i; for (i = 0; i < outsize; i++) { - int sym = Ppmd7_DecodeSymbol(&ppmd, &rc.vt); + int sym = Ppmd7z_DecodeSymbol(&ppmd); if (s.extra || sym < 0) { break; } @@ -469,7 +476,7 @@ pylzma_ppmd_decompress(PyObject *self, PyObject *args) } if (i != outsize) { res = (s.res != SZ_OK ? 
s.res : SZ_ERROR_DATA); - } else if (s.processed + (s.cur - s.begin) != (UInt64)length || !Ppmd7z_RangeDec_IsFinishedOK(&rc)) { + } else if (s.processed + (s.cur - s.begin) != (UInt64)length || !Ppmd7z_RangeDec_IsFinishedOK(&ppmd.rc.dec)) { res = SZ_ERROR_DATA; } } @@ -498,7 +505,9 @@ methods[] = { {"bcj_x86_convert", (PyCFunction)pylzma_bcj_x86_convert, METH_VARARGS, (char *)&doc_bcj_x86_convert}, {"bcj_arm_convert", (PyCFunction)pylzma_bcj_arm_convert, METH_VARARGS, (char *)&doc_bcj_arm_convert}, {"bcj_armt_convert", (PyCFunction)pylzma_bcj_armt_convert, METH_VARARGS, (char *)&doc_bcj_armt_convert}, + {"bcj_arm64_convert", (PyCFunction)pylzma_bcj_arm64_convert, METH_VARARGS, (char *)&doc_bcj_arm64_convert}, {"bcj_ppc_convert", (PyCFunction)pylzma_bcj_ppc_convert, METH_VARARGS, (char *)&doc_bcj_ppc_convert}, + {"bcj_riscv_convert", (PyCFunction)pylzma_bcj_riscv_convert, METH_VARARGS, (char *)&doc_bcj_riscv_convert}, {"bcj_sparc_convert", (PyCFunction)pylzma_bcj_sparc_convert, METH_VARARGS, (char *)&doc_bcj_sparc_convert}, {"bcj_ia64_convert", (PyCFunction)pylzma_bcj_ia64_convert, METH_VARARGS, (char *)&doc_bcj_ia64_convert}, {"bcj2_decode", (PyCFunction)pylzma_bcj2_decode, METH_VARARGS, (char *)&doc_bcj2_decode}, diff --git a/src/sdk/C/7z.h b/src/sdk/C/7z.h index 6c7886e..9e27c01 100644 --- a/src/sdk/C/7z.h +++ b/src/sdk/C/7z.h @@ -1,8 +1,8 @@ /* 7z.h -- 7z interface -2017-04-03 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ -#ifndef __7Z_H -#define __7Z_H +#ifndef ZIP7_INC_7Z_H +#define ZIP7_INC_7Z_H #include "7zTypes.h" @@ -91,12 +91,14 @@ typedef struct UInt64 *CoderUnpackSizes; // for all coders in all folders Byte *CodersData; + + UInt64 RangeLimit; } CSzAr; UInt64 SzAr_GetFolderUnpackSize(const CSzAr *p, UInt32 folderIndex); SRes SzAr_DecodeFolder(const CSzAr *p, UInt32 folderIndex, - ILookInStream *stream, UInt64 startPos, + ILookInStreamPtr stream, UInt64 startPos, Byte *outBuffer, size_t outSize, ISzAllocPtr allocMain); @@ -172,7 +174,7 @@ UInt16 *SzArEx_GetFullNameUtf16_Back(const CSzArEx *p, size_t fileIndex, UInt16 SRes SzArEx_Extract( const CSzArEx *db, - ILookInStream *inStream, + ILookInStreamPtr inStream, UInt32 fileIndex, /* index of file */ UInt32 *blockIndex, /* index of solid block */ Byte **outBuffer, /* pointer to pointer to output buffer (allocated with allocMain) */ @@ -194,7 +196,7 @@ SZ_ERROR_INPUT_EOF SZ_ERROR_FAIL */ -SRes SzArEx_Open(CSzArEx *p, ILookInStream *inStream, +SRes SzArEx_Open(CSzArEx *p, ILookInStreamPtr inStream, ISzAllocPtr allocMain, ISzAllocPtr allocTemp); EXTERN_C_END diff --git a/src/sdk/C/7zAlloc.c b/src/sdk/C/7zAlloc.c index c924a52..2f0659a 100644 --- a/src/sdk/C/7zAlloc.c +++ b/src/sdk/C/7zAlloc.c @@ -1,5 +1,5 @@ -/* 7zAlloc.c -- Allocation functions -2017-04-03 : Igor Pavlov : Public domain */ +/* 7zAlloc.c -- Allocation functions for 7z processing +2023-03-04 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -7,74 +7,83 @@ #include "7zAlloc.h" -/* #define _SZ_ALLOC_DEBUG */ -/* use _SZ_ALLOC_DEBUG to debug alloc/free operations */ +/* #define SZ_ALLOC_DEBUG */ +/* use SZ_ALLOC_DEBUG to debug alloc/free operations */ -#ifdef _SZ_ALLOC_DEBUG +#ifdef SZ_ALLOC_DEBUG +/* #ifdef _WIN32 -#include +#include "7zWindows.h" #endif +*/ #include -int g_allocCount = 0; -int g_allocCountTemp = 0; +static int g_allocCount = 0; +static int g_allocCountTemp = 0; +static void Print_Alloc(const char *s, size_t size, int *counter) +{ + const unsigned size2 = (unsigned)size; + fprintf(stderr, "\n%s count = %10d : %10u 
bytes; ", s, *counter, size2); + (*counter)++; +} +static void Print_Free(const char *s, int *counter) +{ + (*counter)--; + fprintf(stderr, "\n%s count = %10d", s, *counter); +} #endif void *SzAlloc(ISzAllocPtr p, size_t size) { - UNUSED_VAR(p); + UNUSED_VAR(p) if (size == 0) return 0; - #ifdef _SZ_ALLOC_DEBUG - fprintf(stderr, "\nAlloc %10u bytes; count = %10d", (unsigned)size, g_allocCount); - g_allocCount++; + #ifdef SZ_ALLOC_DEBUG + Print_Alloc("Alloc", size, &g_allocCount); #endif return malloc(size); } void SzFree(ISzAllocPtr p, void *address) { - UNUSED_VAR(p); - #ifdef _SZ_ALLOC_DEBUG - if (address != 0) - { - g_allocCount--; - fprintf(stderr, "\nFree; count = %10d", g_allocCount); - } + UNUSED_VAR(p) + #ifdef SZ_ALLOC_DEBUG + if (address) + Print_Free("Free ", &g_allocCount); #endif free(address); } void *SzAllocTemp(ISzAllocPtr p, size_t size) { - UNUSED_VAR(p); + UNUSED_VAR(p) if (size == 0) return 0; - #ifdef _SZ_ALLOC_DEBUG - fprintf(stderr, "\nAlloc_temp %10u bytes; count = %10d", (unsigned)size, g_allocCountTemp); - g_allocCountTemp++; + #ifdef SZ_ALLOC_DEBUG + Print_Alloc("Alloc_temp", size, &g_allocCountTemp); + /* #ifdef _WIN32 return HeapAlloc(GetProcessHeap(), 0, size); #endif + */ #endif return malloc(size); } void SzFreeTemp(ISzAllocPtr p, void *address) { - UNUSED_VAR(p); - #ifdef _SZ_ALLOC_DEBUG - if (address != 0) - { - g_allocCountTemp--; - fprintf(stderr, "\nFree_temp; count = %10d", g_allocCountTemp); - } + UNUSED_VAR(p) + #ifdef SZ_ALLOC_DEBUG + if (address) + Print_Free("Free_temp ", &g_allocCountTemp); + /* #ifdef _WIN32 HeapFree(GetProcessHeap(), 0, address); return; #endif + */ #endif free(address); } diff --git a/src/sdk/C/7zAlloc.h b/src/sdk/C/7zAlloc.h index 44778f9..b2b8b0c 100644 --- a/src/sdk/C/7zAlloc.h +++ b/src/sdk/C/7zAlloc.h @@ -1,8 +1,8 @@ /* 7zAlloc.h -- Allocation functions -2017-04-03 : Igor Pavlov : Public domain */ +2023-03-04 : Igor Pavlov : Public domain */ -#ifndef __7Z_ALLOC_H -#define __7Z_ALLOC_H +#ifndef ZIP7_INC_7Z_ALLOC_H +#define ZIP7_INC_7Z_ALLOC_H #include "7zTypes.h" diff --git a/src/sdk/C/7zArcIn.c b/src/sdk/C/7zArcIn.c index f74d0fa..23f2949 100644 --- a/src/sdk/C/7zArcIn.c +++ b/src/sdk/C/7zArcIn.c @@ -1,5 +1,5 @@ /* 7zArcIn.c -- 7z Input functions -2018-12-31 : Igor Pavlov : Public domain */ +2023-09-07 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -10,10 +10,11 @@ #include "7zCrc.h" #include "CpuArch.h" -#define MY_ALLOC(T, p, size, alloc) { \ - if ((p = (T *)ISzAlloc_Alloc(alloc, (size) * sizeof(T))) == NULL) return SZ_ERROR_MEM; } +#define MY_ALLOC(T, p, size, alloc) \ + { if ((p = (T *)ISzAlloc_Alloc(alloc, (size) * sizeof(T))) == NULL) return SZ_ERROR_MEM; } -#define MY_ALLOC_ZE(T, p, size, alloc) { if ((size) == 0) p = NULL; else MY_ALLOC(T, p, size, alloc) } +#define MY_ALLOC_ZE(T, p, size, alloc) \ + { if ((size) == 0) p = NULL; else MY_ALLOC(T, p, size, alloc) } #define MY_ALLOC_AND_CPY(to, size, from, alloc) \ { MY_ALLOC(Byte, to, size, alloc); memcpy(to, from, size); } @@ -58,7 +59,7 @@ enum EIdEnum const Byte k7zSignature[k7zSignatureSize] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C}; -#define SzBitUi32s_Init(p) { (p)->Defs = NULL; (p)->Vals = NULL; } +#define SzBitUi32s_INIT(p) { (p)->Defs = NULL; (p)->Vals = NULL; } static SRes SzBitUi32s_Alloc(CSzBitUi32s *p, size_t num, ISzAllocPtr alloc) { @@ -69,21 +70,21 @@ static SRes SzBitUi32s_Alloc(CSzBitUi32s *p, size_t num, ISzAllocPtr alloc) } else { - MY_ALLOC(Byte, p->Defs, (num + 7) >> 3, alloc); - MY_ALLOC(UInt32, p->Vals, num, alloc); + MY_ALLOC(Byte, 
p->Defs, (num + 7) >> 3, alloc) + MY_ALLOC(UInt32, p->Vals, num, alloc) } return SZ_OK; } -void SzBitUi32s_Free(CSzBitUi32s *p, ISzAllocPtr alloc) +static void SzBitUi32s_Free(CSzBitUi32s *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->Defs); p->Defs = NULL; ISzAlloc_Free(alloc, p->Vals); p->Vals = NULL; } -#define SzBitUi64s_Init(p) { (p)->Defs = NULL; (p)->Vals = NULL; } +#define SzBitUi64s_INIT(p) { (p)->Defs = NULL; (p)->Vals = NULL; } -void SzBitUi64s_Free(CSzBitUi64s *p, ISzAllocPtr alloc) +static void SzBitUi64s_Free(CSzBitUi64s *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->Defs); p->Defs = NULL; ISzAlloc_Free(alloc, p->Vals); p->Vals = NULL; @@ -96,7 +97,7 @@ static void SzAr_Init(CSzAr *p) p->NumFolders = 0; p->PackPositions = NULL; - SzBitUi32s_Init(&p->FolderCRCs); + SzBitUi32s_INIT(&p->FolderCRCs) p->FoCodersOffsets = NULL; p->FoStartPackStreamIndex = NULL; @@ -105,6 +106,8 @@ static void SzAr_Init(CSzAr *p) p->CoderUnpackSizes = NULL; p->CodersData = NULL; + + p->RangeLimit = 0; } static void SzAr_Free(CSzAr *p, ISzAllocPtr alloc) @@ -140,11 +143,11 @@ void SzArEx_Init(CSzArEx *p) p->FileNameOffsets = NULL; p->FileNames = NULL; - SzBitUi32s_Init(&p->CRCs); - SzBitUi32s_Init(&p->Attribs); - // SzBitUi32s_Init(&p->Parents); - SzBitUi64s_Init(&p->MTime); - SzBitUi64s_Init(&p->CTime); + SzBitUi32s_INIT(&p->CRCs) + SzBitUi32s_INIT(&p->Attribs) + // SzBitUi32s_INIT(&p->Parents) + SzBitUi64s_INIT(&p->MTime) + SzBitUi64s_INIT(&p->CTime) } void SzArEx_Free(CSzArEx *p, ISzAllocPtr alloc) @@ -178,11 +181,20 @@ static int TestSignatureCandidate(const Byte *testBytes) return 1; } -#define SzData_Clear(p) { (p)->Data = NULL; (p)->Size = 0; } +#define SzData_CLEAR(p) { (p)->Data = NULL; (p)->Size = 0; } + +#define SZ_READ_BYTE_SD_NOCHECK(_sd_, dest) \ + (_sd_)->Size--; dest = *(_sd_)->Data++; + +#define SZ_READ_BYTE_SD(_sd_, dest) \ + if ((_sd_)->Size == 0) return SZ_ERROR_ARCHIVE; \ + SZ_READ_BYTE_SD_NOCHECK(_sd_, dest) -#define SZ_READ_BYTE_SD(_sd_, dest) if ((_sd_)->Size == 0) return SZ_ERROR_ARCHIVE; (_sd_)->Size--; dest = *(_sd_)->Data++; #define SZ_READ_BYTE(dest) SZ_READ_BYTE_SD(sd, dest) -#define SZ_READ_BYTE_2(dest) if (sd.Size == 0) return SZ_ERROR_ARCHIVE; sd.Size--; dest = *sd.Data++; + +#define SZ_READ_BYTE_2(dest) \ + if (sd.Size == 0) return SZ_ERROR_ARCHIVE; \ + sd.Size--; dest = *sd.Data++; #define SKIP_DATA(sd, size) { sd->Size -= (size_t)(size); sd->Data += (size_t)(size); } #define SKIP_DATA2(sd, size) { sd.Size -= (size_t)(size); sd.Data += (size_t)(size); } @@ -190,25 +202,25 @@ static int TestSignatureCandidate(const Byte *testBytes) #define SZ_READ_32(dest) if (sd.Size < 4) return SZ_ERROR_ARCHIVE; \ dest = GetUi32(sd.Data); SKIP_DATA2(sd, 4); -static MY_NO_INLINE SRes ReadNumber(CSzData *sd, UInt64 *value) +static Z7_NO_INLINE SRes ReadNumber(CSzData *sd, UInt64 *value) { Byte firstByte, mask; unsigned i; UInt32 v; - SZ_READ_BYTE(firstByte); + SZ_READ_BYTE(firstByte) if ((firstByte & 0x80) == 0) { *value = firstByte; return SZ_OK; } - SZ_READ_BYTE(v); + SZ_READ_BYTE(v) if ((firstByte & 0x40) == 0) { *value = (((UInt32)firstByte & 0x3F) << 8) | v; return SZ_OK; } - SZ_READ_BYTE(mask); + SZ_READ_BYTE(mask) *value = v | ((UInt32)mask << 8); mask = 0x20; for (i = 2; i < 8; i++) @@ -216,11 +228,11 @@ static MY_NO_INLINE SRes ReadNumber(CSzData *sd, UInt64 *value) Byte b; if ((firstByte & mask) == 0) { - UInt64 highPart = (unsigned)firstByte & (unsigned)(mask - 1); + const UInt64 highPart = (unsigned)firstByte & (unsigned)(mask - 1); *value |= (highPart << (8 * i)); 
return SZ_OK; } - SZ_READ_BYTE(b); + SZ_READ_BYTE(b) *value |= ((UInt64)b << (8 * i)); mask >>= 1; } @@ -228,7 +240,7 @@ static MY_NO_INLINE SRes ReadNumber(CSzData *sd, UInt64 *value) } -static MY_NO_INLINE SRes SzReadNumber32(CSzData *sd, UInt32 *value) +static Z7_NO_INLINE SRes SzReadNumber32(CSzData *sd, UInt32 *value) { Byte firstByte; UInt64 value64; @@ -242,7 +254,7 @@ static MY_NO_INLINE SRes SzReadNumber32(CSzData *sd, UInt32 *value) sd->Size--; return SZ_OK; } - RINOK(ReadNumber(sd, &value64)); + RINOK(ReadNumber(sd, &value64)) if (value64 >= (UInt32)0x80000000 - 1) return SZ_ERROR_UNSUPPORTED; if (value64 >= ((UInt64)(1) << ((sizeof(size_t) - 1) * 8 + 4))) @@ -256,10 +268,10 @@ static MY_NO_INLINE SRes SzReadNumber32(CSzData *sd, UInt32 *value) static SRes SkipData(CSzData *sd) { UInt64 size; - RINOK(ReadNumber(sd, &size)); + RINOK(ReadNumber(sd, &size)) if (size > sd->Size) return SZ_ERROR_ARCHIVE; - SKIP_DATA(sd, size); + SKIP_DATA(sd, size) return SZ_OK; } @@ -268,28 +280,28 @@ static SRes WaitId(CSzData *sd, UInt32 id) for (;;) { UInt64 type; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) if (type == id) return SZ_OK; if (type == k7zIdEnd) return SZ_ERROR_ARCHIVE; - RINOK(SkipData(sd)); + RINOK(SkipData(sd)) } } static SRes RememberBitVector(CSzData *sd, UInt32 numItems, const Byte **v) { - UInt32 numBytes = (numItems + 7) >> 3; + const UInt32 numBytes = (numItems + 7) >> 3; if (numBytes > sd->Size) return SZ_ERROR_ARCHIVE; *v = sd->Data; - SKIP_DATA(sd, numBytes); + SKIP_DATA(sd, numBytes) return SZ_OK; } static UInt32 CountDefinedBits(const Byte *bits, UInt32 numItems) { - Byte b = 0; + unsigned b = 0; unsigned m = 0; UInt32 sum = 0; for (; numItems != 0; numItems--) @@ -300,53 +312,53 @@ static UInt32 CountDefinedBits(const Byte *bits, UInt32 numItems) m = 8; } m--; - sum += ((b >> m) & 1); + sum += (UInt32)((b >> m) & 1); } return sum; } -static MY_NO_INLINE SRes ReadBitVector(CSzData *sd, UInt32 numItems, Byte **v, ISzAllocPtr alloc) +static Z7_NO_INLINE SRes ReadBitVector(CSzData *sd, UInt32 numItems, Byte **v, ISzAllocPtr alloc) { Byte allAreDefined; Byte *v2; - UInt32 numBytes = (numItems + 7) >> 3; + const UInt32 numBytes = (numItems + 7) >> 3; *v = NULL; - SZ_READ_BYTE(allAreDefined); + SZ_READ_BYTE(allAreDefined) if (numBytes == 0) return SZ_OK; if (allAreDefined == 0) { if (numBytes > sd->Size) return SZ_ERROR_ARCHIVE; - MY_ALLOC_AND_CPY(*v, numBytes, sd->Data, alloc); - SKIP_DATA(sd, numBytes); + MY_ALLOC_AND_CPY(*v, numBytes, sd->Data, alloc) + SKIP_DATA(sd, numBytes) return SZ_OK; } - MY_ALLOC(Byte, *v, numBytes, alloc); + MY_ALLOC(Byte, *v, numBytes, alloc) v2 = *v; memset(v2, 0xFF, (size_t)numBytes); { - unsigned numBits = (unsigned)numItems & 7; + const unsigned numBits = (unsigned)numItems & 7; if (numBits != 0) v2[(size_t)numBytes - 1] = (Byte)((((UInt32)1 << numBits) - 1) << (8 - numBits)); } return SZ_OK; } -static MY_NO_INLINE SRes ReadUi32s(CSzData *sd2, UInt32 numItems, CSzBitUi32s *crcs, ISzAllocPtr alloc) +static Z7_NO_INLINE SRes ReadUi32s(CSzData *sd2, UInt32 numItems, CSzBitUi32s *crcs, ISzAllocPtr alloc) { UInt32 i; CSzData sd; UInt32 *vals; const Byte *defs; - MY_ALLOC_ZE(UInt32, crcs->Vals, numItems, alloc); + MY_ALLOC_ZE(UInt32, crcs->Vals, numItems, alloc) sd = *sd2; defs = crcs->Defs; vals = crcs->Vals; for (i = 0; i < numItems; i++) if (SzBitArray_Check(defs, i)) { - SZ_READ_32(vals[i]); + SZ_READ_32(vals[i]) } else vals[i] = 0; @@ -357,7 +369,7 @@ static MY_NO_INLINE SRes ReadUi32s(CSzData *sd2, UInt32 numItems, CSzBitUi32s *c 
static SRes ReadBitUi32s(CSzData *sd, UInt32 numItems, CSzBitUi32s *crcs, ISzAllocPtr alloc) { SzBitUi32s_Free(crcs, alloc); - RINOK(ReadBitVector(sd, numItems, &crcs->Defs, alloc)); + RINOK(ReadBitVector(sd, numItems, &crcs->Defs, alloc)) return ReadUi32s(sd, numItems, crcs, alloc); } @@ -365,36 +377,36 @@ static SRes SkipBitUi32s(CSzData *sd, UInt32 numItems) { Byte allAreDefined; UInt32 numDefined = numItems; - SZ_READ_BYTE(allAreDefined); + SZ_READ_BYTE(allAreDefined) if (!allAreDefined) { - size_t numBytes = (numItems + 7) >> 3; + const size_t numBytes = (numItems + 7) >> 3; if (numBytes > sd->Size) return SZ_ERROR_ARCHIVE; numDefined = CountDefinedBits(sd->Data, numItems); - SKIP_DATA(sd, numBytes); + SKIP_DATA(sd, numBytes) } if (numDefined > (sd->Size >> 2)) return SZ_ERROR_ARCHIVE; - SKIP_DATA(sd, (size_t)numDefined * 4); + SKIP_DATA(sd, (size_t)numDefined * 4) return SZ_OK; } static SRes ReadPackInfo(CSzAr *p, CSzData *sd, ISzAllocPtr alloc) { - RINOK(SzReadNumber32(sd, &p->NumPackStreams)); + RINOK(SzReadNumber32(sd, &p->NumPackStreams)) - RINOK(WaitId(sd, k7zIdSize)); - MY_ALLOC(UInt64, p->PackPositions, (size_t)p->NumPackStreams + 1, alloc); + RINOK(WaitId(sd, k7zIdSize)) + MY_ALLOC(UInt64, p->PackPositions, (size_t)p->NumPackStreams + 1, alloc) { UInt64 sum = 0; UInt32 i; - UInt32 numPackStreams = p->NumPackStreams; + const UInt32 numPackStreams = p->NumPackStreams; for (i = 0; i < numPackStreams; i++) { UInt64 packSize; p->PackPositions[i] = sum; - RINOK(ReadNumber(sd, &packSize)); + RINOK(ReadNumber(sd, &packSize)) sum += packSize; if (sum < packSize) return SZ_ERROR_ARCHIVE; @@ -405,16 +417,16 @@ static SRes ReadPackInfo(CSzAr *p, CSzData *sd, ISzAllocPtr alloc) for (;;) { UInt64 type; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) if (type == k7zIdEnd) return SZ_OK; if (type == k7zIdCRC) { /* CRC of packed streams is unused now */ - RINOK(SkipBitUi32s(sd, p->NumPackStreams)); + RINOK(SkipBitUi32s(sd, p->NumPackStreams)) continue; } - RINOK(SkipData(sd)); + RINOK(SkipData(sd)) } } @@ -440,7 +452,7 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) f->NumPackStreams = 0; f->UnpackStream = 0; - RINOK(SzReadNumber32(sd, &numCoders)); + RINOK(SzReadNumber32(sd, &numCoders)) if (numCoders == 0 || numCoders > SZ_NUM_CODERS_IN_FOLDER_MAX) return SZ_ERROR_UNSUPPORTED; @@ -451,7 +463,7 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) unsigned idSize, j; UInt64 id; - SZ_READ_BYTE(mainByte); + SZ_READ_BYTE(mainByte) if ((mainByte & 0xC0) != 0) return SZ_ERROR_UNSUPPORTED; @@ -479,12 +491,12 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) { UInt32 numStreams; - RINOK(SzReadNumber32(sd, &numStreams)); + RINOK(SzReadNumber32(sd, &numStreams)) if (numStreams > k_NumCodersStreams_in_Folder_MAX) return SZ_ERROR_UNSUPPORTED; coder->NumStreams = (Byte)numStreams; - RINOK(SzReadNumber32(sd, &numStreams)); + RINOK(SzReadNumber32(sd, &numStreams)) if (numStreams != 1) return SZ_ERROR_UNSUPPORTED; } @@ -497,12 +509,12 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) if ((mainByte & 0x20) != 0) { UInt32 propsSize = 0; - RINOK(SzReadNumber32(sd, &propsSize)); + RINOK(SzReadNumber32(sd, &propsSize)) if (propsSize > sd->Size) return SZ_ERROR_ARCHIVE; if (propsSize >= 0x80) return SZ_ERROR_UNSUPPORTED; - coder->PropsOffset = sd->Data - dataStart; + coder->PropsOffset = (size_t)(sd->Data - dataStart); coder->PropsSize = (Byte)propsSize; sd->Data += (size_t)propsSize; sd->Size -= (size_t)propsSize; @@ -547,12 +559,12 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) 
{ CSzBond *bp = f->Bonds + i; - RINOK(SzReadNumber32(sd, &bp->InIndex)); + RINOK(SzReadNumber32(sd, &bp->InIndex)) if (bp->InIndex >= numInStreams || streamUsed[bp->InIndex]) return SZ_ERROR_ARCHIVE; streamUsed[bp->InIndex] = True; - RINOK(SzReadNumber32(sd, &bp->OutIndex)); + RINOK(SzReadNumber32(sd, &bp->OutIndex)) if (bp->OutIndex >= numCoders || coderUsed[bp->OutIndex]) return SZ_ERROR_ARCHIVE; coderUsed[bp->OutIndex] = True; @@ -582,7 +594,7 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) for (i = 0; i < numPackStreams; i++) { UInt32 index; - RINOK(SzReadNumber32(sd, &index)); + RINOK(SzReadNumber32(sd, &index)) if (index >= numInStreams || streamUsed[index]) return SZ_ERROR_ARCHIVE; streamUsed[index] = True; @@ -596,7 +608,7 @@ SRes SzGetNextFolderItem(CSzFolder *f, CSzData *sd) } -static MY_NO_INLINE SRes SkipNumbers(CSzData *sd2, UInt32 num) +static Z7_NO_INLINE SRes SkipNumbers(CSzData *sd2, UInt32 num) { CSzData sd; sd = *sd2; @@ -604,7 +616,7 @@ static MY_NO_INLINE SRes SkipNumbers(CSzData *sd2, UInt32 num) { Byte firstByte, mask; unsigned i; - SZ_READ_BYTE_2(firstByte); + SZ_READ_BYTE_2(firstByte) if ((firstByte & 0x80) == 0) continue; if ((firstByte & 0x40) == 0) @@ -620,7 +632,7 @@ static MY_NO_INLINE SRes SkipNumbers(CSzData *sd2, UInt32 num) mask >>= 1; if (i > sd.Size) return SZ_ERROR_ARCHIVE; - SKIP_DATA2(sd, i); + SKIP_DATA2(sd, i) } *sd2 = sd; return SZ_OK; @@ -643,30 +655,30 @@ static SRes ReadUnpackInfo(CSzAr *p, const Byte *startBufPtr; Byte external; - RINOK(WaitId(sd2, k7zIdFolder)); + RINOK(WaitId(sd2, k7zIdFolder)) - RINOK(SzReadNumber32(sd2, &numFolders)); + RINOK(SzReadNumber32(sd2, &numFolders)) if (numFolders > numFoldersMax) return SZ_ERROR_UNSUPPORTED; p->NumFolders = numFolders; - SZ_READ_BYTE_SD(sd2, external); + SZ_READ_BYTE_SD(sd2, external) if (external == 0) sd = *sd2; else { UInt32 index; - RINOK(SzReadNumber32(sd2, &index)); + RINOK(SzReadNumber32(sd2, &index)) if (index >= numTempBufs) return SZ_ERROR_ARCHIVE; sd.Data = tempBufs[index].data; sd.Size = tempBufs[index].size; } - MY_ALLOC(size_t, p->FoCodersOffsets, (size_t)numFolders + 1, alloc); - MY_ALLOC(UInt32, p->FoStartPackStreamIndex, (size_t)numFolders + 1, alloc); - MY_ALLOC(UInt32, p->FoToCoderUnpackSizes, (size_t)numFolders + 1, alloc); - MY_ALLOC_ZE(Byte, p->FoToMainUnpackSizeIndex, (size_t)numFolders, alloc); + MY_ALLOC(size_t, p->FoCodersOffsets, (size_t)numFolders + 1, alloc) + MY_ALLOC(UInt32, p->FoStartPackStreamIndex, (size_t)numFolders + 1, alloc) + MY_ALLOC(UInt32, p->FoToCoderUnpackSizes, (size_t)numFolders + 1, alloc) + MY_ALLOC_ZE(Byte, p->FoToMainUnpackSizeIndex, (size_t)numFolders, alloc) startBufPtr = sd.Data; @@ -677,9 +689,9 @@ static SRes ReadUnpackInfo(CSzAr *p, { UInt32 numCoders, ci, numInStreams = 0; - p->FoCodersOffsets[fo] = sd.Data - startBufPtr; + p->FoCodersOffsets[fo] = (size_t)(sd.Data - startBufPtr); - RINOK(SzReadNumber32(&sd, &numCoders)); + RINOK(SzReadNumber32(&sd, &numCoders)) if (numCoders == 0 || numCoders > k_Scan_NumCoders_MAX) return SZ_ERROR_UNSUPPORTED; @@ -689,7 +701,7 @@ static SRes ReadUnpackInfo(CSzAr *p, unsigned idSize; UInt32 coderInStreams; - SZ_READ_BYTE_2(mainByte); + SZ_READ_BYTE_2(mainByte) if ((mainByte & 0xC0) != 0) return SZ_ERROR_UNSUPPORTED; idSize = (mainByte & 0xF); @@ -697,15 +709,15 @@ static SRes ReadUnpackInfo(CSzAr *p, return SZ_ERROR_UNSUPPORTED; if (idSize > sd.Size) return SZ_ERROR_ARCHIVE; - SKIP_DATA2(sd, idSize); + SKIP_DATA2(sd, idSize) coderInStreams = 1; if ((mainByte & 0x10) != 0) { UInt32 coderOutStreams; - 
RINOK(SzReadNumber32(&sd, &coderInStreams)); - RINOK(SzReadNumber32(&sd, &coderOutStreams)); + RINOK(SzReadNumber32(&sd, &coderInStreams)) + RINOK(SzReadNumber32(&sd, &coderOutStreams)) if (coderInStreams > k_Scan_NumCodersStreams_in_Folder_MAX || coderOutStreams != 1) return SZ_ERROR_UNSUPPORTED; } @@ -715,10 +727,10 @@ static SRes ReadUnpackInfo(CSzAr *p, if ((mainByte & 0x20) != 0) { UInt32 propsSize; - RINOK(SzReadNumber32(&sd, &propsSize)); + RINOK(SzReadNumber32(&sd, &propsSize)) if (propsSize > sd.Size) return SZ_ERROR_ARCHIVE; - SKIP_DATA2(sd, propsSize); + SKIP_DATA2(sd, propsSize) } } @@ -732,7 +744,7 @@ static SRes ReadUnpackInfo(CSzAr *p, Byte coderUsed[k_Scan_NumCoders_MAX]; UInt32 i; - UInt32 numBonds = numCoders - 1; + const UInt32 numBonds = numCoders - 1; if (numInStreams < numBonds) return SZ_ERROR_ARCHIVE; @@ -748,12 +760,12 @@ static SRes ReadUnpackInfo(CSzAr *p, { UInt32 index; - RINOK(SzReadNumber32(&sd, &index)); + RINOK(SzReadNumber32(&sd, &index)) if (index >= numInStreams || streamUsed[index]) return SZ_ERROR_ARCHIVE; streamUsed[index] = True; - RINOK(SzReadNumber32(&sd, &index)); + RINOK(SzReadNumber32(&sd, &index)) if (index >= numCoders || coderUsed[index]) return SZ_ERROR_ARCHIVE; coderUsed[index] = True; @@ -765,7 +777,7 @@ static SRes ReadUnpackInfo(CSzAr *p, for (i = 0; i < numPackStreams; i++) { UInt32 index; - RINOK(SzReadNumber32(&sd, &index)); + RINOK(SzReadNumber32(&sd, &index)) if (index >= numInStreams || streamUsed[index]) return SZ_ERROR_ARCHIVE; streamUsed[index] = True; @@ -797,10 +809,10 @@ static SRes ReadUnpackInfo(CSzAr *p, p->FoToCoderUnpackSizes[fo] = numCodersOutStreams; { - size_t dataSize = sd.Data - startBufPtr; + const size_t dataSize = (size_t)(sd.Data - startBufPtr); p->FoStartPackStreamIndex[fo] = packStreamIndex; p->FoCodersOffsets[fo] = dataSize; - MY_ALLOC_ZE_AND_CPY(p->CodersData, dataSize, startBufPtr, alloc); + MY_ALLOC_ZE_AND_CPY(p->CodersData, dataSize, startBufPtr, alloc) } if (external != 0) @@ -810,21 +822,21 @@ static SRes ReadUnpackInfo(CSzAr *p, sd = *sd2; } - RINOK(WaitId(&sd, k7zIdCodersUnpackSize)); + RINOK(WaitId(&sd, k7zIdCodersUnpackSize)) - MY_ALLOC_ZE(UInt64, p->CoderUnpackSizes, (size_t)numCodersOutStreams, alloc); + MY_ALLOC_ZE(UInt64, p->CoderUnpackSizes, (size_t)numCodersOutStreams, alloc) { UInt32 i; for (i = 0; i < numCodersOutStreams; i++) { - RINOK(ReadNumber(&sd, p->CoderUnpackSizes + i)); + RINOK(ReadNumber(&sd, p->CoderUnpackSizes + i)) } } for (;;) { UInt64 type; - RINOK(ReadID(&sd, &type)); + RINOK(ReadID(&sd, &type)) if (type == k7zIdEnd) { *sd2 = sd; @@ -832,10 +844,10 @@ static SRes ReadUnpackInfo(CSzAr *p, } if (type == k7zIdCRC) { - RINOK(ReadBitUi32s(&sd, numFolders, &p->FolderCRCs, alloc)); + RINOK(ReadBitUi32s(&sd, numFolders, &p->FolderCRCs, alloc)) continue; } - RINOK(SkipData(&sd)); + RINOK(SkipData(&sd)) } } @@ -860,13 +872,13 @@ static SRes ReadSubStreamsInfo(CSzAr *p, CSzData *sd, CSubStreamInfo *ssi) { UInt64 type = 0; UInt32 numSubDigests = 0; - UInt32 numFolders = p->NumFolders; + const UInt32 numFolders = p->NumFolders; UInt32 numUnpackStreams = numFolders; UInt32 numUnpackSizesInData = 0; for (;;) { - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) if (type == k7zIdNumUnpackStream) { UInt32 i; @@ -876,7 +888,7 @@ static SRes ReadSubStreamsInfo(CSzAr *p, CSzData *sd, CSubStreamInfo *ssi) for (i = 0; i < numFolders; i++) { UInt32 numStreams; - RINOK(SzReadNumber32(sd, &numStreams)); + RINOK(SzReadNumber32(sd, &numStreams)) if (numUnpackStreams > numUnpackStreams + numStreams) 
return SZ_ERROR_UNSUPPORTED; numUnpackStreams += numStreams; @@ -885,12 +897,12 @@ static SRes ReadSubStreamsInfo(CSzAr *p, CSzData *sd, CSubStreamInfo *ssi) if (numStreams != 1 || !SzBitWithVals_Check(&p->FolderCRCs, i)) numSubDigests += numStreams; } - ssi->sdNumSubStreams.Size = sd->Data - ssi->sdNumSubStreams.Data; + ssi->sdNumSubStreams.Size = (size_t)(sd->Data - ssi->sdNumSubStreams.Data); continue; } if (type == k7zIdCRC || type == k7zIdSize || type == k7zIdEnd) break; - RINOK(SkipData(sd)); + RINOK(SkipData(sd)) } if (!ssi->sdNumSubStreams.Data) @@ -906,9 +918,9 @@ static SRes ReadSubStreamsInfo(CSzAr *p, CSzData *sd, CSubStreamInfo *ssi) if (type == k7zIdSize) { ssi->sdSizes.Data = sd->Data; - RINOK(SkipNumbers(sd, numUnpackSizesInData)); - ssi->sdSizes.Size = sd->Data - ssi->sdSizes.Data; - RINOK(ReadID(sd, &type)); + RINOK(SkipNumbers(sd, numUnpackSizesInData)) + ssi->sdSizes.Size = (size_t)(sd->Data - ssi->sdSizes.Data); + RINOK(ReadID(sd, &type)) } for (;;) @@ -918,14 +930,14 @@ static SRes ReadSubStreamsInfo(CSzAr *p, CSzData *sd, CSubStreamInfo *ssi) if (type == k7zIdCRC) { ssi->sdCRCs.Data = sd->Data; - RINOK(SkipBitUi32s(sd, numSubDigests)); - ssi->sdCRCs.Size = sd->Data - ssi->sdCRCs.Data; + RINOK(SkipBitUi32s(sd, numSubDigests)) + ssi->sdCRCs.Size = (size_t)(sd->Data - ssi->sdCRCs.Data); } else { - RINOK(SkipData(sd)); + RINOK(SkipData(sd)) } - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) } } @@ -938,27 +950,31 @@ static SRes SzReadStreamsInfo(CSzAr *p, { UInt64 type; - SzData_Clear(&ssi->sdSizes); - SzData_Clear(&ssi->sdCRCs); - SzData_Clear(&ssi->sdNumSubStreams); + SzData_CLEAR(&ssi->sdSizes) + SzData_CLEAR(&ssi->sdCRCs) + SzData_CLEAR(&ssi->sdNumSubStreams) *dataOffset = 0; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) if (type == k7zIdPackInfo) { - RINOK(ReadNumber(sd, dataOffset)); - RINOK(ReadPackInfo(p, sd, alloc)); - RINOK(ReadID(sd, &type)); + RINOK(ReadNumber(sd, dataOffset)) + if (*dataOffset > p->RangeLimit) + return SZ_ERROR_ARCHIVE; + RINOK(ReadPackInfo(p, sd, alloc)) + if (p->PackPositions[p->NumPackStreams] > p->RangeLimit - *dataOffset) + return SZ_ERROR_ARCHIVE; + RINOK(ReadID(sd, &type)) } if (type == k7zIdUnpackInfo) { - RINOK(ReadUnpackInfo(p, sd, numFoldersMax, tempBufs, numTempBufs, alloc)); - RINOK(ReadID(sd, &type)); + RINOK(ReadUnpackInfo(p, sd, numFoldersMax, tempBufs, numTempBufs, alloc)) + RINOK(ReadID(sd, &type)) } if (type == k7zIdSubStreamsInfo) { - RINOK(ReadSubStreamsInfo(p, sd, ssi)); - RINOK(ReadID(sd, &type)); + RINOK(ReadSubStreamsInfo(p, sd, ssi)) + RINOK(ReadID(sd, &type)) } else { @@ -970,7 +986,7 @@ static SRes SzReadStreamsInfo(CSzAr *p, } static SRes SzReadAndDecodePackedStreams( - ILookInStream *inStream, + ILookInStreamPtr inStream, CSzData *sd, CBuf *tempBufs, UInt32 numFoldersMax, @@ -982,7 +998,7 @@ static SRes SzReadAndDecodePackedStreams( UInt32 fo; CSubStreamInfo ssi; - RINOK(SzReadStreamsInfo(p, sd, numFoldersMax, NULL, 0, &dataStartPos, &ssi, allocTemp)); + RINOK(SzReadStreamsInfo(p, sd, numFoldersMax, NULL, 0, &dataStartPos, &ssi, allocTemp)) dataStartPos += baseOffset; if (p->NumFolders == 0) @@ -994,7 +1010,7 @@ static SRes SzReadAndDecodePackedStreams( for (fo = 0; fo < p->NumFolders; fo++) { CBuf *tempBuf = tempBufs + fo; - UInt64 unpackSize = SzAr_GetFolderUnpackSize(p, fo); + const UInt64 unpackSize = SzAr_GetFolderUnpackSize(p, fo); if ((size_t)unpackSize != unpackSize) return SZ_ERROR_MEM; if (!Buf_Create(tempBuf, (size_t)unpackSize, allocTemp)) @@ -1004,8 +1020,8 @@ static SRes 
SzReadAndDecodePackedStreams( for (fo = 0; fo < p->NumFolders; fo++) { const CBuf *tempBuf = tempBufs + fo; - RINOK(LookInStream_SeekTo(inStream, dataStartPos)); - RINOK(SzAr_DecodeFolder(p, fo, inStream, dataStartPos, tempBuf->data, tempBuf->size, allocTemp)); + RINOK(LookInStream_SeekTo(inStream, dataStartPos)) + RINOK(SzAr_DecodeFolder(p, fo, inStream, dataStartPos, tempBuf->data, tempBuf->size, allocTemp)) } return SZ_OK; @@ -1028,19 +1044,19 @@ static SRes SzReadFileNames(const Byte *data, size_t size, UInt32 numFiles, size return SZ_ERROR_ARCHIVE; for (p = data + pos; #ifdef _WIN32 - *(const UInt16 *)p != 0 + *(const UInt16 *)(const void *)p != 0 #else p[0] != 0 || p[1] != 0 #endif ; p += 2); - pos = p - data + 2; + pos = (size_t)(p - data) + 2; *offsets++ = (pos >> 1); } while (--numFiles); return (pos == size) ? SZ_OK : SZ_ERROR_ARCHIVE; } -static MY_NO_INLINE SRes ReadTime(CSzBitUi64s *p, UInt32 num, +static Z7_NO_INLINE SRes ReadTime(CSzBitUi64s *p, UInt32 num, CSzData *sd2, const CBuf *tempBufs, UInt32 numTempBufs, ISzAllocPtr alloc) @@ -1051,22 +1067,22 @@ static MY_NO_INLINE SRes ReadTime(CSzBitUi64s *p, UInt32 num, Byte *defs; Byte external; - RINOK(ReadBitVector(sd2, num, &p->Defs, alloc)); + RINOK(ReadBitVector(sd2, num, &p->Defs, alloc)) - SZ_READ_BYTE_SD(sd2, external); + SZ_READ_BYTE_SD(sd2, external) if (external == 0) sd = *sd2; else { UInt32 index; - RINOK(SzReadNumber32(sd2, &index)); + RINOK(SzReadNumber32(sd2, &index)) if (index >= numTempBufs) return SZ_ERROR_ARCHIVE; sd.Data = tempBufs[index].data; sd.Size = tempBufs[index].size; } - MY_ALLOC_ZE(CNtfsFileTime, p->Vals, num, alloc); + MY_ALLOC_ZE(CNtfsFileTime, p->Vals, num, alloc) vals = p->Vals; defs = p->Defs; for (i = 0; i < num; i++) @@ -1076,7 +1092,7 @@ static MY_NO_INLINE SRes ReadTime(CSzBitUi64s *p, UInt32 num, return SZ_ERROR_ARCHIVE; vals[i].Low = GetUi32(sd.Data); vals[i].High = GetUi32(sd.Data + 4); - SKIP_DATA2(sd, 8); + SKIP_DATA2(sd, 8) } else vals[i].High = vals[i].Low = 0; @@ -1094,7 +1110,7 @@ static MY_NO_INLINE SRes ReadTime(CSzBitUi64s *p, UInt32 num, static SRes SzReadHeader2( CSzArEx *p, /* allocMain */ CSzData *sd, - ILookInStream *inStream, + ILookInStreamPtr inStream, CBuf *tempBufs, UInt32 *numTempBufs, ISzAllocPtr allocMain, ISzAllocPtr allocTemp @@ -1105,26 +1121,26 @@ static SRes SzReadHeader2( { UInt64 type; - SzData_Clear(&ssi.sdSizes); - SzData_Clear(&ssi.sdCRCs); - SzData_Clear(&ssi.sdNumSubStreams); + SzData_CLEAR(&ssi.sdSizes) + SzData_CLEAR(&ssi.sdCRCs) + SzData_CLEAR(&ssi.sdNumSubStreams) ssi.NumSubDigests = 0; ssi.NumTotalSubStreams = 0; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) if (type == k7zIdArchiveProperties) { for (;;) { UInt64 type2; - RINOK(ReadID(sd, &type2)); + RINOK(ReadID(sd, &type2)) if (type2 == k7zIdEnd) break; - RINOK(SkipData(sd)); + RINOK(SkipData(sd)) } - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) } if (type == k7zIdAdditionalStreamsInfo) @@ -1133,6 +1149,8 @@ static SRes SzReadHeader2( SRes res; SzAr_Init(&tempAr); + tempAr.RangeLimit = p->db.RangeLimit; + res = SzReadAndDecodePackedStreams(inStream, sd, tempBufs, NUM_ADDITIONAL_STREAMS_MAX, p->startPosAfterHeader, &tempAr, allocTemp); *numTempBufs = tempAr.NumFolders; @@ -1140,15 +1158,15 @@ static SRes SzReadHeader2( if (res != SZ_OK) return res; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) } if (type == k7zIdMainStreamsInfo) { RINOK(SzReadStreamsInfo(&p->db, sd, (UInt32)1 << 30, tempBufs, *numTempBufs, - &p->dataPos, &ssi, allocMain)); + &p->dataPos, &ssi, allocMain)) 
p->dataPos += p->startPosAfterHeader; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) } if (type == k7zIdEnd) @@ -1166,23 +1184,23 @@ static SRes SzReadHeader2( const Byte *emptyStreams = NULL; const Byte *emptyFiles = NULL; - RINOK(SzReadNumber32(sd, &numFiles)); + RINOK(SzReadNumber32(sd, &numFiles)) p->NumFiles = numFiles; for (;;) { UInt64 type; UInt64 size; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) if (type == k7zIdEnd) break; - RINOK(ReadNumber(sd, &size)); + RINOK(ReadNumber(sd, &size)) if (size > sd->Size) return SZ_ERROR_ARCHIVE; if (type >= ((UInt32)1 << 8)) { - SKIP_DATA(sd, size); + SKIP_DATA(sd, size) } else switch ((unsigned)type) { @@ -1192,7 +1210,7 @@ static SRes SzReadHeader2( const Byte *namesData; Byte external; - SZ_READ_BYTE(external); + SZ_READ_BYTE(external) if (external == 0) { namesSize = (size_t)size - 1; @@ -1201,7 +1219,7 @@ static SRes SzReadHeader2( else { UInt32 index; - RINOK(SzReadNumber32(sd, &index)); + RINOK(SzReadNumber32(sd, &index)) if (index >= *numTempBufs) return SZ_ERROR_ARCHIVE; namesData = (tempBufs)[index].data; @@ -1210,25 +1228,25 @@ static SRes SzReadHeader2( if ((namesSize & 1) != 0) return SZ_ERROR_ARCHIVE; - MY_ALLOC(size_t, p->FileNameOffsets, numFiles + 1, allocMain); - MY_ALLOC_ZE_AND_CPY(p->FileNames, namesSize, namesData, allocMain); + MY_ALLOC(size_t, p->FileNameOffsets, numFiles + 1, allocMain) + MY_ALLOC_ZE_AND_CPY(p->FileNames, namesSize, namesData, allocMain) RINOK(SzReadFileNames(p->FileNames, namesSize, numFiles, p->FileNameOffsets)) if (external == 0) { - SKIP_DATA(sd, namesSize); + SKIP_DATA(sd, namesSize) } break; } case k7zIdEmptyStream: { - RINOK(RememberBitVector(sd, numFiles, &emptyStreams)); + RINOK(RememberBitVector(sd, numFiles, &emptyStreams)) numEmptyStreams = CountDefinedBits(emptyStreams, numFiles); emptyFiles = NULL; break; } case k7zIdEmptyFile: { - RINOK(RememberBitVector(sd, numEmptyStreams, &emptyFiles)); + RINOK(RememberBitVector(sd, numEmptyStreams, &emptyFiles)) break; } case k7zIdWinAttrib: @@ -1237,22 +1255,22 @@ static SRes SzReadHeader2( CSzData sdSwitch; CSzData *sdPtr; SzBitUi32s_Free(&p->Attribs, allocMain); - RINOK(ReadBitVector(sd, numFiles, &p->Attribs.Defs, allocMain)); + RINOK(ReadBitVector(sd, numFiles, &p->Attribs.Defs, allocMain)) - SZ_READ_BYTE(external); + SZ_READ_BYTE(external) if (external == 0) sdPtr = sd; else { UInt32 index; - RINOK(SzReadNumber32(sd, &index)); + RINOK(SzReadNumber32(sd, &index)) if (index >= *numTempBufs) return SZ_ERROR_ARCHIVE; sdSwitch.Data = (tempBufs)[index].data; sdSwitch.Size = (tempBufs)[index].size; sdPtr = &sdSwitch; } - RINOK(ReadUi32s(sdPtr, numFiles, &p->Attribs, allocMain)); + RINOK(ReadUi32s(sdPtr, numFiles, &p->Attribs, allocMain)) break; } /* @@ -1265,11 +1283,11 @@ static SRes SzReadHeader2( break; } */ - case k7zIdMTime: RINOK(ReadTime(&p->MTime, numFiles, sd, tempBufs, *numTempBufs, allocMain)); break; - case k7zIdCTime: RINOK(ReadTime(&p->CTime, numFiles, sd, tempBufs, *numTempBufs, allocMain)); break; + case k7zIdMTime: RINOK(ReadTime(&p->MTime, numFiles, sd, tempBufs, *numTempBufs, allocMain)) break; + case k7zIdCTime: RINOK(ReadTime(&p->CTime, numFiles, sd, tempBufs, *numTempBufs, allocMain)) break; default: { - SKIP_DATA(sd, size); + SKIP_DATA(sd, size) } } } @@ -1280,10 +1298,10 @@ static SRes SzReadHeader2( for (;;) { UInt64 type; - RINOK(ReadID(sd, &type)); + RINOK(ReadID(sd, &type)) if (type == k7zIdEnd) break; - RINOK(SkipData(sd)); + RINOK(SkipData(sd)) } { @@ -1295,40 +1313,37 @@ static SRes SzReadHeader2( UInt64 
unpackPos = 0; const Byte *digestsDefs = NULL; const Byte *digestsVals = NULL; - UInt32 digestsValsIndex = 0; - UInt32 digestIndex; - Byte allDigestsDefined = 0; + UInt32 digestIndex = 0; Byte isDirMask = 0; Byte crcMask = 0; Byte mask = 0x80; - MY_ALLOC(UInt32, p->FolderToFile, p->db.NumFolders + 1, allocMain); - MY_ALLOC_ZE(UInt32, p->FileToFolder, p->NumFiles, allocMain); - MY_ALLOC(UInt64, p->UnpackPositions, p->NumFiles + 1, allocMain); - MY_ALLOC_ZE(Byte, p->IsDirs, (p->NumFiles + 7) >> 3, allocMain); + MY_ALLOC(UInt32, p->FolderToFile, p->db.NumFolders + 1, allocMain) + MY_ALLOC_ZE(UInt32, p->FileToFolder, p->NumFiles, allocMain) + MY_ALLOC(UInt64, p->UnpackPositions, p->NumFiles + 1, allocMain) + MY_ALLOC_ZE(Byte, p->IsDirs, (p->NumFiles + 7) >> 3, allocMain) - RINOK(SzBitUi32s_Alloc(&p->CRCs, p->NumFiles, allocMain)); + RINOK(SzBitUi32s_Alloc(&p->CRCs, p->NumFiles, allocMain)) if (ssi.sdCRCs.Size != 0) { - SZ_READ_BYTE_SD(&ssi.sdCRCs, allDigestsDefined); + Byte allDigestsDefined = 0; + SZ_READ_BYTE_SD_NOCHECK(&ssi.sdCRCs, allDigestsDefined) if (allDigestsDefined) digestsVals = ssi.sdCRCs.Data; else { - size_t numBytes = (ssi.NumSubDigests + 7) >> 3; + const size_t numBytes = (ssi.NumSubDigests + 7) >> 3; digestsDefs = ssi.sdCRCs.Data; digestsVals = digestsDefs + numBytes; } } - digestIndex = 0; - for (i = 0; i < numFiles; i++, mask >>= 1) { if (mask == 0) { - UInt32 byteIndex = (i - 1) >> 3; + const UInt32 byteIndex = (i - 1) >> 3; p->IsDirs[byteIndex] = isDirMask; p->CRCs.Defs[byteIndex] = crcMask; isDirMask = 0; @@ -1366,18 +1381,17 @@ static SRes SzReadHeader2( numSubStreams = 1; if (ssi.sdNumSubStreams.Data) { - RINOK(SzReadNumber32(&ssi.sdNumSubStreams, &numSubStreams)); + RINOK(SzReadNumber32(&ssi.sdNumSubStreams, &numSubStreams)) } remSubStreams = numSubStreams; if (numSubStreams != 0) break; { - UInt64 folderUnpackSize = SzAr_GetFolderUnpackSize(&p->db, folderIndex); + const UInt64 folderUnpackSize = SzAr_GetFolderUnpackSize(&p->db, folderIndex); unpackPos += folderUnpackSize; if (unpackPos < folderUnpackSize) return SZ_ERROR_ARCHIVE; } - folderIndex++; } } @@ -1389,47 +1403,44 @@ static SRes SzReadHeader2( if (--remSubStreams == 0) { - UInt64 folderUnpackSize = SzAr_GetFolderUnpackSize(&p->db, folderIndex); - UInt64 startFolderUnpackPos = p->UnpackPositions[p->FolderToFile[folderIndex]]; + const UInt64 folderUnpackSize = SzAr_GetFolderUnpackSize(&p->db, folderIndex); + const UInt64 startFolderUnpackPos = p->UnpackPositions[p->FolderToFile[folderIndex]]; if (folderUnpackSize < unpackPos - startFolderUnpackPos) return SZ_ERROR_ARCHIVE; unpackPos = startFolderUnpackPos + folderUnpackSize; if (unpackPos < folderUnpackSize) return SZ_ERROR_ARCHIVE; - if (numSubStreams == 1 && SzBitWithVals_Check(&p->db.FolderCRCs, i)) + if (numSubStreams == 1 && SzBitWithVals_Check(&p->db.FolderCRCs, folderIndex)) { p->CRCs.Vals[i] = p->db.FolderCRCs.Vals[folderIndex]; crcMask |= mask; } - else if (allDigestsDefined || (digestsDefs && SzBitArray_Check(digestsDefs, digestIndex))) - { - p->CRCs.Vals[i] = GetUi32(digestsVals + (size_t)digestsValsIndex * 4); - digestsValsIndex++; - crcMask |= mask; - } - folderIndex++; } else { UInt64 v; - RINOK(ReadNumber(&ssi.sdSizes, &v)); + RINOK(ReadNumber(&ssi.sdSizes, &v)) unpackPos += v; if (unpackPos < v) return SZ_ERROR_ARCHIVE; - if (allDigestsDefined || (digestsDefs && SzBitArray_Check(digestsDefs, digestIndex))) + } + if ((crcMask & mask) == 0 && digestsVals) + { + if (!digestsDefs || SzBitArray_Check(digestsDefs, digestIndex)) { - p->CRCs.Vals[i] = 
GetUi32(digestsVals + (size_t)digestsValsIndex * 4); - digestsValsIndex++; + p->CRCs.Vals[i] = GetUi32(digestsVals); + digestsVals += 4; crcMask |= mask; } + digestIndex++; } } if (mask != 0x80) { - UInt32 byteIndex = (i - 1) >> 3; + const UInt32 byteIndex = (i - 1) >> 3; p->IsDirs[byteIndex] = isDirMask; p->CRCs.Defs[byteIndex] = crcMask; } @@ -1446,7 +1457,7 @@ static SRes SzReadHeader2( break; if (!ssi.sdNumSubStreams.Data) return SZ_ERROR_ARCHIVE; - RINOK(SzReadNumber32(&ssi.sdNumSubStreams, &numSubStreams)); + RINOK(SzReadNumber32(&ssi.sdNumSubStreams, &numSubStreams)) if (numSubStreams != 0) return SZ_ERROR_ARCHIVE; /* @@ -1471,7 +1482,7 @@ static SRes SzReadHeader2( static SRes SzReadHeader( CSzArEx *p, CSzData *sd, - ILookInStream *inStream, + ILookInStreamPtr inStream, ISzAllocPtr allocMain, ISzAllocPtr allocTemp) { @@ -1490,7 +1501,7 @@ static SRes SzReadHeader( for (i = 0; i < NUM_ADDITIONAL_STREAMS_MAX; i++) Buf_Free(tempBufs + i, allocTemp); - RINOK(res); + RINOK(res) if (sd->Size != 0) return SZ_ERROR_FAIL; @@ -1500,7 +1511,7 @@ static SRes SzReadHeader( static SRes SzArEx_Open2( CSzArEx *p, - ILookInStream *inStream, + ILookInStreamPtr inStream, ISzAllocPtr allocMain, ISzAllocPtr allocTemp) { @@ -1513,9 +1524,9 @@ static SRes SzArEx_Open2( SRes res; startArcPos = 0; - RINOK(ILookInStream_Seek(inStream, &startArcPos, SZ_SEEK_CUR)); + RINOK(ILookInStream_Seek(inStream, &startArcPos, SZ_SEEK_CUR)) - RINOK(LookInStream_Read2(inStream, header, k7zStartHeaderSize, SZ_ERROR_NO_ARCHIVE)); + RINOK(LookInStream_Read2(inStream, header, k7zStartHeaderSize, SZ_ERROR_NO_ARCHIVE)) if (!TestSignatureCandidate(header)) return SZ_ERROR_NO_ARCHIVE; @@ -1526,11 +1537,13 @@ static SRes SzArEx_Open2( nextHeaderSize = GetUi64(header + 20); nextHeaderCRC = GetUi32(header + 28); - p->startPosAfterHeader = startArcPos + k7zStartHeaderSize; + p->startPosAfterHeader = (UInt64)startArcPos + k7zStartHeaderSize; if (CrcCalc(header + 12, 20) != GetUi32(header + 8)) return SZ_ERROR_CRC; + p->db.RangeLimit = nextHeaderOffset; + nextHeaderSizeT = (size_t)nextHeaderSize; if (nextHeaderSizeT != nextHeaderSize) return SZ_ERROR_MEM; @@ -1542,14 +1555,14 @@ static SRes SzArEx_Open2( { Int64 pos = 0; - RINOK(ILookInStream_Seek(inStream, &pos, SZ_SEEK_END)); - if ((UInt64)pos < startArcPos + nextHeaderOffset || - (UInt64)pos < startArcPos + k7zStartHeaderSize + nextHeaderOffset || - (UInt64)pos < startArcPos + k7zStartHeaderSize + nextHeaderOffset + nextHeaderSize) + RINOK(ILookInStream_Seek(inStream, &pos, SZ_SEEK_END)) + if ((UInt64)pos < (UInt64)startArcPos + nextHeaderOffset || + (UInt64)pos < (UInt64)startArcPos + k7zStartHeaderSize + nextHeaderOffset || + (UInt64)pos < (UInt64)startArcPos + k7zStartHeaderSize + nextHeaderOffset + nextHeaderSize) return SZ_ERROR_INPUT_EOF; } - RINOK(LookInStream_SeekTo(inStream, startArcPos + k7zStartHeaderSize + nextHeaderOffset)); + RINOK(LookInStream_SeekTo(inStream, (UInt64)startArcPos + k7zStartHeaderSize + nextHeaderOffset)) if (!Buf_Create(&buf, nextHeaderSizeT, allocTemp)) return SZ_ERROR_MEM; @@ -1575,6 +1588,8 @@ static SRes SzArEx_Open2( Buf_Init(&tempBuf); SzAr_Init(&tempAr); + tempAr.RangeLimit = p->db.RangeLimit; + res = SzReadAndDecodePackedStreams(inStream, &sd, &tempBuf, 1, p->startPosAfterHeader, &tempAr, allocTemp); SzAr_Free(&tempAr, allocTemp); @@ -1622,10 +1637,10 @@ static SRes SzArEx_Open2( } -SRes SzArEx_Open(CSzArEx *p, ILookInStream *inStream, +SRes SzArEx_Open(CSzArEx *p, ILookInStreamPtr inStream, ISzAllocPtr allocMain, ISzAllocPtr allocTemp) { - 
SRes res = SzArEx_Open2(p, inStream, allocMain, allocTemp); + const SRes res = SzArEx_Open2(p, inStream, allocMain, allocTemp); if (res != SZ_OK) SzArEx_Free(p, allocMain); return res; @@ -1634,7 +1649,7 @@ SRes SzArEx_Open(CSzArEx *p, ILookInStream *inStream, SRes SzArEx_Extract( const CSzArEx *p, - ILookInStream *inStream, + ILookInStreamPtr inStream, UInt32 fileIndex, UInt32 *blockIndex, Byte **tempBuf, @@ -1644,7 +1659,7 @@ SRes SzArEx_Extract( ISzAllocPtr allocMain, ISzAllocPtr allocTemp) { - UInt32 folderIndex = p->FileToFolder[fileIndex]; + const UInt32 folderIndex = p->FileToFolder[fileIndex]; SRes res = SZ_OK; *offset = 0; @@ -1661,13 +1676,13 @@ SRes SzArEx_Extract( if (*tempBuf == NULL || *blockIndex != folderIndex) { - UInt64 unpackSizeSpec = SzAr_GetFolderUnpackSize(&p->db, folderIndex); + const UInt64 unpackSizeSpec = SzAr_GetFolderUnpackSize(&p->db, folderIndex); /* UInt64 unpackSizeSpec = p->UnpackPositions[p->FolderToFile[(size_t)folderIndex + 1]] - p->UnpackPositions[p->FolderToFile[folderIndex]]; */ - size_t unpackSize = (size_t)unpackSizeSpec; + const size_t unpackSize = (size_t)unpackSizeSpec; if (unpackSize != unpackSizeSpec) return SZ_ERROR_MEM; @@ -1695,7 +1710,7 @@ SRes SzArEx_Extract( if (res == SZ_OK) { - UInt64 unpackPos = p->UnpackPositions[fileIndex]; + const UInt64 unpackPos = p->UnpackPositions[fileIndex]; *offset = (size_t)(unpackPos - p->UnpackPositions[p->FolderToFile[folderIndex]]); *outSizeProcessed = (size_t)(p->UnpackPositions[(size_t)fileIndex + 1] - unpackPos); if (*offset + *outSizeProcessed > *outBufferSize) @@ -1711,8 +1726,8 @@ SRes SzArEx_Extract( size_t SzArEx_GetFileNameUtf16(const CSzArEx *p, size_t fileIndex, UInt16 *dest) { - size_t offs = p->FileNameOffsets[fileIndex]; - size_t len = p->FileNameOffsets[fileIndex + 1] - offs; + const size_t offs = p->FileNameOffsets[fileIndex]; + const size_t len = p->FileNameOffsets[fileIndex + 1] - offs; if (dest != 0) { size_t i; diff --git a/src/sdk/C/7zBuf.h b/src/sdk/C/7zBuf.h index 81d1b5b..c0ba8a7 100644 --- a/src/sdk/C/7zBuf.h +++ b/src/sdk/C/7zBuf.h @@ -1,8 +1,8 @@ /* 7zBuf.h -- Byte Buffer -2017-04-03 : Igor Pavlov : Public domain */ +2023-03-04 : Igor Pavlov : Public domain */ -#ifndef __7Z_BUF_H -#define __7Z_BUF_H +#ifndef ZIP7_INC_7Z_BUF_H +#define ZIP7_INC_7Z_BUF_H #include "7zTypes.h" diff --git a/src/sdk/C/7zCrc.c b/src/sdk/C/7zCrc.c index b4d84f0..6e2db9e 100644 --- a/src/sdk/C/7zCrc.c +++ b/src/sdk/C/7zCrc.c @@ -1,128 +1,420 @@ -/* 7zCrc.c -- CRC32 init -2017-06-06 : Igor Pavlov : Public domain */ +/* 7zCrc.c -- CRC32 calculation and init +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" #include "7zCrc.h" #include "CpuArch.h" -#define kCrcPoly 0xEDB88320 +// for debug: +// #define __ARM_FEATURE_CRC32 1 -#ifdef MY_CPU_LE - #define CRC_NUM_TABLES 8 +#ifdef __ARM_FEATURE_CRC32 +// #pragma message("__ARM_FEATURE_CRC32") +#define Z7_CRC_HW_FORCE +#endif + +// #define Z7_CRC_DEBUG_BE +#ifdef Z7_CRC_DEBUG_BE +#undef MY_CPU_LE +#define MY_CPU_BE +#endif + +#ifdef Z7_CRC_HW_FORCE + #define Z7_CRC_NUM_TABLES_USE 1 #else - #define CRC_NUM_TABLES 9 +#ifdef Z7_CRC_NUM_TABLES + #define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES +#else + #define Z7_CRC_NUM_TABLES_USE 12 +#endif +#endif - #define CRC_UINT32_SWAP(v) ((v >> 24) | ((v >> 8) & 0xFF00) | ((v << 8) & 0xFF0000) | (v << 24)) +#if Z7_CRC_NUM_TABLES_USE < 1 + #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES +#endif - UInt32 MY_FAST_CALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table); - UInt32 MY_FAST_CALL 
CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table); +#if defined(MY_CPU_LE) || (Z7_CRC_NUM_TABLES_USE == 1) + #define Z7_CRC_NUM_TABLES_TOTAL Z7_CRC_NUM_TABLES_USE +#else + #define Z7_CRC_NUM_TABLES_TOTAL (Z7_CRC_NUM_TABLES_USE + 1) #endif +#ifndef Z7_CRC_HW_FORCE + +#if Z7_CRC_NUM_TABLES_USE == 1 \ + || (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) +#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +#define Z7_CRC_UPDATE_T1_FUNC_NAME CrcUpdateGT1 +static UInt32 Z7_FASTCALL Z7_CRC_UPDATE_T1_FUNC_NAME(UInt32 v, const void *data, size_t size) +{ + const UInt32 *table = g_CrcTable; + const Byte *p = (const Byte *)data; + const Byte *lim = p + size; + for (; p != lim; p++) + v = CRC_UPDATE_BYTE_2(v, *p); + return v; +} +#endif + + +#if Z7_CRC_NUM_TABLES_USE != 1 #ifndef MY_CPU_BE - UInt32 MY_FAST_CALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table); - UInt32 MY_FAST_CALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table); + #define FUNC_NAME_LE_2(s) CrcUpdateT ## s + #define FUNC_NAME_LE_1(s) FUNC_NAME_LE_2(s) + #define FUNC_NAME_LE FUNC_NAME_LE_1(Z7_CRC_NUM_TABLES_USE) + UInt32 Z7_FASTCALL FUNC_NAME_LE (UInt32 v, const void *data, size_t size, const UInt32 *table); +#endif +#ifndef MY_CPU_LE + #define FUNC_NAME_BE_2(s) CrcUpdateT1_BeT ## s + #define FUNC_NAME_BE_1(s) FUNC_NAME_BE_2(s) + #define FUNC_NAME_BE FUNC_NAME_BE_1(Z7_CRC_NUM_TABLES_USE) + UInt32 Z7_FASTCALL FUNC_NAME_BE (UInt32 v, const void *data, size_t size, const UInt32 *table); +#endif #endif -typedef UInt32 (MY_FAST_CALL *CRC_FUNC)(UInt32 v, const void *data, size_t size, const UInt32 *table); +#endif // Z7_CRC_HW_FORCE + +/* ---------- hardware CRC ---------- */ + +#ifdef MY_CPU_LE + +#if defined(MY_CPU_ARM_OR_ARM64) +// #pragma message("ARM*") -CRC_FUNC g_CrcUpdateT4; -CRC_FUNC g_CrcUpdateT8; -CRC_FUNC g_CrcUpdate; + #if (defined(__clang__) && (__clang_major__ >= 3)) \ + || defined(__GNUC__) && (__GNUC__ >= 6) && defined(MY_CPU_ARM64) \ + || defined(__GNUC__) && (__GNUC__ >= 8) + #if !defined(__ARM_FEATURE_CRC32) +// #pragma message("!defined(__ARM_FEATURE_CRC32)") +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #define __ARM_FEATURE_CRC32 1 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER + #define Z7_ARM_FEATURE_CRC32_WAS_SET + #if defined(__clang__) + #if defined(MY_CPU_ARM64) + #define ATTRIB_CRC __attribute__((__target__("crc"))) + #else + #define ATTRIB_CRC __attribute__((__target__("armv8-a,crc"))) + #endif + #else + #if defined(MY_CPU_ARM64) +#if !defined(Z7_GCC_VERSION) || (Z7_GCC_VERSION >= 60000) + #define ATTRIB_CRC __attribute__((__target__("+crc"))) +#endif + #else +#if !defined(Z7_GCC_VERSION) || (__GNUC__ >= 8) +#if defined(__ARM_FP) && __GNUC__ >= 8 +// for -mfloat-abi=hard: similar to + #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc+simd"))) +#else + #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc"))) +#endif +#endif + #endif + #endif + #endif + #if defined(__ARM_FEATURE_CRC32) + // #pragma message("") +/* +arm_acle.h (GGC): + before Nov 17, 2017: +#ifdef __ARM_FEATURE_CRC32 -UInt32 g_CrcTable[256 * CRC_NUM_TABLES]; + Nov 17, 2017: gcc10.0 (gcc 9.2.0) checked" +#if __ARM_ARCH >= 8 +#pragma GCC target ("arch=armv8-a+crc") -UInt32 MY_FAST_CALL CrcUpdate(UInt32 v, const void *data, size_t size) + Aug 22, 2019: GCC 8.4?, 9.2.1, 10.1: +#ifdef __ARM_FEATURE_CRC32 +#ifdef __ARM_FP +#pragma GCC target ("arch=armv8-a+crc+simd") +#else +#pragma GCC target 
("arch=armv8-a+crc") +#endif +*/ +#if defined(__ARM_ARCH) && __ARM_ARCH < 8 +#if defined(Z7_GCC_VERSION) && (__GNUC__ == 8) && (Z7_GCC_VERSION < 80400) \ + || defined(Z7_GCC_VERSION) && (__GNUC__ == 9) && (Z7_GCC_VERSION < 90201) \ + || defined(Z7_GCC_VERSION) && (__GNUC__ == 10) && (Z7_GCC_VERSION < 100100) +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +// #pragma message("#define __ARM_ARCH 8") +#undef __ARM_ARCH +#define __ARM_ARCH 8 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif +#endif + #define Z7_CRC_HW_USE + #include + #endif + #elif defined(_MSC_VER) + #if defined(MY_CPU_ARM64) + #if (_MSC_VER >= 1910) + #ifdef __clang__ + // #define Z7_CRC_HW_USE + // #include + #else + #define Z7_CRC_HW_USE + #include + #endif + #endif + #endif + #endif + +#else // non-ARM* + +// #define Z7_CRC_HW_USE // for debug : we can test HW-branch of code +#ifdef Z7_CRC_HW_USE +#include "7zCrcEmu.h" +#endif + +#endif // non-ARM* + + + +#if defined(Z7_CRC_HW_USE) + +// #pragma message("USE ARM HW CRC") + +#ifdef MY_CPU_64BIT + #define CRC_HW_WORD_TYPE UInt64 + #define CRC_HW_WORD_FUNC __crc32d +#else + #define CRC_HW_WORD_TYPE UInt32 + #define CRC_HW_WORD_FUNC __crc32w +#endif + +#define CRC_HW_UNROLL_BYTES (sizeof(CRC_HW_WORD_TYPE) * 4) + +#ifdef ATTRIB_CRC + ATTRIB_CRC +#endif +Z7_NO_INLINE +#ifdef Z7_CRC_HW_FORCE + UInt32 Z7_FASTCALL CrcUpdate +#else + static UInt32 Z7_FASTCALL CrcUpdate_HW +#endif + (UInt32 v, const void *data, size_t size) { - return g_CrcUpdate(v, data, size, g_CrcTable); + const Byte *p = (const Byte *)data; + for (; size != 0 && ((unsigned)(ptrdiff_t)p & (CRC_HW_UNROLL_BYTES - 1)) != 0; size--) + v = __crc32b(v, *p++); + if (size >= CRC_HW_UNROLL_BYTES) + { + const Byte *lim = p + size; + size &= CRC_HW_UNROLL_BYTES - 1; + lim -= size; + do + { + v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p)); + v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE))); + p += 2 * sizeof(CRC_HW_WORD_TYPE); + v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p)); + v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE))); + p += 2 * sizeof(CRC_HW_WORD_TYPE); + } + while (p != lim); + } + + for (; size != 0; size--) + v = __crc32b(v, *p++); + + return v; } -UInt32 MY_FAST_CALL CrcCalc(const void *data, size_t size) +#ifdef Z7_ARM_FEATURE_CRC32_WAS_SET +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +#undef __ARM_FEATURE_CRC32 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#undef Z7_ARM_FEATURE_CRC32_WAS_SET +#endif + +#endif // defined(Z7_CRC_HW_USE) +#endif // MY_CPU_LE + + + +#ifndef Z7_CRC_HW_FORCE + +#if defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) +/* +typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_WITH_TABLE_FUNC) + (UInt32 v, const void *data, size_t size, const UInt32 *table); +Z7_CRC_UPDATE_WITH_TABLE_FUNC g_CrcUpdate; +*/ +static unsigned g_Crc_Algo; +#if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) +static unsigned g_Crc_Be; +#endif +#endif // defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) + + + +Z7_NO_INLINE +#ifdef Z7_CRC_HW_USE + static UInt32 Z7_FASTCALL CrcUpdate_Base +#else + UInt32 Z7_FASTCALL CrcUpdate +#endif + (UInt32 crc, const void *data, size_t size) { - return g_CrcUpdate(CRC_INIT_VAL, data, size, g_CrcTable) ^ CRC_INIT_VAL; +#if Z7_CRC_NUM_TABLES_USE == 1 + return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size); +#else // Z7_CRC_NUM_TABLES_USE != 1 +#ifdef Z7_CRC_UPDATE_T1_FUNC_NAME + if (g_Crc_Algo == 
1) + return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size); +#endif + +#ifdef MY_CPU_LE + return FUNC_NAME_LE(crc, data, size, g_CrcTable); +#elif defined(MY_CPU_BE) + return FUNC_NAME_BE(crc, data, size, g_CrcTable); +#else + if (g_Crc_Be) + return FUNC_NAME_BE(crc, data, size, g_CrcTable); + else + return FUNC_NAME_LE(crc, data, size, g_CrcTable); +#endif +#endif // Z7_CRC_NUM_TABLES_USE != 1 } -#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) -UInt32 MY_FAST_CALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table) +#ifdef Z7_CRC_HW_USE +Z7_NO_INLINE +UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size) { - const Byte *p = (const Byte *)data; - const Byte *pEnd = p + size; - for (; p != pEnd; p++) - v = CRC_UPDATE_BYTE_2(v, *p); - return v; + if (g_Crc_Algo == 0) + return CrcUpdate_HW(crc, data, size); + return CrcUpdate_Base(crc, data, size); +} +#endif + +#endif // !defined(Z7_CRC_HW_FORCE) + + + +UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size) +{ + return CrcUpdate(CRC_INIT_VAL, data, size) ^ CRC_INIT_VAL; } -void MY_FAST_CALL CrcGenerateTable() + +MY_ALIGN(64) +UInt32 g_CrcTable[256 * Z7_CRC_NUM_TABLES_TOTAL]; + + +void Z7_FASTCALL CrcGenerateTable(void) { UInt32 i; for (i = 0; i < 256; i++) { +#if defined(Z7_CRC_HW_FORCE) + g_CrcTable[i] = __crc32b(i, 0); +#else + #define kCrcPoly 0xEDB88320 UInt32 r = i; unsigned j; for (j = 0; j < 8; j++) r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1))); g_CrcTable[i] = r; +#endif } - for (i = 256; i < 256 * CRC_NUM_TABLES; i++) + for (i = 256; i < 256 * Z7_CRC_NUM_TABLES_USE; i++) { - UInt32 r = g_CrcTable[(size_t)i - 256]; + const UInt32 r = g_CrcTable[(size_t)i - 256]; g_CrcTable[i] = g_CrcTable[r & 0xFF] ^ (r >> 8); } - #if CRC_NUM_TABLES < 4 - - g_CrcUpdate = CrcUpdateT1; - - #else - - #ifdef MY_CPU_LE - - g_CrcUpdateT4 = CrcUpdateT4; - g_CrcUpdate = CrcUpdateT4; +#if !defined(Z7_CRC_HW_FORCE) && \ + (defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) || defined(MY_CPU_BE)) - #if CRC_NUM_TABLES >= 8 - g_CrcUpdateT8 = CrcUpdateT8; - - #ifdef MY_CPU_X86_OR_AMD64 - if (!CPU_Is_InOrder()) - #endif - g_CrcUpdate = CrcUpdateT8; - #endif +#if Z7_CRC_NUM_TABLES_USE <= 1 + g_Crc_Algo = 1; +#else // Z7_CRC_NUM_TABLES_USE <= 1 - #else +#if defined(MY_CPU_LE) + g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; +#else // !defined(MY_CPU_LE) { - #ifndef MY_CPU_BE +#ifndef MY_CPU_BE UInt32 k = 0x01020304; const Byte *p = (const Byte *)&k; if (p[0] == 4 && p[1] == 3) - { - g_CrcUpdateT4 = CrcUpdateT4; - g_CrcUpdate = CrcUpdateT4; - #if CRC_NUM_TABLES >= 8 - g_CrcUpdateT8 = CrcUpdateT8; - g_CrcUpdate = CrcUpdateT8; - #endif - } + g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; else if (p[0] != 1 || p[1] != 2) - g_CrcUpdate = CrcUpdateT1; + g_Crc_Algo = 1; else - #endif +#endif // MY_CPU_BE { - for (i = 256 * CRC_NUM_TABLES - 1; i >= 256; i--) + for (i = 256 * Z7_CRC_NUM_TABLES_TOTAL - 1; i >= 256; i--) { - UInt32 x = g_CrcTable[(size_t)i - 256]; - g_CrcTable[i] = CRC_UINT32_SWAP(x); + const UInt32 x = g_CrcTable[(size_t)i - 256]; + g_CrcTable[i] = Z7_BSWAP32(x); } - g_CrcUpdateT4 = CrcUpdateT1_BeT4; - g_CrcUpdate = CrcUpdateT1_BeT4; - #if CRC_NUM_TABLES >= 8 - g_CrcUpdateT8 = CrcUpdateT1_BeT8; - g_CrcUpdate = CrcUpdateT1_BeT8; - #endif +#if defined(Z7_CRC_UPDATE_T1_FUNC_NAME) + g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; +#endif +#if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) + g_Crc_Be = 1; +#endif } } - #endif +#endif // !defined(MY_CPU_LE) + +#ifdef MY_CPU_LE +#ifdef Z7_CRC_HW_USE + if 
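Illustrative sketch (not part of the upstream diff): the table initialisation in CrcGenerateTable() above expands the reflected CRC-32 polynomial kCrcPoly = 0xEDB88320 into g_CrcTable. The self-contained, table-free routine below computes the same checksum one bit at a time; CrcCalc() in 7zCrc.c is expected to return the same value for the same input, only faster. The function name is ours, chosen for illustration.

#include <stddef.h>
#include <stdint.h>

/* bit-at-a-time CRC-32 over the reflected polynomial 0xEDB88320 */
static uint32_t crc32_bytewise(const void *data, size_t size)
{
  const unsigned char *p = (const unsigned char *)data;
  uint32_t crc = 0xFFFFFFFFu;                /* CRC_INIT_VAL */
  for (; size != 0; size--, p++)
  {
    unsigned j;
    crc ^= *p;
    for (j = 0; j < 8; j++)
      crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));
  }
  return crc ^ 0xFFFFFFFFu;                  /* CRC_GET_DIGEST */
}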
(CPU_IsSupported_CRC32()) + g_Crc_Algo = 0; +#endif // Z7_CRC_HW_USE +#endif // MY_CPU_LE + +#endif // Z7_CRC_NUM_TABLES_USE <= 1 +#endif // g_Crc_Algo was declared +} + +Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo) +{ + if (algo == 0) + return &CrcUpdate; + +#if defined(Z7_CRC_HW_USE) + if (algo == sizeof(CRC_HW_WORD_TYPE) * 8) + { +#ifdef Z7_CRC_HW_FORCE + return &CrcUpdate; +#else + if (g_Crc_Algo == 0) + return &CrcUpdate_HW; +#endif + } +#endif +#ifndef Z7_CRC_HW_FORCE + if (algo == Z7_CRC_NUM_TABLES_USE) + return + #ifdef Z7_CRC_HW_USE + &CrcUpdate_Base; + #else + &CrcUpdate; #endif +#endif + + return NULL; } + +#undef kCrcPoly +#undef Z7_CRC_NUM_TABLES_USE +#undef Z7_CRC_NUM_TABLES_TOTAL +#undef CRC_UPDATE_BYTE_2 +#undef FUNC_NAME_LE_2 +#undef FUNC_NAME_LE_1 +#undef FUNC_NAME_LE +#undef FUNC_NAME_BE_2 +#undef FUNC_NAME_BE_1 +#undef FUNC_NAME_BE + +#undef CRC_HW_UNROLL_BYTES +#undef CRC_HW_WORD_FUNC +#undef CRC_HW_WORD_TYPE diff --git a/src/sdk/C/7zCrc.h b/src/sdk/C/7zCrc.h index 8fd5795..3e6d408 100644 --- a/src/sdk/C/7zCrc.h +++ b/src/sdk/C/7zCrc.h @@ -1,8 +1,8 @@ /* 7zCrc.h -- CRC32 calculation -2013-01-18 : Igor Pavlov : Public domain */ +2024-01-22 : Igor Pavlov : Public domain */ -#ifndef __7Z_CRC_H -#define __7Z_CRC_H +#ifndef ZIP7_INC_7Z_CRC_H +#define ZIP7_INC_7Z_CRC_H #include "7zTypes.h" @@ -11,14 +11,17 @@ EXTERN_C_BEGIN extern UInt32 g_CrcTable[]; /* Call CrcGenerateTable one time before other CRC functions */ -void MY_FAST_CALL CrcGenerateTable(void); +void Z7_FASTCALL CrcGenerateTable(void); #define CRC_INIT_VAL 0xFFFFFFFF #define CRC_GET_DIGEST(crc) ((crc) ^ CRC_INIT_VAL) #define CRC_UPDATE_BYTE(crc, b) (g_CrcTable[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) -UInt32 MY_FAST_CALL CrcUpdate(UInt32 crc, const void *data, size_t size); -UInt32 MY_FAST_CALL CrcCalc(const void *data, size_t size); +UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size); +UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size); + +typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_FUNC)(UInt32 v, const void *data, size_t size); +Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo); EXTERN_C_END diff --git a/src/sdk/C/7zCrcOpt.c b/src/sdk/C/7zCrcOpt.c index 73beba2..9408017 100644 --- a/src/sdk/C/7zCrcOpt.c +++ b/src/sdk/C/7zCrcOpt.c @@ -1,115 +1,199 @@ -/* 7zCrcOpt.c -- CRC32 calculation -2017-04-03 : Igor Pavlov : Public domain */ +/* 7zCrcOpt.c -- CRC32 calculation (optimized functions) +2023-12-07 : Igor Pavlov : Public domain */ #include "Precomp.h" #include "CpuArch.h" +#if !defined(Z7_CRC_NUM_TABLES) || Z7_CRC_NUM_TABLES > 1 + +// for debug only : define Z7_CRC_DEBUG_BE to test big-endian code in little-endian cpu +// #define Z7_CRC_DEBUG_BE +#ifdef Z7_CRC_DEBUG_BE +#undef MY_CPU_LE +#define MY_CPU_BE +#endif + +// the value Z7_CRC_NUM_TABLES_USE must be defined to same value as in 7zCrc.c +#ifdef Z7_CRC_NUM_TABLES +#define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES +#else +#define Z7_CRC_NUM_TABLES_USE 12 +#endif + +#if Z7_CRC_NUM_TABLES_USE % 4 || \ + Z7_CRC_NUM_TABLES_USE < 4 * 1 || \ + Z7_CRC_NUM_TABLES_USE > 4 * 6 + #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES +#endif + + #ifndef MY_CPU_BE -#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) -UInt32 MY_FAST_CALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table) -{ - const Byte *p = (const Byte *)data; - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) - v = 
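Illustrative usage sketch (not part of the upstream diff): the renamed 7zCrc.h interface above keeps the old call pattern -- generate the tables once, then use CrcCalc() for one-shot hashing or CrcUpdate() for incremental hashing -- and adds z7_GetFunc_CrcUpdate(algo) for callers that want to select a specific implementation. A minimal caller, assuming the SDK's C directory is on the include path, might look like this:

#include <stdio.h>
#include <string.h>
#include "7zCrc.h"

int main(void)
{
  const char *msg = "hello pylzma";
  UInt32 crc1, crc2;

  CrcGenerateTable();                  /* must run once before any CRC call */

  crc1 = CrcCalc(msg, strlen(msg));    /* one-shot */

  /* incremental: start from CRC_INIT_VAL, finish with CRC_GET_DIGEST */
  crc2 = CRC_INIT_VAL;
  crc2 = CrcUpdate(crc2, msg, 5);
  crc2 = CrcUpdate(crc2, msg + 5, strlen(msg) - 5);
  crc2 = CRC_GET_DIGEST(crc2);

  printf("%08X %08X\n", (unsigned)crc1, (unsigned)crc2);
  return crc1 == crc2 ? 0 : 1;
}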
CRC_UPDATE_BYTE_2(v, *p); - for (; size >= 4; size -= 4, p += 4) - { - v ^= *(const UInt32 *)p; - v = - (table + 0x300)[((v ) & 0xFF)] - ^ (table + 0x200)[((v >> 8) & 0xFF)] - ^ (table + 0x100)[((v >> 16) & 0xFF)] - ^ (table + 0x000)[((v >> 24))]; - } - for (; size > 0; size--, p++) - v = CRC_UPDATE_BYTE_2(v, *p); - return v; -} +#define Q(n, d) \ + ( (table + ((n) * 4 + 3) * 0x100)[(Byte)(d)] \ + ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 0) * 0x100)[((d) >> 3 * 8)] ) + +#define R(a) *((const UInt32 *)(const void *)p + (a)) + +#define CRC_FUNC_PRE_LE2(step) \ +UInt32 Z7_FASTCALL CrcUpdateT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table) -UInt32 MY_FAST_CALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table) +#define CRC_FUNC_PRE_LE(step) \ + CRC_FUNC_PRE_LE2(step); \ + CRC_FUNC_PRE_LE2(step) + +CRC_FUNC_PRE_LE(Z7_CRC_NUM_TABLES_USE) { const Byte *p = (const Byte *)data; - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++) + const Byte *lim; + for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++) v = CRC_UPDATE_BYTE_2(v, *p); - for (; size >= 8; size -= 8, p += 8) + lim = p + size; + if (size >= Z7_CRC_NUM_TABLES_USE) { - UInt32 d; - v ^= *(const UInt32 *)p; - v = - (table + 0x700)[((v ) & 0xFF)] - ^ (table + 0x600)[((v >> 8) & 0xFF)] - ^ (table + 0x500)[((v >> 16) & 0xFF)] - ^ (table + 0x400)[((v >> 24))]; - d = *((const UInt32 *)p + 1); - v ^= - (table + 0x300)[((d ) & 0xFF)] - ^ (table + 0x200)[((d >> 8) & 0xFF)] - ^ (table + 0x100)[((d >> 16) & 0xFF)] - ^ (table + 0x000)[((d >> 24))]; + lim -= Z7_CRC_NUM_TABLES_USE; + do + { + v ^= R(0); + { +#if Z7_CRC_NUM_TABLES_USE == 1 * 4 + v = Q(0, v); +#else +#define U2(r, op) \ + { d = R(r); x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); } + UInt32 d, x; + U2(1, =) +#if Z7_CRC_NUM_TABLES_USE >= 3 * 4 +#define U(r) U2(r, ^=) + U(2) +#if Z7_CRC_NUM_TABLES_USE >= 4 * 4 + U(3) +#if Z7_CRC_NUM_TABLES_USE >= 5 * 4 + U(4) +#if Z7_CRC_NUM_TABLES_USE >= 6 * 4 + U(5) +#if Z7_CRC_NUM_TABLES_USE >= 7 * 4 +#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES +#endif +#endif +#endif +#endif +#endif +#undef U +#undef U2 + v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v); +#endif + } + p += Z7_CRC_NUM_TABLES_USE; + } + while (p <= lim); + lim += Z7_CRC_NUM_TABLES_USE; } - for (; size > 0; size--, p++) + for (; p < lim; p++) v = CRC_UPDATE_BYTE_2(v, *p); return v; } +#undef CRC_UPDATE_BYTE_2 +#undef R +#undef Q +#undef CRC_FUNC_PRE_LE +#undef CRC_FUNC_PRE_LE2 + #endif + + #ifndef MY_CPU_LE -#define CRC_UINT32_SWAP(v) ((v >> 24) | ((v >> 8) & 0xFF00) | ((v << 8) & 0xFF0000) | (v << 24)) +#define CRC_UPDATE_BYTE_2_BE(crc, b) (table[((crc) >> 24) ^ (b)] ^ ((crc) << 8)) -#define CRC_UPDATE_BYTE_2_BE(crc, b) (table[(((crc) >> 24) ^ (b))] ^ ((crc) << 8)) +#define Q(n, d) \ + ( (table + ((n) * 4 + 0) * 0x100)[((d)) & 0xFF] \ + ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 3) * 0x100)[((d) >> 3 * 8)] ) -UInt32 MY_FAST_CALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table) -{ - const Byte *p = (const Byte *)data; - table += 0x100; - v = CRC_UINT32_SWAP(v); - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) - v = CRC_UPDATE_BYTE_2_BE(v, *p); - for (; size >= 4; size -= 4, p += 4) - { - v ^= *(const UInt32 *)p; - v = - (table + 0x000)[((v ) & 
0xFF)] - ^ (table + 0x100)[((v >> 8) & 0xFF)] - ^ (table + 0x200)[((v >> 16) & 0xFF)] - ^ (table + 0x300)[((v >> 24))]; - } - for (; size > 0; size--, p++) - v = CRC_UPDATE_BYTE_2_BE(v, *p); - return CRC_UINT32_SWAP(v); -} +#ifdef Z7_CRC_DEBUG_BE + #define R(a) GetBe32a((const UInt32 *)(const void *)p + (a)) +#else + #define R(a) *((const UInt32 *)(const void *)p + (a)) +#endif + + +#define CRC_FUNC_PRE_BE2(step) \ +UInt32 Z7_FASTCALL CrcUpdateT1_BeT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table) -UInt32 MY_FAST_CALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table) +#define CRC_FUNC_PRE_BE(step) \ + CRC_FUNC_PRE_BE2(step); \ + CRC_FUNC_PRE_BE2(step) + +CRC_FUNC_PRE_BE(Z7_CRC_NUM_TABLES_USE) { const Byte *p = (const Byte *)data; + const Byte *lim; table += 0x100; - v = CRC_UINT32_SWAP(v); - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++) + v = Z7_BSWAP32(v); + for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++) v = CRC_UPDATE_BYTE_2_BE(v, *p); - for (; size >= 8; size -= 8, p += 8) + lim = p + size; + if (size >= Z7_CRC_NUM_TABLES_USE) { - UInt32 d; - v ^= *(const UInt32 *)p; - v = - (table + 0x400)[((v ) & 0xFF)] - ^ (table + 0x500)[((v >> 8) & 0xFF)] - ^ (table + 0x600)[((v >> 16) & 0xFF)] - ^ (table + 0x700)[((v >> 24))]; - d = *((const UInt32 *)p + 1); - v ^= - (table + 0x000)[((d ) & 0xFF)] - ^ (table + 0x100)[((d >> 8) & 0xFF)] - ^ (table + 0x200)[((d >> 16) & 0xFF)] - ^ (table + 0x300)[((d >> 24))]; + lim -= Z7_CRC_NUM_TABLES_USE; + do + { + v ^= R(0); + { +#if Z7_CRC_NUM_TABLES_USE == 1 * 4 + v = Q(0, v); +#else +#define U2(r, op) \ + { d = R(r); x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); } + UInt32 d, x; + U2(1, =) +#if Z7_CRC_NUM_TABLES_USE >= 3 * 4 +#define U(r) U2(r, ^=) + U(2) +#if Z7_CRC_NUM_TABLES_USE >= 4 * 4 + U(3) +#if Z7_CRC_NUM_TABLES_USE >= 5 * 4 + U(4) +#if Z7_CRC_NUM_TABLES_USE >= 6 * 4 + U(5) +#if Z7_CRC_NUM_TABLES_USE >= 7 * 4 +#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES +#endif +#endif +#endif +#endif +#endif +#undef U +#undef U2 + v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v); +#endif + } + p += Z7_CRC_NUM_TABLES_USE; + } + while (p <= lim); + lim += Z7_CRC_NUM_TABLES_USE; } - for (; size > 0; size--, p++) + for (; p < lim; p++) v = CRC_UPDATE_BYTE_2_BE(v, *p); - return CRC_UINT32_SWAP(v); + return Z7_BSWAP32(v); } +#undef CRC_UPDATE_BYTE_2_BE +#undef R +#undef Q +#undef CRC_FUNC_PRE_BE +#undef CRC_FUNC_PRE_BE2 + +#endif +#undef Z7_CRC_NUM_TABLES_USE #endif diff --git a/src/sdk/C/7zDec.c b/src/sdk/C/7zDec.c index 7c46352..520cbfd 100644 --- a/src/sdk/C/7zDec.c +++ b/src/sdk/C/7zDec.c @@ -1,11 +1,11 @@ /* 7zDec.c -- Decoding from 7z folder -2019-02-02 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include -/* #define _7ZIP_PPMD_SUPPPORT */ +/* #define Z7_PPMD_SUPPORT */ #include "7z.h" #include "7zCrc.h" @@ -16,24 +16,50 @@ #include "Delta.h" #include "LzmaDec.h" #include "Lzma2Dec.h" -#ifdef _7ZIP_PPMD_SUPPPORT +#ifdef Z7_PPMD_SUPPORT #include "Ppmd7.h" #endif #define k_Copy 0 -#define k_Delta 3 +#ifndef Z7_NO_METHOD_LZMA2 #define k_LZMA2 0x21 +#endif #define k_LZMA 0x30101 -#define k_BCJ 0x3030103 #define k_BCJ2 0x303011B + +#if !defined(Z7_NO_METHODS_FILTERS) +#define Z7_USE_BRANCH_FILTER +#endif + +#if !defined(Z7_NO_METHODS_FILTERS) || \ + defined(Z7_USE_NATIVE_BRANCH_FILTER) && defined(MY_CPU_ARM64) +#define Z7_USE_FILTER_ARM64 +#ifndef Z7_USE_BRANCH_FILTER +#define Z7_USE_BRANCH_FILTER +#endif 
+#define k_ARM64 0xa +#endif + +#if !defined(Z7_NO_METHODS_FILTERS) || \ + defined(Z7_USE_NATIVE_BRANCH_FILTER) && defined(MY_CPU_ARMT) +#define Z7_USE_FILTER_ARMT +#ifndef Z7_USE_BRANCH_FILTER +#define Z7_USE_BRANCH_FILTER +#endif +#define k_ARMT 0x3030701 +#endif + +#ifndef Z7_NO_METHODS_FILTERS +#define k_Delta 3 +#define k_RISCV 0xb +#define k_BCJ 0x3030103 #define k_PPC 0x3030205 #define k_IA64 0x3030401 #define k_ARM 0x3030501 -#define k_ARMT 0x3030701 #define k_SPARC 0x3030805 +#endif - -#ifdef _7ZIP_PPMD_SUPPPORT +#ifdef Z7_PPMD_SUPPORT #define k_PPMD 0x30401 @@ -46,17 +72,17 @@ typedef struct UInt64 processed; BoolInt extra; SRes res; - const ILookInStream *inStream; + ILookInStreamPtr inStream; } CByteInToLook; -static Byte ReadByte(const IByteIn *pp) +static Byte ReadByte(IByteInPtr pp) { - CByteInToLook *p = CONTAINER_FROM_VTBL(pp, CByteInToLook, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CByteInToLook) if (p->cur != p->end) return *p->cur++; if (p->res == SZ_OK) { - size_t size = p->cur - p->begin; + size_t size = (size_t)(p->cur - p->begin); p->processed += size; p->res = ILookInStream_Skip(p->inStream, size); size = (1 << 25); @@ -64,13 +90,13 @@ static Byte ReadByte(const IByteIn *pp) p->cur = p->begin; p->end = p->begin + size; if (size != 0) - return *p->cur++;; + return *p->cur++; } p->extra = True; return 0; } -static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, const ILookInStream *inStream, +static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStreamPtr inStream, Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain) { CPpmd7 ppmd; @@ -101,28 +127,32 @@ static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, c Ppmd7_Init(&ppmd, order); } { - CPpmd7z_RangeDec rc; - Ppmd7z_RangeDec_CreateVTable(&rc); - rc.Stream = &s.vt; - if (!Ppmd7z_RangeDec_Init(&rc)) + ppmd.rc.dec.Stream = &s.vt; + if (!Ppmd7z_RangeDec_Init(&ppmd.rc.dec)) res = SZ_ERROR_DATA; - else if (s.extra) - res = (s.res != SZ_OK ? s.res : SZ_ERROR_DATA); - else + else if (!s.extra) { - SizeT i; - for (i = 0; i < outSize; i++) + Byte *buf = outBuffer; + const Byte *lim = buf + outSize; + for (; buf != lim; buf++) { - int sym = Ppmd7_DecodeSymbol(&ppmd, &rc.vt); + int sym = Ppmd7z_DecodeSymbol(&ppmd); if (s.extra || sym < 0) break; - outBuffer[i] = (Byte)sym; + *buf = (Byte)sym; } - if (i != outSize) - res = (s.res != SZ_OK ? s.res : SZ_ERROR_DATA); - else if (s.processed + (s.cur - s.begin) != inSize || !Ppmd7z_RangeDec_IsFinishedOK(&rc)) + if (buf != lim) res = SZ_ERROR_DATA; + else if (!Ppmd7z_RangeDec_IsFinishedOK(&ppmd.rc.dec)) + { + /* if (Ppmd7z_DecodeSymbol(&ppmd) != PPMD7_SYM_END || !Ppmd7z_RangeDec_IsFinishedOK(&ppmd.rc.dec)) */ + res = SZ_ERROR_DATA; + } } + if (s.extra) + res = (s.res != SZ_OK ? 
s.res : SZ_ERROR_DATA); + else if (s.processed + (size_t)(s.cur - s.begin) != inSize) + res = SZ_ERROR_DATA; } Ppmd7_Free(&ppmd, allocMain); return res; @@ -131,14 +161,14 @@ static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, c #endif -static SRes SzDecodeLzma(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStream *inStream, +static SRes SzDecodeLzma(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStreamPtr inStream, Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain) { CLzmaDec state; SRes res = SZ_OK; - LzmaDec_Construct(&state); - RINOK(LzmaDec_AllocateProbs(&state, props, propsSize, allocMain)); + LzmaDec_CONSTRUCT(&state) + RINOK(LzmaDec_AllocateProbs(&state, props, propsSize, allocMain)) state.dic = outBuffer; state.dicBufSize = outSize; LzmaDec_Init(&state); @@ -189,18 +219,18 @@ static SRes SzDecodeLzma(const Byte *props, unsigned propsSize, UInt64 inSize, I } -#ifndef _7Z_NO_METHOD_LZMA2 +#ifndef Z7_NO_METHOD_LZMA2 -static SRes SzDecodeLzma2(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStream *inStream, +static SRes SzDecodeLzma2(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStreamPtr inStream, Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain) { CLzma2Dec state; SRes res = SZ_OK; - Lzma2Dec_Construct(&state); + Lzma2Dec_CONSTRUCT(&state) if (propsSize != 1) return SZ_ERROR_DATA; - RINOK(Lzma2Dec_AllocateProbs(&state, props[0], allocMain)); + RINOK(Lzma2Dec_AllocateProbs(&state, props[0], allocMain)) state.decoder.dic = outBuffer; state.decoder.dicBufSize = outSize; Lzma2Dec_Init(&state); @@ -250,7 +280,7 @@ static SRes SzDecodeLzma2(const Byte *props, unsigned propsSize, UInt64 inSize, #endif -static SRes SzDecodeCopy(UInt64 inSize, ILookInStream *inStream, Byte *outBuffer) +static SRes SzDecodeCopy(UInt64 inSize, ILookInStreamPtr inStream, Byte *outBuffer) { while (inSize > 0) { @@ -258,13 +288,13 @@ static SRes SzDecodeCopy(UInt64 inSize, ILookInStream *inStream, Byte *outBuffer size_t curSize = (1 << 18); if (curSize > inSize) curSize = (size_t)inSize; - RINOK(ILookInStream_Look(inStream, &inBuf, &curSize)); + RINOK(ILookInStream_Look(inStream, &inBuf, &curSize)) if (curSize == 0) return SZ_ERROR_INPUT_EOF; memcpy(outBuffer, inBuf, curSize); outBuffer += curSize; inSize -= curSize; - RINOK(ILookInStream_Skip(inStream, curSize)); + RINOK(ILookInStream_Skip(inStream, curSize)) } return SZ_OK; } @@ -275,15 +305,16 @@ static BoolInt IS_MAIN_METHOD(UInt32 m) { case k_Copy: case k_LZMA: - #ifndef _7Z_NO_METHOD_LZMA2 + #ifndef Z7_NO_METHOD_LZMA2 case k_LZMA2: - #endif - #ifdef _7ZIP_PPMD_SUPPPORT + #endif + #ifdef Z7_PPMD_SUPPORT case k_PPMD: - #endif + #endif return True; + default: + return False; } - return False; } static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c) @@ -310,7 +341,7 @@ static SRes CheckSupportedFolder(const CSzFolder *f) } - #ifndef _7Z_NO_METHODS_FILTERS + #if defined(Z7_USE_BRANCH_FILTER) if (f->NumCoders == 2) { @@ -326,13 +357,21 @@ static SRes CheckSupportedFolder(const CSzFolder *f) return SZ_ERROR_UNSUPPORTED; switch ((UInt32)c->MethodID) { + #if !defined(Z7_NO_METHODS_FILTERS) case k_Delta: case k_BCJ: case k_PPC: case k_IA64: case k_SPARC: case k_ARM: + case k_RISCV: + #endif + #ifdef Z7_USE_FILTER_ARM64 + case k_ARM64: + #endif + #ifdef Z7_USE_FILTER_ARMT case k_ARMT: + #endif break; default: return SZ_ERROR_UNSUPPORTED; @@ -365,13 +404,16 @@ static SRes CheckSupportedFolder(const CSzFolder *f) return SZ_ERROR_UNSUPPORTED; } -#define CASE_BRA_CONV(isa) case 
k_ ## isa: isa ## _Convert(outBuffer, outSize, 0, 0); break; + + + + static SRes SzFolder_Decode2(const CSzFolder *folder, const Byte *propsData, const UInt64 *unpackSizes, const UInt64 *packPositions, - ILookInStream *inStream, UInt64 startPos, + ILookInStreamPtr inStream, UInt64 startPos, Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain, Byte *tempBuf[]) { @@ -380,7 +422,7 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, SizeT tempSize3 = 0; Byte *tempBuf3 = 0; - RINOK(CheckSupportedFolder(folder)); + RINOK(CheckSupportedFolder(folder)) for (ci = 0; ci < folder->NumCoders; ci++) { @@ -395,8 +437,8 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, SizeT outSizeCur = outSize; if (folder->NumCoders == 4) { - UInt32 indices[] = { 3, 2, 0 }; - UInt64 unpackSize = unpackSizes[ci]; + const UInt32 indices[] = { 3, 2, 0 }; + const UInt64 unpackSize = unpackSizes[ci]; si = indices[ci]; if (ci < 2) { @@ -422,37 +464,37 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, } offset = packPositions[si]; inSize = packPositions[(size_t)si + 1] - offset; - RINOK(LookInStream_SeekTo(inStream, startPos + offset)); + RINOK(LookInStream_SeekTo(inStream, startPos + offset)) if (coder->MethodID == k_Copy) { if (inSize != outSizeCur) /* check it */ return SZ_ERROR_DATA; - RINOK(SzDecodeCopy(inSize, inStream, outBufCur)); + RINOK(SzDecodeCopy(inSize, inStream, outBufCur)) } else if (coder->MethodID == k_LZMA) { - RINOK(SzDecodeLzma(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)); + RINOK(SzDecodeLzma(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)) } - #ifndef _7Z_NO_METHOD_LZMA2 + #ifndef Z7_NO_METHOD_LZMA2 else if (coder->MethodID == k_LZMA2) { - RINOK(SzDecodeLzma2(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)); + RINOK(SzDecodeLzma2(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)) } - #endif - #ifdef _7ZIP_PPMD_SUPPPORT + #endif + #ifdef Z7_PPMD_SUPPORT else if (coder->MethodID == k_PPMD) { - RINOK(SzDecodePpmd(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)); + RINOK(SzDecodePpmd(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)) } - #endif + #endif else return SZ_ERROR_UNSUPPORTED; } else if (coder->MethodID == k_BCJ2) { - UInt64 offset = packPositions[1]; - UInt64 s3Size = packPositions[2] - offset; + const UInt64 offset = packPositions[1]; + const UInt64 s3Size = packPositions[2] - offset; if (ci != 3) return SZ_ERROR_UNSUPPORTED; @@ -464,8 +506,8 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, if (!tempBuf[2] && tempSizes[2] != 0) return SZ_ERROR_MEM; - RINOK(LookInStream_SeekTo(inStream, startPos + offset)); - RINOK(SzDecodeCopy(s3Size, inStream, tempBuf[2])); + RINOK(LookInStream_SeekTo(inStream, startPos + offset)) + RINOK(SzDecodeCopy(s3Size, inStream, tempBuf[2])) if ((tempSizes[0] & 3) != 0 || (tempSizes[1] & 3) != 0 || @@ -484,26 +526,22 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, p.destLim = outBuffer + outSize; Bcj2Dec_Init(&p); - RINOK(Bcj2Dec_Decode(&p)); + RINOK(Bcj2Dec_Decode(&p)) { unsigned i; for (i = 0; i < 4; i++) if (p.bufs[i] != p.lims[i]) return SZ_ERROR_DATA; - - if (!Bcj2Dec_IsFinished(&p)) - return SZ_ERROR_DATA; - - if (p.dest != p.destLim - || p.state != BCJ2_STREAM_MAIN) + if (p.dest != p.destLim || 
!Bcj2Dec_IsMaybeFinished(&p)) return SZ_ERROR_DATA; } } } - #ifndef _7Z_NO_METHODS_FILTERS +#if defined(Z7_USE_BRANCH_FILTER) else if (ci == 1) { +#if !defined(Z7_NO_METHODS_FILTERS) if (coder->MethodID == k_Delta) { if (coder->PropsSize != 1) @@ -513,31 +551,75 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, Delta_Init(state); Delta_Decode(state, (unsigned)(propsData[coder->PropsOffset]) + 1, outBuffer, outSize); } + continue; } - else +#endif + +#ifdef Z7_USE_FILTER_ARM64 + if (coder->MethodID == k_ARM64) + { + UInt32 pc = 0; + if (coder->PropsSize == 4) + { + pc = GetUi32(propsData + coder->PropsOffset); + if (pc & 3) + return SZ_ERROR_UNSUPPORTED; + } + else if (coder->PropsSize != 0) + return SZ_ERROR_UNSUPPORTED; + z7_BranchConv_ARM64_Dec(outBuffer, outSize, pc); + continue; + } +#endif + +#if !defined(Z7_NO_METHODS_FILTERS) + if (coder->MethodID == k_RISCV) + { + UInt32 pc = 0; + if (coder->PropsSize == 4) + { + pc = GetUi32(propsData + coder->PropsOffset); + if (pc & 1) + return SZ_ERROR_UNSUPPORTED; + } + else if (coder->PropsSize != 0) + return SZ_ERROR_UNSUPPORTED; + z7_BranchConv_RISCV_Dec(outBuffer, outSize, pc); + continue; + } +#endif + +#if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT) { if (coder->PropsSize != 0) return SZ_ERROR_UNSUPPORTED; + #define CASE_BRA_CONV(isa) case k_ ## isa: Z7_BRANCH_CONV_DEC(isa)(outBuffer, outSize, 0); break; // pc = 0; switch (coder->MethodID) { + #if !defined(Z7_NO_METHODS_FILTERS) case k_BCJ: { - UInt32 state; - x86_Convert_Init(state); - x86_Convert(outBuffer, outSize, 0, &state, 0); + UInt32 state = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL; + z7_BranchConvSt_X86_Dec(outBuffer, outSize, 0, &state); // pc = 0 break; } - CASE_BRA_CONV(PPC) + case k_PPC: Z7_BRANCH_CONV_DEC_2(BranchConv_PPC)(outBuffer, outSize, 0); break; // pc = 0; + // CASE_BRA_CONV(PPC) CASE_BRA_CONV(IA64) CASE_BRA_CONV(SPARC) CASE_BRA_CONV(ARM) + #endif + #if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT) CASE_BRA_CONV(ARMT) + #endif default: return SZ_ERROR_UNSUPPORTED; } + continue; } - } - #endif +#endif + } // (c == 1) +#endif // Z7_USE_BRANCH_FILTER else return SZ_ERROR_UNSUPPORTED; } @@ -547,7 +629,7 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, SRes SzAr_DecodeFolder(const CSzAr *p, UInt32 folderIndex, - ILookInStream *inStream, UInt64 startPos, + ILookInStreamPtr inStream, UInt64 startPos, Byte *outBuffer, size_t outSize, ISzAllocPtr allocMain) { diff --git a/src/sdk/C/7zFile.c b/src/sdk/C/7zFile.c index 8992fb1..ba5daa1 100644 --- a/src/sdk/C/7zFile.c +++ b/src/sdk/C/7zFile.c @@ -1,5 +1,5 @@ /* 7zFile.c -- File IO -2017-04-03 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -7,9 +7,19 @@ #ifndef USE_WINDOWS_FILE -#ifndef UNDER_CE -#include -#endif + #include + + #ifndef USE_FOPEN + #include + #include + #ifdef _WIN32 + #include + typedef int ssize_t; + typedef int off_t; + #else + #include + #endif + #endif #else @@ -23,30 +33,36 @@ And message can be "Network connection was lost" */ -#define kChunkSizeMax (1 << 22) - #endif +#define kChunkSizeMax (1 << 22) + void File_Construct(CSzFile *p) { #ifdef USE_WINDOWS_FILE p->handle = INVALID_HANDLE_VALUE; - #else + #elif defined(USE_FOPEN) p->file = NULL; + #else + p->fd = -1; #endif } #if !defined(UNDER_CE) || !defined(USE_WINDOWS_FILE) + static WRes File_Open(CSzFile *p, const char *name, int writeMode) { #ifdef USE_WINDOWS_FILE + p->handle = CreateFileA(name, writeMode ? 
GENERIC_WRITE : GENERIC_READ, FILE_SHARE_READ, NULL, writeMode ? CREATE_ALWAYS : OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); return (p->handle != INVALID_HANDLE_VALUE) ? 0 : GetLastError(); - #else + + #elif defined(USE_FOPEN) + p->file = fopen(name, writeMode ? "wb+" : "rb"); return (p->file != 0) ? 0 : #ifdef UNDER_CE @@ -54,13 +70,34 @@ static WRes File_Open(CSzFile *p, const char *name, int writeMode) #else errno; #endif + + #else + + int flags = (writeMode ? (O_CREAT | O_EXCL | O_WRONLY) : O_RDONLY); + #ifdef O_BINARY + flags |= O_BINARY; + #endif + p->fd = open(name, flags, 0666); + return (p->fd != -1) ? 0 : errno; + #endif } WRes InFile_Open(CSzFile *p, const char *name) { return File_Open(p, name, 0); } -WRes OutFile_Open(CSzFile *p, const char *name) { return File_Open(p, name, 1); } + +WRes OutFile_Open(CSzFile *p, const char *name) +{ + #if defined(USE_WINDOWS_FILE) || defined(USE_FOPEN) + return File_Open(p, name, 1); + #else + p->fd = creat(name, 0666); + return (p->fd != -1) ? 0 : errno; + #endif +} + #endif + #ifdef USE_WINDOWS_FILE static WRes File_OpenW(CSzFile *p, const WCHAR *name, int writeMode) { @@ -78,74 +115,124 @@ WRes OutFile_OpenW(CSzFile *p, const WCHAR *name) { return File_OpenW(p, name, 1 WRes File_Close(CSzFile *p) { #ifdef USE_WINDOWS_FILE + if (p->handle != INVALID_HANDLE_VALUE) { if (!CloseHandle(p->handle)) return GetLastError(); p->handle = INVALID_HANDLE_VALUE; } - #else + + #elif defined(USE_FOPEN) + if (p->file != NULL) { int res = fclose(p->file); if (res != 0) + { + if (res == EOF) + return errno; return res; + } p->file = NULL; } + + #else + + if (p->fd != -1) + { + if (close(p->fd) != 0) + return errno; + p->fd = -1; + } + #endif + return 0; } + WRes File_Read(CSzFile *p, void *data, size_t *size) { size_t originalSize = *size; + *size = 0; if (originalSize == 0) return 0; #ifdef USE_WINDOWS_FILE - *size = 0; do { - DWORD curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : (DWORD)originalSize; + const DWORD curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : (DWORD)originalSize; DWORD processed = 0; - BOOL res = ReadFile(p->handle, data, curSize, &processed, NULL); + const BOOL res = ReadFile(p->handle, data, curSize, &processed, NULL); data = (void *)((Byte *)data + processed); originalSize -= processed; *size += processed; if (!res) return GetLastError(); + // debug : we can break here for partial reading mode + if (processed == 0) + break; + } + while (originalSize > 0); + + #elif defined(USE_FOPEN) + + do + { + const size_t curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : originalSize; + const size_t processed = fread(data, 1, curSize, p->file); + data = (void *)((Byte *)data + (size_t)processed); + originalSize -= processed; + *size += processed; + if (processed != curSize) + return ferror(p->file); + // debug : we can break here for partial reading mode if (processed == 0) break; } while (originalSize > 0); - return 0; #else - - *size = fread(data, 1, originalSize, p->file); - if (*size == originalSize) - return 0; - return ferror(p->file); - + + do + { + const size_t curSize = (originalSize > kChunkSizeMax) ? 
kChunkSizeMax : originalSize; + const ssize_t processed = read(p->fd, data, curSize); + if (processed == -1) + return errno; + if (processed == 0) + break; + data = (void *)((Byte *)data + (size_t)processed); + originalSize -= (size_t)processed; + *size += (size_t)processed; + // debug : we can break here for partial reading mode + // break; + } + while (originalSize > 0); + #endif + + return 0; } + WRes File_Write(CSzFile *p, const void *data, size_t *size) { size_t originalSize = *size; + *size = 0; if (originalSize == 0) return 0; #ifdef USE_WINDOWS_FILE - *size = 0; do { - DWORD curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : (DWORD)originalSize; + const DWORD curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : (DWORD)originalSize; DWORD processed = 0; - BOOL res = WriteFile(p->handle, data, curSize, &processed, NULL); - data = (void *)((Byte *)data + processed); + const BOOL res = WriteFile(p->handle, data, curSize, &processed, NULL); + data = (const void *)((const Byte *)data + processed); originalSize -= processed; *size += processed; if (!res) @@ -154,61 +241,106 @@ WRes File_Write(CSzFile *p, const void *data, size_t *size) break; } while (originalSize > 0); - return 0; + + #elif defined(USE_FOPEN) + + do + { + const size_t curSize = (originalSize > kChunkSizeMax) ? kChunkSizeMax : originalSize; + const size_t processed = fwrite(data, 1, curSize, p->file); + data = (void *)((Byte *)data + (size_t)processed); + originalSize -= processed; + *size += processed; + if (processed != curSize) + return ferror(p->file); + if (processed == 0) + break; + } + while (originalSize > 0); #else - *size = fwrite(data, 1, originalSize, p->file); - if (*size == originalSize) - return 0; - return ferror(p->file); - + do + { + const size_t curSize = (originalSize > kChunkSizeMax) ? 
kChunkSizeMax : originalSize; + const ssize_t processed = write(p->fd, data, curSize); + if (processed == -1) + return errno; + if (processed == 0) + break; + data = (const void *)((const Byte *)data + (size_t)processed); + originalSize -= (size_t)processed; + *size += (size_t)processed; + } + while (originalSize > 0); + #endif + + return 0; } + WRes File_Seek(CSzFile *p, Int64 *pos, ESzSeek origin) { #ifdef USE_WINDOWS_FILE - LARGE_INTEGER value; DWORD moveMethod; - value.LowPart = (DWORD)*pos; - value.HighPart = (LONG)((UInt64)*pos >> 16 >> 16); /* for case when UInt64 is 32-bit only */ - switch (origin) + UInt32 low = (UInt32)*pos; + LONG high = (LONG)((UInt64)*pos >> 16 >> 16); /* for case when UInt64 is 32-bit only */ + // (int) to eliminate clang warning + switch ((int)origin) { case SZ_SEEK_SET: moveMethod = FILE_BEGIN; break; case SZ_SEEK_CUR: moveMethod = FILE_CURRENT; break; case SZ_SEEK_END: moveMethod = FILE_END; break; default: return ERROR_INVALID_PARAMETER; } - value.LowPart = SetFilePointer(p->handle, value.LowPart, &value.HighPart, moveMethod); - if (value.LowPart == 0xFFFFFFFF) + low = SetFilePointer(p->handle, (LONG)low, &high, moveMethod); + if (low == (UInt32)0xFFFFFFFF) { WRes res = GetLastError(); if (res != NO_ERROR) return res; } - *pos = ((Int64)value.HighPart << 32) | value.LowPart; + *pos = ((Int64)high << 32) | low; return 0; #else - int moveMethod; - int res; - switch (origin) + int moveMethod; // = origin; + + switch ((int)origin) { case SZ_SEEK_SET: moveMethod = SEEK_SET; break; case SZ_SEEK_CUR: moveMethod = SEEK_CUR; break; case SZ_SEEK_END: moveMethod = SEEK_END; break; - default: return 1; + default: return EINVAL; } - res = fseek(p->file, (long)*pos, moveMethod); - *pos = ftell(p->file); - return res; - #endif + #if defined(USE_FOPEN) + { + int res = fseek(p->file, (long)*pos, moveMethod); + if (res == -1) + return errno; + *pos = ftell(p->file); + if (*pos == -1) + return errno; + return 0; + } + #else + { + off_t res = lseek(p->fd, (off_t)*pos, moveMethod); + if (res == -1) + return errno; + *pos = res; + return 0; + } + + #endif // USE_FOPEN + #endif // USE_WINDOWS_FILE } + WRes File_GetLength(CSzFile *p, UInt64 *length) { #ifdef USE_WINDOWS_FILE @@ -224,13 +356,31 @@ WRes File_GetLength(CSzFile *p, UInt64 *length) *length = (((UInt64)sizeHigh) << 32) + sizeLow; return 0; - #else + #elif defined(USE_FOPEN) long pos = ftell(p->file); int res = fseek(p->file, 0, SEEK_END); *length = ftell(p->file); fseek(p->file, pos, SEEK_SET); return res; + + #else + + off_t pos; + *length = 0; + pos = lseek(p->fd, 0, SEEK_CUR); + if (pos != -1) + { + const off_t len2 = lseek(p->fd, 0, SEEK_END); + const off_t res2 = lseek(p->fd, pos, SEEK_SET); + if (len2 != -1) + { + *length = (UInt64)len2; + if (res2 != -1) + return 0; + } + } + return errno; #endif } @@ -238,10 +388,12 @@ WRes File_GetLength(CSzFile *p, UInt64 *length) /* ---------- FileSeqInStream ---------- */ -static SRes FileSeqInStream_Read(const ISeqInStream *pp, void *buf, size_t *size) +static SRes FileSeqInStream_Read(ISeqInStreamPtr pp, void *buf, size_t *size) { - CFileSeqInStream *p = CONTAINER_FROM_VTBL(pp, CFileSeqInStream, vt); - return File_Read(&p->file, buf, size) == 0 ? SZ_OK : SZ_ERROR_READ; + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileSeqInStream) + const WRes wres = File_Read(&p->file, buf, size); + p->wres = wres; + return (wres == 0) ? 
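Illustrative usage sketch (not part of the upstream diff): the reworked 7zFile.c above keeps the same CSzFile call sequence whether the build uses Win32 handles, fopen(), or the new raw-file-descriptor path; only the WRes values returned on failure differ (GetLastError() codes versus errno). A minimal reader, with a hypothetical helper name and assuming the SDK headers are on the include path:

#include <stdio.h>
#include "7zFile.h"

static int dump_length(const char *path)
{
  CSzFile f;
  UInt64 length;
  Byte buf[256];
  size_t processed = sizeof(buf);
  WRes wres;

  File_Construct(&f);
  wres = InFile_Open(&f, path);
  if (wres != 0)
    return (int)wres;                  /* errno / GetLastError() value */

  if (File_GetLength(&f, &length) == 0)
    printf("%s: %llu bytes\n", path, (unsigned long long)length);

  /* File_Read() sets *size to the number of bytes actually read */
  if (File_Read(&f, buf, &processed) == 0)
    printf("first %u bytes read OK\n", (unsigned)processed);

  return (int)File_Close(&f);
}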
SZ_OK : SZ_ERROR_READ; } void FileSeqInStream_CreateVTable(CFileSeqInStream *p) @@ -252,16 +404,20 @@ void FileSeqInStream_CreateVTable(CFileSeqInStream *p) /* ---------- FileInStream ---------- */ -static SRes FileInStream_Read(const ISeekInStream *pp, void *buf, size_t *size) +static SRes FileInStream_Read(ISeekInStreamPtr pp, void *buf, size_t *size) { - CFileInStream *p = CONTAINER_FROM_VTBL(pp, CFileInStream, vt); - return (File_Read(&p->file, buf, size) == 0) ? SZ_OK : SZ_ERROR_READ; + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileInStream) + const WRes wres = File_Read(&p->file, buf, size); + p->wres = wres; + return (wres == 0) ? SZ_OK : SZ_ERROR_READ; } -static SRes FileInStream_Seek(const ISeekInStream *pp, Int64 *pos, ESzSeek origin) +static SRes FileInStream_Seek(ISeekInStreamPtr pp, Int64 *pos, ESzSeek origin) { - CFileInStream *p = CONTAINER_FROM_VTBL(pp, CFileInStream, vt); - return File_Seek(&p->file, pos, origin); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileInStream) + const WRes wres = File_Seek(&p->file, pos, origin); + p->wres = wres; + return (wres == 0) ? SZ_OK : SZ_ERROR_READ; } void FileInStream_CreateVTable(CFileInStream *p) @@ -273,10 +429,11 @@ void FileInStream_CreateVTable(CFileInStream *p) /* ---------- FileOutStream ---------- */ -static size_t FileOutStream_Write(const ISeqOutStream *pp, const void *data, size_t size) +static size_t FileOutStream_Write(ISeqOutStreamPtr pp, const void *data, size_t size) { - CFileOutStream *p = CONTAINER_FROM_VTBL(pp, CFileOutStream, vt); - File_Write(&p->file, data, &size); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileOutStream) + const WRes wres = File_Write(&p->file, data, &size); + p->wres = wres; return size; } diff --git a/src/sdk/C/7zFile.h b/src/sdk/C/7zFile.h index 0e79253..f5069cd 100644 --- a/src/sdk/C/7zFile.h +++ b/src/sdk/C/7zFile.h @@ -1,17 +1,21 @@ /* 7zFile.h -- File IO -2017-04-03 : Igor Pavlov : Public domain */ +2023-03-05 : Igor Pavlov : Public domain */ -#ifndef __7Z_FILE_H -#define __7Z_FILE_H +#ifndef ZIP7_INC_FILE_H +#define ZIP7_INC_FILE_H #ifdef _WIN32 #define USE_WINDOWS_FILE +// #include #endif #ifdef USE_WINDOWS_FILE -#include +#include "7zWindows.h" + #else -#include +// note: USE_FOPEN mode is limited to 32-bit file size +// #define USE_FOPEN +// #include #endif #include "7zTypes.h" @@ -24,8 +28,10 @@ typedef struct { #ifdef USE_WINDOWS_FILE HANDLE handle; - #else + #elif defined(USE_FOPEN) FILE *file; + #else + int fd; #endif } CSzFile; @@ -56,6 +62,7 @@ typedef struct { ISeqInStream vt; CSzFile file; + WRes wres; } CFileSeqInStream; void FileSeqInStream_CreateVTable(CFileSeqInStream *p); @@ -65,6 +72,7 @@ typedef struct { ISeekInStream vt; CSzFile file; + WRes wres; } CFileInStream; void FileInStream_CreateVTable(CFileInStream *p); @@ -74,6 +82,7 @@ typedef struct { ISeqOutStream vt; CSzFile file; + WRes wres; } CFileOutStream; void FileOutStream_CreateVTable(CFileOutStream *p); diff --git a/src/sdk/C/7zStream.c b/src/sdk/C/7zStream.c index 6b5aa16..74e75b6 100644 --- a/src/sdk/C/7zStream.c +++ b/src/sdk/C/7zStream.c @@ -1,5 +1,5 @@ /* 7zStream.c -- 7z Stream functions -2017-04-03 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -7,12 +7,33 @@ #include "7zTypes.h" -SRes SeqInStream_Read2(const ISeqInStream *stream, void *buf, size_t size, SRes errorType) + +SRes SeqInStream_ReadMax(ISeqInStreamPtr stream, void *buf, size_t *processedSize) +{ + size_t size = *processedSize; + *processedSize = 0; + while (size != 0) + { + size_t 
cur = size; + const SRes res = ISeqInStream_Read(stream, buf, &cur); + *processedSize += cur; + buf = (void *)((Byte *)buf + cur); + size -= cur; + if (res != SZ_OK) + return res; + if (cur == 0) + return SZ_OK; + } + return SZ_OK; +} + +/* +SRes SeqInStream_Read2(ISeqInStreamPtr stream, void *buf, size_t size, SRes errorType) { while (size != 0) { size_t processed = size; - RINOK(ISeqInStream_Read(stream, buf, &processed)); + RINOK(ISeqInStream_Read(stream, buf, &processed)) if (processed == 0) return errorType; buf = (void *)((Byte *)buf + processed); @@ -21,42 +42,44 @@ SRes SeqInStream_Read2(const ISeqInStream *stream, void *buf, size_t size, SRes return SZ_OK; } -SRes SeqInStream_Read(const ISeqInStream *stream, void *buf, size_t size) +SRes SeqInStream_Read(ISeqInStreamPtr stream, void *buf, size_t size) { return SeqInStream_Read2(stream, buf, size, SZ_ERROR_INPUT_EOF); } +*/ + -SRes SeqInStream_ReadByte(const ISeqInStream *stream, Byte *buf) +SRes SeqInStream_ReadByte(ISeqInStreamPtr stream, Byte *buf) { size_t processed = 1; - RINOK(ISeqInStream_Read(stream, buf, &processed)); + RINOK(ISeqInStream_Read(stream, buf, &processed)) return (processed == 1) ? SZ_OK : SZ_ERROR_INPUT_EOF; } -SRes LookInStream_SeekTo(const ILookInStream *stream, UInt64 offset) +SRes LookInStream_SeekTo(ILookInStreamPtr stream, UInt64 offset) { - Int64 t = offset; + Int64 t = (Int64)offset; return ILookInStream_Seek(stream, &t, SZ_SEEK_SET); } -SRes LookInStream_LookRead(const ILookInStream *stream, void *buf, size_t *size) +SRes LookInStream_LookRead(ILookInStreamPtr stream, void *buf, size_t *size) { const void *lookBuf; if (*size == 0) return SZ_OK; - RINOK(ILookInStream_Look(stream, &lookBuf, size)); + RINOK(ILookInStream_Look(stream, &lookBuf, size)) memcpy(buf, lookBuf, *size); return ILookInStream_Skip(stream, *size); } -SRes LookInStream_Read2(const ILookInStream *stream, void *buf, size_t size, SRes errorType) +SRes LookInStream_Read2(ILookInStreamPtr stream, void *buf, size_t size, SRes errorType) { while (size != 0) { size_t processed = size; - RINOK(ILookInStream_Read(stream, buf, &processed)); + RINOK(ILookInStream_Read(stream, buf, &processed)) if (processed == 0) return errorType; buf = (void *)((Byte *)buf + processed); @@ -65,16 +88,16 @@ SRes LookInStream_Read2(const ILookInStream *stream, void *buf, size_t size, SRe return SZ_OK; } -SRes LookInStream_Read(const ILookInStream *stream, void *buf, size_t size) +SRes LookInStream_Read(ILookInStreamPtr stream, void *buf, size_t size) { return LookInStream_Read2(stream, buf, size, SZ_ERROR_INPUT_EOF); } -#define GET_LookToRead2 CLookToRead2 *p = CONTAINER_FROM_VTBL(pp, CLookToRead2, vt); +#define GET_LookToRead2 Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CLookToRead2) -static SRes LookToRead2_Look_Lookahead(const ILookInStream *pp, const void **buf, size_t *size) +static SRes LookToRead2_Look_Lookahead(ILookInStreamPtr pp, const void **buf, size_t *size) { SRes res = SZ_OK; GET_LookToRead2 @@ -93,7 +116,7 @@ static SRes LookToRead2_Look_Lookahead(const ILookInStream *pp, const void **buf return res; } -static SRes LookToRead2_Look_Exact(const ILookInStream *pp, const void **buf, size_t *size) +static SRes LookToRead2_Look_Exact(ILookInStreamPtr pp, const void **buf, size_t *size) { SRes res = SZ_OK; GET_LookToRead2 @@ -113,14 +136,14 @@ static SRes LookToRead2_Look_Exact(const ILookInStream *pp, const void **buf, si return res; } -static SRes LookToRead2_Skip(const ILookInStream *pp, size_t offset) +static SRes LookToRead2_Skip(ILookInStreamPtr 
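Illustrative sketch (not part of the upstream diff): SeqInStream_ReadMax() above replaces the removed SeqInStream_Read()/SeqInStream_Read2() helpers. On input *processedSize is the buffer capacity; on return it holds however many bytes the stream produced, and a short count with SZ_OK simply means end of stream rather than an error. A hypothetical caller built on the CFileSeqInStream wrapper from 7zFile.h:

#include <stdio.h>
#include "7zFile.h"
#include "7zTypes.h"

static SRes read_up_to_4k(const char *path)
{
  CFileSeqInStream inStream;
  Byte buf[4096];
  size_t processed = sizeof(buf);   /* in: capacity, out: bytes actually read */
  SRes res;

  FileSeqInStream_CreateVTable(&inStream);
  File_Construct(&inStream.file);
  if (InFile_Open(&inStream.file, path) != 0)
    return SZ_ERROR_READ;

  res = SeqInStream_ReadMax(&inStream.vt, buf, &processed);
  printf("res=%d, got %u bytes\n", res, (unsigned)processed);

  File_Close(&inStream.file);
  return res;
}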
pp, size_t offset) { GET_LookToRead2 p->pos += offset; return SZ_OK; } -static SRes LookToRead2_Read(const ILookInStream *pp, void *buf, size_t *size) +static SRes LookToRead2_Read(ILookInStreamPtr pp, void *buf, size_t *size) { GET_LookToRead2 size_t rem = p->size - p->pos; @@ -134,7 +157,7 @@ static SRes LookToRead2_Read(const ILookInStream *pp, void *buf, size_t *size) return SZ_OK; } -static SRes LookToRead2_Seek(const ILookInStream *pp, Int64 *pos, ESzSeek origin) +static SRes LookToRead2_Seek(ILookInStreamPtr pp, Int64 *pos, ESzSeek origin) { GET_LookToRead2 p->pos = p->size = 0; @@ -153,9 +176,9 @@ void LookToRead2_CreateVTable(CLookToRead2 *p, int lookahead) -static SRes SecToLook_Read(const ISeqInStream *pp, void *buf, size_t *size) +static SRes SecToLook_Read(ISeqInStreamPtr pp, void *buf, size_t *size) { - CSecToLook *p = CONTAINER_FROM_VTBL(pp, CSecToLook, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CSecToLook) return LookInStream_LookRead(p->realStream, buf, size); } @@ -164,9 +187,9 @@ void SecToLook_CreateVTable(CSecToLook *p) p->vt.Read = SecToLook_Read; } -static SRes SecToRead_Read(const ISeqInStream *pp, void *buf, size_t *size) +static SRes SecToRead_Read(ISeqInStreamPtr pp, void *buf, size_t *size) { - CSecToRead *p = CONTAINER_FROM_VTBL(pp, CSecToRead, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CSecToRead) return ILookInStream_Read(p->realStream, buf, size); } diff --git a/src/sdk/C/7zTypes.h b/src/sdk/C/7zTypes.h index 65b3af6..5b77420 100644 --- a/src/sdk/C/7zTypes.h +++ b/src/sdk/C/7zTypes.h @@ -1,11 +1,13 @@ /* 7zTypes.h -- Basic types -2018-08-04 : Igor Pavlov : Public domain */ +2024-01-24 : Igor Pavlov : Public domain */ -#ifndef __7Z_TYPES_H -#define __7Z_TYPES_H +#ifndef ZIP7_7Z_TYPES_H +#define ZIP7_7Z_TYPES_H #ifdef _WIN32 /* #include */ +#else +#include #endif #include @@ -43,31 +45,134 @@ EXTERN_C_BEGIN typedef int SRes; +#ifdef _MSC_VER + #if _MSC_VER > 1200 + #define MY_ALIGN(n) __declspec(align(n)) + #else + #define MY_ALIGN(n) + #endif +#else + /* + // C11/C++11: + #include + #define MY_ALIGN(n) alignas(n) + */ + #define MY_ALIGN(n) __attribute__ ((aligned(n))) +#endif + + #ifdef _WIN32 /* typedef DWORD WRes; */ typedef unsigned WRes; #define MY_SRes_HRESULT_FROM_WRes(x) HRESULT_FROM_WIN32(x) -#else +// #define MY_HRES_ERROR_INTERNAL_ERROR MY_SRes_HRESULT_FROM_WRes(ERROR_INTERNAL_ERROR) +#else // _WIN32 + +// #define ENV_HAVE_LSTAT typedef int WRes; -#define MY__FACILITY_WIN32 7 -#define MY__FACILITY__WRes MY__FACILITY_WIN32 -#define MY_SRes_HRESULT_FROM_WRes(x) ((HRESULT)(x) <= 0 ? ((HRESULT)(x)) : ((HRESULT) (((x) & 0x0000FFFF) | (MY__FACILITY__WRes << 16) | 0x80000000))) + +// (FACILITY_ERRNO = 0x800) is 7zip's FACILITY constant to represent (errno) errors in HRESULT +#define MY_FACILITY_ERRNO 0x800 +#define MY_FACILITY_WIN32 7 +#define MY_FACILITY_WRes MY_FACILITY_ERRNO + +#define MY_HRESULT_FROM_errno_CONST_ERROR(x) ((HRESULT)( \ + ( (HRESULT)(x) & 0x0000FFFF) \ + | (MY_FACILITY_WRes << 16) \ + | (HRESULT)0x80000000 )) + +#define MY_SRes_HRESULT_FROM_WRes(x) \ + ((HRESULT)(x) <= 0 ? 
((HRESULT)(x)) : MY_HRESULT_FROM_errno_CONST_ERROR(x)) + +// we call macro HRESULT_FROM_WIN32 for system errors (WRes) that are (errno) +#define HRESULT_FROM_WIN32(x) MY_SRes_HRESULT_FROM_WRes(x) + +/* +#define ERROR_FILE_NOT_FOUND 2L +#define ERROR_ACCESS_DENIED 5L +#define ERROR_NO_MORE_FILES 18L +#define ERROR_LOCK_VIOLATION 33L +#define ERROR_FILE_EXISTS 80L +#define ERROR_DISK_FULL 112L +#define ERROR_NEGATIVE_SEEK 131L +#define ERROR_ALREADY_EXISTS 183L +#define ERROR_DIRECTORY 267L +#define ERROR_TOO_MANY_POSTS 298L + +#define ERROR_INTERNAL_ERROR 1359L +#define ERROR_INVALID_REPARSE_DATA 4392L +#define ERROR_REPARSE_TAG_INVALID 4393L +#define ERROR_REPARSE_TAG_MISMATCH 4394L +*/ + +// we use errno equivalents for some WIN32 errors: + +#define ERROR_INVALID_PARAMETER EINVAL +#define ERROR_INVALID_FUNCTION EINVAL +#define ERROR_ALREADY_EXISTS EEXIST +#define ERROR_FILE_EXISTS EEXIST +#define ERROR_PATH_NOT_FOUND ENOENT +#define ERROR_FILE_NOT_FOUND ENOENT +#define ERROR_DISK_FULL ENOSPC +// #define ERROR_INVALID_HANDLE EBADF + +// we use FACILITY_WIN32 for errors that has no errno equivalent +// Too many posts were made to a semaphore. +#define ERROR_TOO_MANY_POSTS ((HRESULT)0x8007012AL) +#define ERROR_INVALID_REPARSE_DATA ((HRESULT)0x80071128L) +#define ERROR_REPARSE_TAG_INVALID ((HRESULT)0x80071129L) + +// if (MY_FACILITY_WRes != FACILITY_WIN32), +// we use FACILITY_WIN32 for COM errors: +#define E_OUTOFMEMORY ((HRESULT)0x8007000EL) +#define E_INVALIDARG ((HRESULT)0x80070057L) +#define MY_E_ERROR_NEGATIVE_SEEK ((HRESULT)0x80070083L) + +/* +// we can use FACILITY_ERRNO for some COM errors, that have errno equivalents: +#define E_OUTOFMEMORY MY_HRESULT_FROM_errno_CONST_ERROR(ENOMEM) +#define E_INVALIDARG MY_HRESULT_FROM_errno_CONST_ERROR(EINVAL) +#define MY_E_ERROR_NEGATIVE_SEEK MY_HRESULT_FROM_errno_CONST_ERROR(EINVAL) +*/ + +#define TEXT(quote) quote + +#define FILE_ATTRIBUTE_READONLY 0x0001 +#define FILE_ATTRIBUTE_HIDDEN 0x0002 +#define FILE_ATTRIBUTE_SYSTEM 0x0004 +#define FILE_ATTRIBUTE_DIRECTORY 0x0010 +#define FILE_ATTRIBUTE_ARCHIVE 0x0020 +#define FILE_ATTRIBUTE_DEVICE 0x0040 +#define FILE_ATTRIBUTE_NORMAL 0x0080 +#define FILE_ATTRIBUTE_TEMPORARY 0x0100 +#define FILE_ATTRIBUTE_SPARSE_FILE 0x0200 +#define FILE_ATTRIBUTE_REPARSE_POINT 0x0400 +#define FILE_ATTRIBUTE_COMPRESSED 0x0800 +#define FILE_ATTRIBUTE_OFFLINE 0x1000 +#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x2000 +#define FILE_ATTRIBUTE_ENCRYPTED 0x4000 + +#define FILE_ATTRIBUTE_UNIX_EXTENSION 0x8000 /* trick for Unix */ #endif #ifndef RINOK -#define RINOK(x) { int __result__ = (x); if (__result__ != 0) return __result__; } +#define RINOK(x) { const int _result_ = (x); if (_result_ != 0) return _result_; } +#endif + +#ifndef RINOK_WRes +#define RINOK_WRes(x) { const WRes _result_ = (x); if (_result_ != 0) return _result_; } #endif typedef unsigned char Byte; typedef short Int16; typedef unsigned short UInt16; -#ifdef _LZMA_UINT32_IS_ULONG +#ifdef Z7_DECL_Int32_AS_long typedef long Int32; typedef unsigned long UInt32; #else @@ -75,34 +180,82 @@ typedef int Int32; typedef unsigned int UInt32; #endif -#ifdef _SZ_NO_INT_64 -/* define _SZ_NO_INT_64, if your compiler doesn't support 64-bit integers. - NOTES: Some code will work incorrectly in that case! 
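Worked example (editorial, not part of the upstream diff): with MY_FACILITY_WRes now equal to MY_FACILITY_ERRNO (0x800), a positive errno-style WRes such as ENOENT (value 2 on Linux) maps to (2 & 0xFFFF) | (0x800 << 16) | 0x80000000 = 0x88000002, while zero and negative values pass through unchanged. A small self-check for a non-Windows build:

#include <assert.h>
#include <errno.h>
#include "7zTypes.h"

int main(void)
{
  /* non-Windows branch: MY_SRes_HRESULT_FROM_WRes() packs errno into facility 0x800 */
  assert(MY_SRes_HRESULT_FROM_WRes(0) == 0);
  assert((UInt32)MY_SRes_HRESULT_FROM_WRes(ENOENT) == (0x88000000u | (UInt32)ENOENT));
  return 0;
}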
*/ +#ifndef _WIN32 + +typedef int INT; +typedef Int32 INT32; +typedef unsigned int UINT; +typedef UInt32 UINT32; +typedef INT32 LONG; // LONG, ULONG and DWORD must be 32-bit for _WIN32 compatibility +typedef UINT32 ULONG; + +#undef DWORD +typedef UINT32 DWORD; + +#define VOID void + +#define HRESULT LONG + +typedef void *LPVOID; +// typedef void VOID; +// typedef ULONG_PTR DWORD_PTR, *PDWORD_PTR; +// gcc / clang on Unix : sizeof(long==sizeof(void*) in 32 or 64 bits) +typedef long INT_PTR; +typedef unsigned long UINT_PTR; +typedef long LONG_PTR; +typedef unsigned long DWORD_PTR; + +typedef size_t SIZE_T; + +#endif // _WIN32 + + +#define MY_HRES_ERROR_INTERNAL_ERROR ((HRESULT)0x8007054FL) + + +#ifdef Z7_DECL_Int64_AS_long typedef long Int64; typedef unsigned long UInt64; #else -#if defined(_MSC_VER) || defined(__BORLANDC__) +#if (defined(_MSC_VER) || defined(__BORLANDC__)) && !defined(__clang__) typedef __int64 Int64; typedef unsigned __int64 UInt64; -#define UINT64_CONST(n) n +#else +#if defined(__clang__) || defined(__GNUC__) +#include +typedef int64_t Int64; +typedef uint64_t UInt64; #else typedef long long int Int64; typedef unsigned long long int UInt64; -#define UINT64_CONST(n) n ## ULL +// #define UINT64_CONST(n) n ## ULL +#endif #endif #endif -#ifdef _LZMA_NO_SYSTEM_SIZE_T -typedef UInt32 SizeT; +#define UINT64_CONST(n) n + + +#ifdef Z7_DECL_SizeT_AS_unsigned_int +typedef unsigned int SizeT; #else typedef size_t SizeT; #endif +/* +#if (defined(_MSC_VER) && _MSC_VER <= 1200) +typedef size_t MY_uintptr_t; +#else +#include +typedef uintptr_t MY_uintptr_t; +#endif +*/ + typedef int BoolInt; /* typedef BoolInt Bool; */ #define True 1 @@ -110,81 +263,99 @@ typedef int BoolInt; #ifdef _WIN32 -#define MY_STD_CALL __stdcall +#define Z7_STDCALL __stdcall #else -#define MY_STD_CALL +#define Z7_STDCALL #endif #ifdef _MSC_VER #if _MSC_VER >= 1300 -#define MY_NO_INLINE __declspec(noinline) +#define Z7_NO_INLINE __declspec(noinline) #else -#define MY_NO_INLINE +#define Z7_NO_INLINE #endif -#define MY_FORCE_INLINE __forceinline +#define Z7_FORCE_INLINE __forceinline -#define MY_CDECL __cdecl -#define MY_FAST_CALL __fastcall +#define Z7_CDECL __cdecl +#define Z7_FASTCALL __fastcall -#else +#else // _MSC_VER -#define MY_NO_INLINE -#define MY_FORCE_INLINE -#define MY_CDECL -#define MY_FAST_CALL +#if (defined(__GNUC__) && (__GNUC__ >= 4)) \ + || (defined(__clang__) && (__clang_major__ >= 4)) \ + || defined(__INTEL_COMPILER) \ + || defined(__xlC__) +#define Z7_NO_INLINE __attribute__((noinline)) +#define Z7_FORCE_INLINE __attribute__((always_inline)) inline +#else +#define Z7_NO_INLINE +#define Z7_FORCE_INLINE +#endif -/* inline keyword : for C++ / C99 */ +#define Z7_CDECL -/* GCC, clang: */ -/* -#if defined (__GNUC__) && (__GNUC__ >= 4) -#define MY_FORCE_INLINE __attribute__((always_inline)) -#define MY_NO_INLINE __attribute__((noinline)) +#if defined(_M_IX86) \ + || defined(__i386__) +// #define Z7_FASTCALL __attribute__((fastcall)) +// #define Z7_FASTCALL __attribute__((cdecl)) +#define Z7_FASTCALL +#elif defined(MY_CPU_AMD64) +// #define Z7_FASTCALL __attribute__((ms_abi)) +#define Z7_FASTCALL +#else +#define Z7_FASTCALL #endif -*/ -#endif +#endif // _MSC_VER /* The following interfaces use first parameter as pointer to structure */ -typedef struct IByteIn IByteIn; -struct IByteIn +// #define Z7_C_IFACE_CONST_QUAL +#define Z7_C_IFACE_CONST_QUAL const + +#define Z7_C_IFACE_DECL(a) \ + struct a ## _; \ + typedef Z7_C_IFACE_CONST_QUAL struct a ## _ * a ## Ptr; \ + typedef struct a ## _ a; \ + struct a 
## _ + + +Z7_C_IFACE_DECL (IByteIn) { - Byte (*Read)(const IByteIn *p); /* reads one byte, returns 0 in case of EOF or error */ + Byte (*Read)(IByteInPtr p); /* reads one byte, returns 0 in case of EOF or error */ }; #define IByteIn_Read(p) (p)->Read(p) -typedef struct IByteOut IByteOut; -struct IByteOut +Z7_C_IFACE_DECL (IByteOut) { - void (*Write)(const IByteOut *p, Byte b); + void (*Write)(IByteOutPtr p, Byte b); }; #define IByteOut_Write(p, b) (p)->Write(p, b) -typedef struct ISeqInStream ISeqInStream; -struct ISeqInStream +Z7_C_IFACE_DECL (ISeqInStream) { - SRes (*Read)(const ISeqInStream *p, void *buf, size_t *size); + SRes (*Read)(ISeqInStreamPtr p, void *buf, size_t *size); /* if (input(*size) != 0 && output(*size) == 0) means end_of_stream. (output(*size) < input(*size)) is allowed */ }; #define ISeqInStream_Read(p, buf, size) (p)->Read(p, buf, size) +/* try to read as much as avail in stream and limited by (*processedSize) */ +SRes SeqInStream_ReadMax(ISeqInStreamPtr stream, void *buf, size_t *processedSize); /* it can return SZ_ERROR_INPUT_EOF */ -SRes SeqInStream_Read(const ISeqInStream *stream, void *buf, size_t size); -SRes SeqInStream_Read2(const ISeqInStream *stream, void *buf, size_t size, SRes errorType); -SRes SeqInStream_ReadByte(const ISeqInStream *stream, Byte *buf); +// SRes SeqInStream_Read(ISeqInStreamPtr stream, void *buf, size_t size); +// SRes SeqInStream_Read2(ISeqInStreamPtr stream, void *buf, size_t size, SRes errorType); +SRes SeqInStream_ReadByte(ISeqInStreamPtr stream, Byte *buf); -typedef struct ISeqOutStream ISeqOutStream; -struct ISeqOutStream +Z7_C_IFACE_DECL (ISeqOutStream) { - size_t (*Write)(const ISeqOutStream *p, const void *buf, size_t size); + size_t (*Write)(ISeqOutStreamPtr p, const void *buf, size_t size); /* Returns: result - the number of actually written bytes. (result < size) means error */ }; @@ -198,29 +369,26 @@ typedef enum } ESzSeek; -typedef struct ISeekInStream ISeekInStream; -struct ISeekInStream +Z7_C_IFACE_DECL (ISeekInStream) { - SRes (*Read)(const ISeekInStream *p, void *buf, size_t *size); /* same as ISeqInStream::Read */ - SRes (*Seek)(const ISeekInStream *p, Int64 *pos, ESzSeek origin); + SRes (*Read)(ISeekInStreamPtr p, void *buf, size_t *size); /* same as ISeqInStream::Read */ + SRes (*Seek)(ISeekInStreamPtr p, Int64 *pos, ESzSeek origin); }; #define ISeekInStream_Read(p, buf, size) (p)->Read(p, buf, size) #define ISeekInStream_Seek(p, pos, origin) (p)->Seek(p, pos, origin) -typedef struct ILookInStream ILookInStream; -struct ILookInStream +Z7_C_IFACE_DECL (ILookInStream) { - SRes (*Look)(const ILookInStream *p, const void **buf, size_t *size); + SRes (*Look)(ILookInStreamPtr p, const void **buf, size_t *size); /* if (input(*size) != 0 && output(*size) == 0) means end_of_stream. (output(*size) > input(*size)) is not allowed (output(*size) < input(*size)) is allowed */ - SRes (*Skip)(const ILookInStream *p, size_t offset); + SRes (*Skip)(ILookInStreamPtr p, size_t offset); /* offset must be <= output(*size) of Look */ - - SRes (*Read)(const ILookInStream *p, void *buf, size_t *size); + SRes (*Read)(ILookInStreamPtr p, void *buf, size_t *size); /* reads directly (without buffer). 
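Illustrative sketch (not part of the upstream diff): with the Z7_C_IFACE_DECL macro above, every callback interface now comes with a const-qualified pointer typedef (ISeqInStreamPtr and friends), and a user-defined stream still only fills in the vtable. Below is a hypothetical in-memory ISeqInStream; the SDK's own code recovers the containing object with the Z7_CONTAINER_FROM_VTBL helpers, while this sketch keeps the vtable as the first member so a cast through uintptr_t (which also drops the const qualifier) is enough.

#include <stdint.h>
#include <string.h>
#include "7zTypes.h"

typedef struct
{
  ISeqInStream vt;       /* must stay the first member for the cast below */
  const Byte *data;
  size_t rem;
} CBufInStream;

static SRes BufInStream_Read(ISeqInStreamPtr pp, void *buf, size_t *size)
{
  CBufInStream *p = (CBufInStream *)(uintptr_t)pp;
  size_t cur = *size;
  if (cur > p->rem)
    cur = p->rem;
  memcpy(buf, p->data, cur);
  p->data += cur;
  p->rem -= cur;
  *size = cur;           /* 0 here (with nonzero input) signals end of stream */
  return SZ_OK;
}

static void BufInStream_Init(CBufInStream *p, const void *data, size_t size)
{
  p->vt.Read = BufInStream_Read;
  p->data = (const Byte *)data;
  p->rem = size;
}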
It's same as ISeqInStream::Read */ - SRes (*Seek)(const ILookInStream *p, Int64 *pos, ESzSeek origin); + SRes (*Seek)(ILookInStreamPtr p, Int64 *pos, ESzSeek origin); }; #define ILookInStream_Look(p, buf, size) (p)->Look(p, buf, size) @@ -229,19 +397,18 @@ struct ILookInStream #define ILookInStream_Seek(p, pos, origin) (p)->Seek(p, pos, origin) -SRes LookInStream_LookRead(const ILookInStream *stream, void *buf, size_t *size); -SRes LookInStream_SeekTo(const ILookInStream *stream, UInt64 offset); +SRes LookInStream_LookRead(ILookInStreamPtr stream, void *buf, size_t *size); +SRes LookInStream_SeekTo(ILookInStreamPtr stream, UInt64 offset); /* reads via ILookInStream::Read */ -SRes LookInStream_Read2(const ILookInStream *stream, void *buf, size_t size, SRes errorType); -SRes LookInStream_Read(const ILookInStream *stream, void *buf, size_t size); - +SRes LookInStream_Read2(ILookInStreamPtr stream, void *buf, size_t size, SRes errorType); +SRes LookInStream_Read(ILookInStreamPtr stream, void *buf, size_t size); typedef struct { ILookInStream vt; - const ISeekInStream *realStream; + ISeekInStreamPtr realStream; size_t pos; size_t size; /* it's data size */ @@ -253,13 +420,13 @@ typedef struct void LookToRead2_CreateVTable(CLookToRead2 *p, int lookahead); -#define LookToRead2_Init(p) { (p)->pos = (p)->size = 0; } +#define LookToRead2_INIT(p) { (p)->pos = (p)->size = 0; } typedef struct { ISeqInStream vt; - const ILookInStream *realStream; + ILookInStreamPtr realStream; } CSecToLook; void SecToLook_CreateVTable(CSecToLook *p); @@ -269,20 +436,19 @@ void SecToLook_CreateVTable(CSecToLook *p); typedef struct { ISeqInStream vt; - const ILookInStream *realStream; + ILookInStreamPtr realStream; } CSecToRead; void SecToRead_CreateVTable(CSecToRead *p); -typedef struct ICompressProgress ICompressProgress; - -struct ICompressProgress +Z7_C_IFACE_DECL (ICompressProgress) { - SRes (*Progress)(const ICompressProgress *p, UInt64 inSize, UInt64 outSize); + SRes (*Progress)(ICompressProgressPtr p, UInt64 inSize, UInt64 outSize); /* Returns: result. (result != SZ_OK) means break. Value (UInt64)(Int64)-1 for size means unknown value. */ }; + #define ICompressProgress_Progress(p, inSize, outSize) (p)->Progress(p, inSize, outSize) @@ -320,13 +486,13 @@ struct ISzAlloc -#ifndef MY_container_of +#ifndef Z7_container_of /* -#define MY_container_of(ptr, type, m) container_of(ptr, type, m) -#define MY_container_of(ptr, type, m) CONTAINING_RECORD(ptr, type, m) -#define MY_container_of(ptr, type, m) ((type *)((char *)(ptr) - offsetof(type, m))) -#define MY_container_of(ptr, type, m) (&((type *)0)->m == (ptr), ((type *)(((char *)(ptr)) - MY_offsetof(type, m)))) +#define Z7_container_of(ptr, type, m) container_of(ptr, type, m) +#define Z7_container_of(ptr, type, m) CONTAINING_RECORD(ptr, type, m) +#define Z7_container_of(ptr, type, m) ((type *)((char *)(ptr) - offsetof(type, m))) +#define Z7_container_of(ptr, type, m) (&((type *)0)->m == (ptr), ((type *)(((char *)(ptr)) - MY_offsetof(type, m)))) */ /* @@ -335,23 +501,63 @@ struct ISzAlloc GCC 4.8.1 : classes with non-public variable members" */ -#define MY_container_of(ptr, type, m) ((type *)((char *)(1 ? (ptr) : &((type *)0)->m) - MY_offsetof(type, m))) +#define Z7_container_of(ptr, type, m) \ + ((type *)(void *)((char *)(void *) \ + (1 ? (ptr) : &((type *)NULL)->m) - MY_offsetof(type, m))) + +#define Z7_container_of_CONST(ptr, type, m) \ + ((const type *)(const void *)((const char *)(const void *) \ + (1 ? 
(ptr) : &((type *)NULL)->m) - MY_offsetof(type, m))) +/* +#define Z7_container_of_NON_CONST_FROM_CONST(ptr, type, m) \ + ((type *)(void *)(const void *)((const char *)(const void *) \ + (1 ? (ptr) : &((type *)NULL)->m) - MY_offsetof(type, m))) +*/ #endif -#define CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) ((type *)(ptr)) +#define Z7_CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) ((type *)(void *)(ptr)) -/* -#define CONTAINER_FROM_VTBL(ptr, type, m) CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) -*/ -#define CONTAINER_FROM_VTBL(ptr, type, m) MY_container_of(ptr, type, m) +// #define Z7_CONTAINER_FROM_VTBL(ptr, type, m) Z7_CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) +#define Z7_CONTAINER_FROM_VTBL(ptr, type, m) Z7_container_of(ptr, type, m) +// #define Z7_CONTAINER_FROM_VTBL(ptr, type, m) Z7_container_of_NON_CONST_FROM_CONST(ptr, type, m) + +#define Z7_CONTAINER_FROM_VTBL_CONST(ptr, type, m) Z7_container_of_CONST(ptr, type, m) -#define CONTAINER_FROM_VTBL_CLS(ptr, type, m) CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) +#define Z7_CONTAINER_FROM_VTBL_CLS(ptr, type, m) Z7_CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) /* -#define CONTAINER_FROM_VTBL_CLS(ptr, type, m) CONTAINER_FROM_VTBL(ptr, type, m) +#define Z7_CONTAINER_FROM_VTBL_CLS(ptr, type, m) Z7_CONTAINER_FROM_VTBL(ptr, type, m) */ +#if defined (__clang__) || defined(__GNUC__) +#define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") +#define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL \ + _Pragma("GCC diagnostic pop") +#else +#define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL +#define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL +#endif + +#define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(ptr, type, m, p) \ + Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \ + type *p = Z7_CONTAINER_FROM_VTBL(ptr, type, m); \ + Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL + +#define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(type) \ + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(pp, type, vt, p) + +// #define ZIP7_DECLARE_HANDLE(name) typedef void *name; +#define Z7_DECLARE_HANDLE(name) struct name##_dummy{int unused;}; typedef struct name##_dummy *name; + + +#define Z7_memset_0_ARRAY(a) memset((a), 0, sizeof(a)) + +#ifndef Z7_ARRAY_SIZE +#define Z7_ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])) +#endif #ifdef _WIN32 @@ -370,6 +576,22 @@ struct ISzAlloc #endif +#define k_PropVar_TimePrec_0 0 +#define k_PropVar_TimePrec_Unix 1 +#define k_PropVar_TimePrec_DOS 2 +#define k_PropVar_TimePrec_HighPrec 3 +#define k_PropVar_TimePrec_Base 16 +#define k_PropVar_TimePrec_100ns (k_PropVar_TimePrec_Base + 7) +#define k_PropVar_TimePrec_1ns (k_PropVar_TimePrec_Base + 9) + EXTERN_C_END #endif + +/* +#ifndef Z7_ST +#ifdef _7ZIP_ST +#define Z7_ST +#endif +#endif +*/ diff --git a/src/sdk/C/7zVersion.h b/src/sdk/C/7zVersion.h index c176823..b6142e9 100644 --- a/src/sdk/C/7zVersion.h +++ b/src/sdk/C/7zVersion.h @@ -1,7 +1,7 @@ -#define MY_VER_MAJOR 19 -#define MY_VER_MINOR 00 +#define MY_VER_MAJOR 25 +#define MY_VER_MINOR 1 #define MY_VER_BUILD 0 -#define MY_VERSION_NUMBERS "19.00" +#define MY_VERSION_NUMBERS "25.01" #define MY_VERSION MY_VERSION_NUMBERS #ifdef MY_CPU_NAME @@ -10,12 +10,12 @@ #define MY_VERSION_CPU MY_VERSION #endif -#define MY_DATE "2019-02-21" +#define MY_DATE "2025-08-03" #undef MY_COPYRIGHT #undef MY_VERSION_COPYRIGHT_DATE #define MY_AUTHOR_NAME "Igor Pavlov" #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" -#define MY_COPYRIGHT_CR "Copyright (c) 1999-2018 Igor Pavlov" +#define MY_COPYRIGHT_CR "Copyright (c) 1999-2025 Igor Pavlov" #ifdef USE_COPYRIGHT_CR #define 
MY_COPYRIGHT MY_COPYRIGHT_CR diff --git a/src/sdk/C/7zVersion.rc b/src/sdk/C/7zVersion.rc index e520995..6ed26de 100644 --- a/src/sdk/C/7zVersion.rc +++ b/src/sdk/C/7zVersion.rc @@ -1,55 +1,55 @@ -#define MY_VS_FFI_FILEFLAGSMASK 0x0000003FL -#define MY_VOS_NT_WINDOWS32 0x00040004L -#define MY_VOS_CE_WINDOWS32 0x00050004L - -#define MY_VFT_APP 0x00000001L -#define MY_VFT_DLL 0x00000002L - -// #include - -#ifndef MY_VERSION -#include "7zVersion.h" -#endif - -#define MY_VER MY_VER_MAJOR,MY_VER_MINOR,MY_VER_BUILD,0 - -#ifdef DEBUG -#define DBG_FL VS_FF_DEBUG -#else -#define DBG_FL 0 -#endif - -#define MY_VERSION_INFO(fileType, descr, intName, origName) \ -LANGUAGE 9, 1 \ -1 VERSIONINFO \ - FILEVERSION MY_VER \ - PRODUCTVERSION MY_VER \ - FILEFLAGSMASK MY_VS_FFI_FILEFLAGSMASK \ - FILEFLAGS DBG_FL \ - FILEOS MY_VOS_NT_WINDOWS32 \ - FILETYPE fileType \ - FILESUBTYPE 0x0L \ -BEGIN \ - BLOCK "StringFileInfo" \ - BEGIN \ - BLOCK "040904b0" \ - BEGIN \ - VALUE "CompanyName", "Igor Pavlov" \ - VALUE "FileDescription", descr \ - VALUE "FileVersion", MY_VERSION \ - VALUE "InternalName", intName \ - VALUE "LegalCopyright", MY_COPYRIGHT \ - VALUE "OriginalFilename", origName \ - VALUE "ProductName", "7-Zip" \ - VALUE "ProductVersion", MY_VERSION \ - END \ - END \ - BLOCK "VarFileInfo" \ - BEGIN \ - VALUE "Translation", 0x409, 1200 \ - END \ -END - -#define MY_VERSION_INFO_APP(descr, intName) MY_VERSION_INFO(MY_VFT_APP, descr, intName, intName ".exe") - -#define MY_VERSION_INFO_DLL(descr, intName) MY_VERSION_INFO(MY_VFT_DLL, descr, intName, intName ".dll") +#define MY_VS_FFI_FILEFLAGSMASK 0x0000003FL +#define MY_VOS_NT_WINDOWS32 0x00040004L +#define MY_VOS_CE_WINDOWS32 0x00050004L + +#define MY_VFT_APP 0x00000001L +#define MY_VFT_DLL 0x00000002L + +// #include + +#ifndef MY_VERSION +#include "7zVersion.h" +#endif + +#define MY_VER MY_VER_MAJOR,MY_VER_MINOR,MY_VER_BUILD,0 + +#ifdef DEBUG +#define DBG_FL VS_FF_DEBUG +#else +#define DBG_FL 0 +#endif + +#define MY_VERSION_INFO(fileType, descr, intName, origName) \ +LANGUAGE 9, 1 \ +1 VERSIONINFO \ + FILEVERSION MY_VER \ + PRODUCTVERSION MY_VER \ + FILEFLAGSMASK MY_VS_FFI_FILEFLAGSMASK \ + FILEFLAGS DBG_FL \ + FILEOS MY_VOS_NT_WINDOWS32 \ + FILETYPE fileType \ + FILESUBTYPE 0x0L \ +BEGIN \ + BLOCK "StringFileInfo" \ + BEGIN \ + BLOCK "040904b0" \ + BEGIN \ + VALUE "CompanyName", "Igor Pavlov" \ + VALUE "FileDescription", descr \ + VALUE "FileVersion", MY_VERSION \ + VALUE "InternalName", intName \ + VALUE "LegalCopyright", MY_COPYRIGHT \ + VALUE "OriginalFilename", origName \ + VALUE "ProductName", "7-Zip" \ + VALUE "ProductVersion", MY_VERSION \ + END \ + END \ + BLOCK "VarFileInfo" \ + BEGIN \ + VALUE "Translation", 0x409, 1200 \ + END \ +END + +#define MY_VERSION_INFO_APP(descr, intName) MY_VERSION_INFO(MY_VFT_APP, descr, intName, intName ".exe") + +#define MY_VERSION_INFO_DLL(descr, intName) MY_VERSION_INFO(MY_VFT_DLL, descr, intName, intName ".dll") diff --git a/src/sdk/C/7zWindows.h b/src/sdk/C/7zWindows.h new file mode 100644 index 0000000..42c6db8 --- /dev/null +++ b/src/sdk/C/7zWindows.h @@ -0,0 +1,101 @@ +/* 7zWindows.h -- StdAfx +2023-04-02 : Igor Pavlov : Public domain */ + +#ifndef ZIP7_INC_7Z_WINDOWS_H +#define ZIP7_INC_7Z_WINDOWS_H + +#ifdef _WIN32 + +#if defined(__clang__) +# pragma clang diagnostic push +#endif + +#if defined(_MSC_VER) + +#pragma warning(push) +#pragma warning(disable : 4668) // '_WIN32_WINNT' is not defined as a preprocessor macro, replacing with '0' for '#if/#elif' + +#if _MSC_VER == 1900 +// for old kit10 versions +// 
#pragma warning(disable : 4255) // winuser.h(13979): warning C4255: 'GetThreadDpiAwarenessContext': +#endif +// win10 Windows Kit: +#endif // _MSC_VER + +#if defined(_MSC_VER) && _MSC_VER <= 1200 && !defined(_WIN64) +// for msvc6 without sdk2003 +#define RPC_NO_WINDOWS_H +#endif + +#if defined(__MINGW32__) || defined(__MINGW64__) +// #if defined(__GNUC__) && !defined(__clang__) +#include +#else +#include +#endif +// #include +// #include + +// but if precompiled with clang-cl then we need +// #include +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +#if defined(__clang__) +# pragma clang diagnostic pop +#endif + +#if defined(_MSC_VER) && _MSC_VER <= 1200 && !defined(_WIN64) +#ifndef _W64 + +typedef long LONG_PTR, *PLONG_PTR; +typedef unsigned long ULONG_PTR, *PULONG_PTR; +typedef ULONG_PTR DWORD_PTR, *PDWORD_PTR; + +#define Z7_OLD_WIN_SDK +#endif // _W64 +#endif // _MSC_VER == 1200 + +#ifdef Z7_OLD_WIN_SDK + +#ifndef INVALID_FILE_ATTRIBUTES +#define INVALID_FILE_ATTRIBUTES ((DWORD)-1) +#endif +#ifndef INVALID_SET_FILE_POINTER +#define INVALID_SET_FILE_POINTER ((DWORD)-1) +#endif +#ifndef FILE_SPECIAL_ACCESS +#define FILE_SPECIAL_ACCESS (FILE_ANY_ACCESS) +#endif + +// ShlObj.h: +// #define BIF_NEWDIALOGSTYLE 0x0040 + +#pragma warning(disable : 4201) +// #pragma warning(disable : 4115) + +#undef VARIANT_TRUE +#define VARIANT_TRUE ((VARIANT_BOOL)-1) +#endif + +#endif // Z7_OLD_WIN_SDK + +#ifdef UNDER_CE +#undef VARIANT_TRUE +#define VARIANT_TRUE ((VARIANT_BOOL)-1) +#endif + + +#if defined(_MSC_VER) +#if _MSC_VER >= 1400 && _MSC_VER <= 1600 + // BaseTsd.h(148) : 'HandleToULong' : unreferenced inline function has been removed + // string.h + // #pragma warning(disable : 4514) +#endif +#endif + + +/* #include "7zTypes.h" */ + +#endif diff --git a/src/sdk/C/Aes.c b/src/sdk/C/Aes.c index 1cdd0e7..abc5d24 100644 --- a/src/sdk/C/Aes.c +++ b/src/sdk/C/Aes.c @@ -1,12 +1,21 @@ /* Aes.c -- AES encryption / decryption -2017-01-24 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" -#include "Aes.h" #include "CpuArch.h" +#include "Aes.h" +AES_CODE_FUNC g_AesCbc_Decode; +#ifndef Z7_SFX +AES_CODE_FUNC g_AesCbc_Encode; +AES_CODE_FUNC g_AesCtr_Code; +UInt32 g_Aes_SupportedFunctions_Flags; +#endif + +MY_ALIGN(64) static UInt32 T[256 * 4]; +MY_ALIGN(64) static const Byte Sbox[256] = { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, @@ -25,23 +34,12 @@ static const Byte Sbox[256] = { 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; -void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks); - -void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *ivAes, Byte *data, size_t numBlocks); - -AES_CODE_FUNC g_AesCbc_Encode; -AES_CODE_FUNC g_AesCbc_Decode; -AES_CODE_FUNC g_AesCtr_Code; +MY_ALIGN(64) static UInt32 D[256 * 4]; +MY_ALIGN(64) static Byte InvS[256]; -static const Byte Rcon[11] = { 0x00, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 
}; - #define xtime(x) ((((x) << 1) ^ (((x) & 0x80) != 0 ? 0x1B : 0)) & 0xFF) #define Ui32(a0, a1, a2, a3) ((UInt32)(a0) | ((UInt32)(a1) << 8) | ((UInt32)(a2) << 16) | ((UInt32)(a3) << 24)) @@ -57,6 +55,66 @@ static const Byte Rcon[11] = { 0x00, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0 #define DD(x) (D + (x << 8)) +// #define Z7_SHOW_AES_STATUS + +#ifdef MY_CPU_X86_OR_AMD64 + + #if defined(__INTEL_COMPILER) + #if (__INTEL_COMPILER >= 1110) + #define USE_HW_AES + #if (__INTEL_COMPILER >= 1900) + #define USE_HW_VAES + #endif + #endif + #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400) + #define USE_HW_AES + #if defined(__clang__) && (__clang_major__ >= 8) \ + || defined(__GNUC__) && (__GNUC__ >= 8) + #define USE_HW_VAES + #endif + #elif defined(_MSC_VER) + #define USE_HW_AES + #define USE_HW_VAES + #endif + +#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) + + #if defined(__ARM_FEATURE_AES) \ + || defined(__ARM_FEATURE_CRYPTO) + #define USE_HW_AES + #else + #if defined(MY_CPU_ARM64) \ + || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ + || defined(Z7_MSC_VER_ORIGINAL) + #if defined(__ARM_FP) && \ + ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) \ + ) \ + || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) + #if defined(MY_CPU_ARM64) \ + || !defined(Z7_CLANG_VERSION) \ + || defined(__ARM_NEON) && \ + (Z7_CLANG_VERSION < 170000 || \ + Z7_CLANG_VERSION > 170001) + #define USE_HW_AES + #endif + #endif + #endif + #endif +#endif + +#ifdef USE_HW_AES +// #pragma message("=== Aes.c USE_HW_AES === ") +#ifdef Z7_SHOW_AES_STATUS +#include +#define PRF(x) x +#else +#define PRF(x) +#endif +#endif + + void AesGenTables(void) { unsigned i; @@ -66,23 +124,23 @@ void AesGenTables(void) for (i = 0; i < 256; i++) { { - UInt32 a1 = Sbox[i]; - UInt32 a2 = xtime(a1); - UInt32 a3 = a2 ^ a1; + const UInt32 a1 = Sbox[i]; + const UInt32 a2 = xtime(a1); + const UInt32 a3 = a2 ^ a1; TT(0)[i] = Ui32(a2, a1, a1, a3); TT(1)[i] = Ui32(a3, a2, a1, a1); TT(2)[i] = Ui32(a1, a3, a2, a1); TT(3)[i] = Ui32(a1, a1, a3, a2); } { - UInt32 a1 = InvS[i]; - UInt32 a2 = xtime(a1); - UInt32 a4 = xtime(a2); - UInt32 a8 = xtime(a4); - UInt32 a9 = a8 ^ a1; - UInt32 aB = a8 ^ a2 ^ a1; - UInt32 aD = a8 ^ a4 ^ a1; - UInt32 aE = a8 ^ a4 ^ a2; + const UInt32 a1 = InvS[i]; + const UInt32 a2 = xtime(a1); + const UInt32 a4 = xtime(a2); + const UInt32 a8 = xtime(a4); + const UInt32 a9 = a8 ^ a1; + const UInt32 aB = a8 ^ a2 ^ a1; + const UInt32 aD = a8 ^ a4 ^ a1; + const UInt32 aE = a8 ^ a4 ^ a2; DD(0)[i] = Ui32(aE, a9, aD, aB); DD(1)[i] = Ui32(aB, aE, a9, aD); DD(2)[i] = Ui32(aD, aB, aE, a9); @@ -90,18 +148,50 @@ void AesGenTables(void) } } - g_AesCbc_Encode = AesCbc_Encode; - g_AesCbc_Decode = AesCbc_Decode; - g_AesCtr_Code = AesCtr_Code; + { + AES_CODE_FUNC d = AesCbc_Decode; + #ifndef Z7_SFX + AES_CODE_FUNC e = AesCbc_Encode; + AES_CODE_FUNC c = AesCtr_Code; + UInt32 flags = 0; + #endif - #ifdef MY_CPU_X86_OR_AMD64 - if (CPU_Is_Aes_Supported()) + #ifdef USE_HW_AES + if (CPU_IsSupported_AES()) { - g_AesCbc_Encode = AesCbc_Encode_Intel; - g_AesCbc_Decode = AesCbc_Decode_Intel; - g_AesCtr_Code = AesCtr_Code_Intel; + // #pragma message ("AES HW") + PRF(printf("\n===AES HW\n")); + d = AesCbc_Decode_HW; + + #ifndef Z7_SFX + e = AesCbc_Encode_HW; + c = AesCtr_Code_HW; + flags = k_Aes_SupportedFunctions_HW; + #endif + + #ifdef MY_CPU_X86_OR_AMD64 + #ifdef USE_HW_VAES + if (CPU_IsSupported_VAES_AVX2()) + { + 
PRF(printf("\n===vaes avx2\n")); + d = AesCbc_Decode_HW_256; + #ifndef Z7_SFX + c = AesCtr_Code_HW_256; + flags |= k_Aes_SupportedFunctions_HW_256; + #endif + } + #endif + #endif } #endif + + g_AesCbc_Decode = d; + #ifndef Z7_SFX + g_AesCbc_Encode = e; + g_AesCtr_Code = c; + g_Aes_SupportedFunctions_Flags = flags; + #endif + } } @@ -140,10 +230,13 @@ void AesGenTables(void) #define FD(i, x) InvS[gb(x, m[(i - x) & 3])] #define FD4(i) dest[i] = Ui32(FD(i, 0), FD(i, 1), FD(i, 2), FD(i, 3)) ^ w[i]; -void MY_FAST_CALL Aes_SetKey_Enc(UInt32 *w, const Byte *key, unsigned keySize) +void Z7_FASTCALL Aes_SetKey_Enc(UInt32 *w, const Byte *key, unsigned keySize) { - unsigned i, wSize; - wSize = keySize + 28; + unsigned i, m; + const UInt32 *wLim; + UInt32 t; + UInt32 rcon = 1; + keySize /= 4; w[0] = ((UInt32)keySize / 2) + 3; w += 4; @@ -151,19 +244,29 @@ void MY_FAST_CALL Aes_SetKey_Enc(UInt32 *w, const Byte *key, unsigned keySize) for (i = 0; i < keySize; i++, key += 4) w[i] = GetUi32(key); - for (; i < wSize; i++) + t = w[(size_t)keySize - 1]; + wLim = w + (size_t)keySize * 3 + 28; + m = 0; + do { - UInt32 t = w[(size_t)i - 1]; - unsigned rem = i % keySize; - if (rem == 0) - t = Ui32(Sbox[gb1(t)] ^ Rcon[i / keySize], Sbox[gb2(t)], Sbox[gb3(t)], Sbox[gb0(t)]); - else if (keySize > 6 && rem == 4) + if (m == 0) + { + t = Ui32(Sbox[gb1(t)] ^ rcon, Sbox[gb2(t)], Sbox[gb3(t)], Sbox[gb0(t)]); + rcon <<= 1; + if (rcon & 0x100) + rcon = 0x1b; + m = keySize; + } + else if (m == 4 && keySize > 6) t = Ui32(Sbox[gb0(t)], Sbox[gb1(t)], Sbox[gb2(t)], Sbox[gb3(t)]); - w[i] = w[i - keySize] ^ t; + m--; + t ^= w[0]; + w[keySize] = t; } + while (++w != wLim); } -void MY_FAST_CALL Aes_SetKey_Dec(UInt32 *w, const Byte *key, unsigned keySize) +void Z7_FASTCALL Aes_SetKey_Dec(UInt32 *w, const Byte *key, unsigned keySize) { unsigned i, num; Aes_SetKey_Enc(w, key, keySize); @@ -184,6 +287,7 @@ void MY_FAST_CALL Aes_SetKey_Dec(UInt32 *w, const Byte *key, unsigned keySize) src and dest are pointers to 4 UInt32 words. 
src and dest can point to same block */ +// Z7_FORCE_INLINE static void Aes_Encode(const UInt32 *w, UInt32 *dest, const UInt32 *src) { UInt32 s[4]; @@ -197,16 +301,20 @@ static void Aes_Encode(const UInt32 *w, UInt32 *dest, const UInt32 *src) w += 4; for (;;) { - HT16(m, s, 0); + HT16(m, s, 0) if (--numRounds2 == 0) break; - HT16(s, m, 4); + HT16(s, m, 4) w += 8; } w += 4; - FT4(0); FT4(1); FT4(2); FT4(3); + FT4(0) + FT4(1) + FT4(2) + FT4(3) } +Z7_FORCE_INLINE static void Aes_Decode(const UInt32 *w, UInt32 *dest, const UInt32 *src) { UInt32 s[4]; @@ -220,12 +328,15 @@ static void Aes_Decode(const UInt32 *w, UInt32 *dest, const UInt32 *src) for (;;) { w -= 8; - HD16(m, s, 4); + HD16(m, s, 4) if (--numRounds2 == 0) break; - HD16(s, m, 0); + HD16(s, m, 0) } - FD4(0); FD4(1); FD4(2); FD4(3); + FD4(0) + FD4(1) + FD4(2) + FD4(3) } void AesCbc_Init(UInt32 *p, const Byte *iv) @@ -235,7 +346,7 @@ void AesCbc_Init(UInt32 *p, const Byte *iv) p[i] = GetUi32(iv + i * 4); } -void MY_FAST_CALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks) +void Z7_FASTCALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks) { for (; numBlocks != 0; numBlocks--, data += AES_BLOCK_SIZE) { @@ -246,14 +357,14 @@ void MY_FAST_CALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks) Aes_Encode(p + 4, p, p); - SetUi32(data, p[0]); - SetUi32(data + 4, p[1]); - SetUi32(data + 8, p[2]); - SetUi32(data + 12, p[3]); + SetUi32(data, p[0]) + SetUi32(data + 4, p[1]) + SetUi32(data + 8, p[2]) + SetUi32(data + 12, p[3]) } } -void MY_FAST_CALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks) +void Z7_FASTCALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks) { UInt32 in[4], out[4]; for (; numBlocks != 0; numBlocks--, data += AES_BLOCK_SIZE) @@ -265,10 +376,10 @@ void MY_FAST_CALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks) Aes_Decode(p + 4, out, in); - SetUi32(data, p[0] ^ out[0]); - SetUi32(data + 4, p[1] ^ out[1]); - SetUi32(data + 8, p[2] ^ out[2]); - SetUi32(data + 12, p[3] ^ out[3]); + SetUi32(data, p[0] ^ out[0]) + SetUi32(data + 4, p[1] ^ out[1]) + SetUi32(data + 8, p[2] ^ out[2]) + SetUi32(data + 12, p[3] ^ out[3]) p[0] = in[0]; p[1] = in[1]; @@ -277,7 +388,7 @@ void MY_FAST_CALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks) } } -void MY_FAST_CALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks) +void Z7_FASTCALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks) { for (; numBlocks != 0; numBlocks--) { @@ -291,16 +402,28 @@ void MY_FAST_CALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks) for (i = 0; i < 4; i++, data += 4) { - UInt32 t = temp[i]; + const UInt32 t = temp[i]; #ifdef MY_CPU_LE_UNALIGN - *((UInt32 *)data) ^= t; + *((UInt32 *)(void *)data) ^= t; #else - data[0] ^= (t & 0xFF); - data[1] ^= ((t >> 8) & 0xFF); - data[2] ^= ((t >> 16) & 0xFF); - data[3] ^= ((t >> 24)); + data[0] = (Byte)(data[0] ^ (t & 0xFF)); + data[1] = (Byte)(data[1] ^ ((t >> 8) & 0xFF)); + data[2] = (Byte)(data[2] ^ ((t >> 16) & 0xFF)); + data[3] = (Byte)(data[3] ^ ((t >> 24))); #endif } } } + +#undef xtime +#undef Ui32 +#undef gb0 +#undef gb1 +#undef gb2 +#undef gb3 +#undef gb +#undef TT +#undef DD +#undef USE_HW_AES +#undef PRF diff --git a/src/sdk/C/Aes.h b/src/sdk/C/Aes.h index 64979b5..7f0182a 100644 --- a/src/sdk/C/Aes.h +++ b/src/sdk/C/Aes.h @@ -1,8 +1,8 @@ /* Aes.h -- AES encryption / decryption -2013-01-18 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ -#ifndef __AES_H -#define __AES_H +#ifndef ZIP7_INC_AES_H +#define ZIP7_INC_AES_H #include "7zTypes.h" @@ 
-20,18 +20,40 @@ void AesGenTables(void); /* aes - 16-byte aligned pointer to keyMode+roundKeys sequence */ /* keySize = 16 or 24 or 32 (bytes) */ -typedef void (MY_FAST_CALL *AES_SET_KEY_FUNC)(UInt32 *aes, const Byte *key, unsigned keySize); -void MY_FAST_CALL Aes_SetKey_Enc(UInt32 *aes, const Byte *key, unsigned keySize); -void MY_FAST_CALL Aes_SetKey_Dec(UInt32 *aes, const Byte *key, unsigned keySize); +typedef void (Z7_FASTCALL *AES_SET_KEY_FUNC)(UInt32 *aes, const Byte *key, unsigned keySize); +void Z7_FASTCALL Aes_SetKey_Enc(UInt32 *aes, const Byte *key, unsigned keySize); +void Z7_FASTCALL Aes_SetKey_Dec(UInt32 *aes, const Byte *key, unsigned keySize); /* ivAes - 16-byte aligned pointer to iv+keyMode+roundKeys sequence: UInt32[AES_NUM_IVMRK_WORDS] */ void AesCbc_Init(UInt32 *ivAes, const Byte *iv); /* iv size is AES_BLOCK_SIZE */ + /* data - 16-byte aligned pointer to data */ /* numBlocks - the number of 16-byte blocks in data array */ -typedef void (MY_FAST_CALL *AES_CODE_FUNC)(UInt32 *ivAes, Byte *data, size_t numBlocks); -extern AES_CODE_FUNC g_AesCbc_Encode; +typedef void (Z7_FASTCALL *AES_CODE_FUNC)(UInt32 *ivAes, Byte *data, size_t numBlocks); + extern AES_CODE_FUNC g_AesCbc_Decode; +#ifndef Z7_SFX +extern AES_CODE_FUNC g_AesCbc_Encode; extern AES_CODE_FUNC g_AesCtr_Code; +#define k_Aes_SupportedFunctions_HW (1 << 2) +#define k_Aes_SupportedFunctions_HW_256 (1 << 3) +extern UInt32 g_Aes_SupportedFunctions_Flags; +#endif + + +#define Z7_DECLARE_AES_CODE_FUNC(funcName) \ + void Z7_FASTCALL funcName(UInt32 *ivAes, Byte *data, size_t numBlocks); + +Z7_DECLARE_AES_CODE_FUNC (AesCbc_Encode) +Z7_DECLARE_AES_CODE_FUNC (AesCbc_Decode) +Z7_DECLARE_AES_CODE_FUNC (AesCtr_Code) + +Z7_DECLARE_AES_CODE_FUNC (AesCbc_Encode_HW) +Z7_DECLARE_AES_CODE_FUNC (AesCbc_Decode_HW) +Z7_DECLARE_AES_CODE_FUNC (AesCtr_Code_HW) + +Z7_DECLARE_AES_CODE_FUNC (AesCbc_Decode_HW_256) +Z7_DECLARE_AES_CODE_FUNC (AesCtr_Code_HW_256) EXTERN_C_END diff --git a/src/sdk/C/AesOpt.c b/src/sdk/C/AesOpt.c index 9571c46..b281807 100644 --- a/src/sdk/C/AesOpt.c +++ b/src/sdk/C/AesOpt.c @@ -1,184 +1,1002 @@ -/* AesOpt.c -- Intel's AES -2017-06-08 : Igor Pavlov : Public domain */ +/* AesOpt.c -- AES optimized code for x86 AES hardware instructions +Igor Pavlov : Public domain */ #include "Precomp.h" +#include "Aes.h" #include "CpuArch.h" #ifdef MY_CPU_X86_OR_AMD64 -#if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729) -#define USE_INTEL_AES -#endif -#endif + + #if defined(__INTEL_COMPILER) + #if (__INTEL_COMPILER >= 1110) + #define USE_INTEL_AES + #if (__INTEL_COMPILER >= 1900) + #define USE_INTEL_VAES + #endif + #endif + #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400) + #define USE_INTEL_AES + #if !defined(__AES__) + #define ATTRIB_AES __attribute__((__target__("aes"))) + #endif + #if defined(__clang__) && (__clang_major__ >= 8) \ + || defined(__GNUC__) && (__GNUC__ >= 8) + #define USE_INTEL_VAES + #if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__) + #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2"))) + #endif + #endif + #elif defined(_MSC_VER) + #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729) + #define USE_INTEL_AES + #if (_MSC_VER >= 1910) + #define USE_INTEL_VAES + #endif + #endif + #ifndef USE_INTEL_AES + #define Z7_USE_AES_HW_STUB + #endif + #ifndef USE_INTEL_VAES + #define Z7_USE_VAES_HW_STUB + #endif + #endif + + #ifndef USE_INTEL_AES + // #define Z7_USE_AES_HW_STUB // for debug + #endif 
+ #ifndef USE_INTEL_VAES + // #define Z7_USE_VAES_HW_STUB // for debug + #endif + #ifdef USE_INTEL_AES #include -void MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks) +#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB) +#define AES_TYPE_keys UInt32 +#define AES_TYPE_data Byte +// #define AES_TYPE_keys __m128i +// #define AES_TYPE_data __m128i +#endif + +#ifndef ATTRIB_AES + #define ATTRIB_AES +#endif + +#define AES_FUNC_START(name) \ + void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks) + // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks) + +#define AES_FUNC_START2(name) \ +AES_FUNC_START (name); \ +ATTRIB_AES \ +AES_FUNC_START (name) + +#define MM_OP(op, dest, src) dest = op(dest, src); +#define MM_OP_m(op, src) MM_OP(op, m, src) + +#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) + +#if 1 +// use aligned SSE load/store for data. +// It is required for our Aes functions, that data is aligned for 16-bytes. +// So we can use this branch of code. +// and compiler can use fused load-op SSE instructions: +// xorps xmm0, XMMWORD PTR [rdx] +#define LOAD_128(pp) (*(__m128i *)(void *)(pp)) +#define STORE_128(pp, _v) *(__m128i *)(void *)(pp) = _v +// use aligned SSE load/store for data. Alternative code with direct access +// #define LOAD_128(pp) _mm_load_si128(pp) +// #define STORE_128(pp, _v) _mm_store_si128(pp, _v) +#else +// use unaligned load/store for data: movdqu XMMWORD PTR [rdx] +#define LOAD_128(pp) _mm_loadu_si128(pp) +#define STORE_128(pp, _v) _mm_storeu_si128(pp, _v) +#endif + +AES_FUNC_START2 (AesCbc_Encode_HW) { + if (numBlocks == 0) + return; + { + __m128i *p = (__m128i *)(void *)ivAes; + __m128i *data = (__m128i *)(void *)data8; __m128i m = *p; - for (; numBlocks != 0; numBlocks--, data++) + const __m128i k0 = p[2]; + const __m128i k1 = p[3]; + const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; + do { - UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; - const __m128i *w = p + 3; - m = _mm_xor_si128(m, *data); - m = _mm_xor_si128(m, p[2]); + UInt32 r = numRounds2; + const __m128i *w = p + 4; + __m128i temp = LOAD_128(data); + MM_XOR (temp, k0) + MM_XOR (m, temp) + MM_OP_m (_mm_aesenc_si128, k1) do { - m = _mm_aesenc_si128(m, w[0]); - m = _mm_aesenc_si128(m, w[1]); + MM_OP_m (_mm_aesenc_si128, w[0]) + MM_OP_m (_mm_aesenc_si128, w[1]) w += 2; } - while (--numRounds2 != 0); - m = _mm_aesenc_si128(m, w[0]); - m = _mm_aesenclast_si128(m, w[1]); - *data = m; + while (--r); + MM_OP_m (_mm_aesenclast_si128, w[0]) + STORE_128(data, m); + data++; } + while (--numBlocks); *p = m; + } } -#define NUM_WAYS 3 -#define AES_OP_W(op, n) { \ - const __m128i t = w[n]; \ - m0 = op(m0, t); \ - m1 = op(m1, t); \ - m2 = op(m2, t); \ - } +#define WOP_1(op) +#define WOP_2(op) WOP_1 (op) op (m1, 1) +#define WOP_3(op) WOP_2 (op) op (m2, 2) +#define WOP_4(op) WOP_3 (op) op (m3, 3) +#ifdef MY_CPU_AMD64 +#define WOP_5(op) WOP_4 (op) op (m4, 4) +#define WOP_6(op) WOP_5 (op) op (m5, 5) +#define WOP_7(op) WOP_6 (op) op (m6, 6) +#define WOP_8(op) WOP_7 (op) op (m7, 7) +#endif +/* +#define WOP_9(op) WOP_8 (op) op (m8, 8); +#define WOP_10(op) WOP_9 (op) op (m9, 9); +#define WOP_11(op) WOP_10(op) op (m10, 10); +#define WOP_12(op) WOP_11(op) op (m11, 11); +#define WOP_13(op) WOP_12(op) op (m12, 12); +#define WOP_14(op) WOP_13(op) op (m13, 13); +*/ + +#ifdef MY_CPU_AMD64 + #define NUM_WAYS 8 + #define WOP_M1 WOP_8 +#else + #define NUM_WAYS 4 + #define WOP_M1 WOP_4 +#endif + +#define WOP(op) op (m0, 0) WOP_M1(op) + +#define 
DECLARE_VAR(reg, ii) __m128i reg; +#define LOAD_data_ii(ii) LOAD_128(data + (ii)) +#define LOAD_data( reg, ii) reg = LOAD_data_ii(ii); +#define STORE_data( reg, ii) STORE_128(data + (ii), reg); +#if (NUM_WAYS > 1) +#define XOR_data_M1(reg, ii) MM_XOR (reg, LOAD_128(data + (ii- 1))) +#endif + +#define MM_OP_key(op, reg) MM_OP(op, reg, key); -#define AES_DEC(n) AES_OP_W(_mm_aesdec_si128, n) -#define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n) -#define AES_ENC(n) AES_OP_W(_mm_aesenc_si128, n) -#define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n) +#define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg) +#define AES_DEC_LAST( reg, ii) MM_OP_key (_mm_aesdeclast_si128, reg) +#define AES_ENC( reg, ii) MM_OP_key (_mm_aesenc_si128, reg) +#define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg) +#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) -void MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks) +#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; +#define CTR_END( reg, ii) STORE_128(data + (ii), _mm_xor_si128(reg, \ + LOAD_128 (data + (ii)))); +#define WOP_KEY(op, n) { \ + const __m128i key = w[n]; \ + WOP(op) } + +#define WIDE_LOOP_START \ + dataEnd = data + numBlocks; \ + if (numBlocks >= NUM_WAYS) \ + { dataEnd -= NUM_WAYS; do { \ + +#define WIDE_LOOP_END \ + data += NUM_WAYS; \ + } while (data <= dataEnd); \ + dataEnd += NUM_WAYS; } \ + +#define SINGLE_LOOP \ + for (; data < dataEnd; data++) + + + +#ifdef USE_INTEL_VAES + +#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) +#define AVX_DECLARE_VAR(reg, ii) __m256i reg; + +#if 1 +// use unaligned AVX load/store for data. +// It is required for our Aes functions, that data is aligned for 16-bytes. +// But we need 32-bytes reading. +// So we use intrinsics for unaligned AVX load/store. +// notes for _mm256_storeu_si256: +// msvc2022: uses vmovdqu and keeps the order of instruction sequence. +// new gcc11 uses vmovdqu +// old gcc9 could use pair of instructions: +// vmovups %xmm7, -224(%rax) +// vextracti128 $0x1, %ymm7, -208(%rax) +#define AVX_LOAD(p) _mm256_loadu_si256((const __m256i *)(const void *)(p)) +#define AVX_STORE(p, _v) _mm256_storeu_si256((__m256i *)(void *)(p), _v); +#else +// use aligned AVX load/store for data. +// for debug: we can use this branch, if we are sure that data is aligned for 32-bytes. +// msvc2022 uses vmovdqu still +// gcc uses vmovdqa (that requires 32-bytes alignment) +#define AVX_LOAD(p) (*(const __m256i *)(const void *)(p)) +#define AVX_STORE(p, _v) (*(__m256i *)(void *)(p)) = _v; +#endif + +#define AVX_LOAD_data( reg, ii) reg = AVX_LOAD((const __m256i *)(const void *)data + (ii)); +#define AVX_STORE_data( reg, ii) AVX_STORE((__m256i *)(void *)data + (ii), reg) +/* +AVX_XOR_data_M1() needs unaligned memory load, even if (data) +is aligned for 256-bits, because we read 32-bytes chunk that +crosses (data) position: from (data - 16bytes) to (data + 16bytes). 
+*/ +#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii))) + +#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) +#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) +#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) +#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) +#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) +#define AVX_CTR_START(reg, ii) \ + MM_OP (_mm256_add_epi64, ctr2, two) \ + reg = _mm256_xor_si256(ctr2, key); + +#define AVX_CTR_END(reg, ii) \ + AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \ + AVX_LOAD ((__m256i *)(void *)data + (ii)))); + +#define AVX_WOP_KEY(op, n) { \ + const __m256i key = w[n]; \ + WOP(op) } + +#define NUM_AES_KEYS_MAX 15 + +#define WIDE_LOOP_START_AVX(OP) \ + dataEnd = data + numBlocks; \ + if (numBlocks >= NUM_WAYS * 2) \ + { __m256i keys[NUM_AES_KEYS_MAX]; \ + OP \ + { UInt32 ii; for (ii = 0; ii < numRounds; ii++) \ + keys[ii] = _mm256_broadcastsi128_si256(p[ii]); } \ + dataEnd -= NUM_WAYS * 2; \ + do { \ + +#define WIDE_LOOP_END_AVX(OP) \ + data += NUM_WAYS * 2; \ + } while (data <= dataEnd); \ + dataEnd += NUM_WAYS * 2; \ + OP \ + _mm256_zeroupper(); \ + } \ + +/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, + MSVC still can insert vzeroupper instruction. */ + +#endif + + + +AES_FUNC_START2 (AesCbc_Decode_HW) { + __m128i *p = (__m128i *)(void *)ivAes; + __m128i *data = (__m128i *)(void *)data8; __m128i iv = *p; - for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS) + const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1; + const __m128i *dataEnd; + p += 2; + + WIDE_LOOP_START { - UInt32 numRounds2 = *(const UInt32 *)(p + 1); - const __m128i *w = p + numRounds2 * 2; - __m128i m0, m1, m2; + const __m128i *w = wStart; + WOP (DECLARE_VAR) + WOP (LOAD_data) + WOP_KEY (AES_XOR, 1) + do { - const __m128i t = w[2]; - m0 = _mm_xor_si128(t, data[0]); - m1 = _mm_xor_si128(t, data[1]); - m2 = _mm_xor_si128(t, data[2]); + WOP_KEY (AES_DEC, 0) + + w--; } - numRounds2--; + while (w != p); + WOP_KEY (AES_DEC_LAST, 0) + + MM_XOR (m0, iv) + WOP_M1 (XOR_data_M1) + LOAD_data(iv, NUM_WAYS - 1) + WOP (STORE_data) + } + WIDE_LOOP_END + + SINGLE_LOOP + { + const __m128i *w = wStart - 1; + __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); + do { - AES_DEC(1) - AES_DEC(0) + MM_OP_m (_mm_aesdec_si128, w[1]) + MM_OP_m (_mm_aesdec_si128, w[0]) w -= 2; } - while (--numRounds2 != 0); - AES_DEC(1) - AES_DEC_LAST(0) + while (w != p); + MM_OP_m (_mm_aesdec_si128, w[1]) + MM_OP_m (_mm_aesdeclast_si128, w[0]) + MM_XOR (m, iv) + LOAD_data(iv, 0) + STORE_data(m, 0) + } + + p[-2] = iv; +} + + +AES_FUNC_START2 (AesCtr_Code_HW) +{ + __m128i *p = (__m128i *)(void *)ivAes; + __m128i *data = (__m128i *)(void *)data8; + __m128i ctr = *p; + const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; + const __m128i *dataEnd; + const __m128i one = _mm_cvtsi32_si128(1); + p += 2; + + WIDE_LOOP_START + { + const __m128i *w = p; + UInt32 r = numRoundsMinus2; + WOP (DECLARE_VAR) + WOP (CTR_START) + WOP_KEY (AES_XOR, 0) + w += 1; + do { - __m128i t; - t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t; - t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t; - t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t; + WOP_KEY (AES_ENC, 0) + w += 1; } + while (--r); + WOP_KEY (AES_ENC_LAST, 0) + WOP (CTR_END) } - for (; numBlocks != 0; 
numBlocks--, data++) + WIDE_LOOP_END + + SINGLE_LOOP { - UInt32 numRounds2 = *(const UInt32 *)(p + 1); - const __m128i *w = p + numRounds2 * 2; - __m128i m = _mm_xor_si128(w[2], *data); - numRounds2--; + UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1; + const __m128i *w = p; + __m128i m; + MM_OP (_mm_add_epi64, ctr, one) + m = _mm_xor_si128 (ctr, p[0]); + w += 1; do { - m = _mm_aesdec_si128(m, w[1]); - m = _mm_aesdec_si128(m, w[0]); - w -= 2; + MM_OP_m (_mm_aesenc_si128, w[0]) + MM_OP_m (_mm_aesenc_si128, w[1]) + w += 2; } - while (--numRounds2 != 0); - m = _mm_aesdec_si128(m, w[1]); - m = _mm_aesdeclast_si128(m, w[0]); + while (--numRounds2); + MM_OP_m (_mm_aesenc_si128, w[0]) + MM_OP_m (_mm_aesenclast_si128, w[1]) + CTR_END (m, 0) + } + + p[-2] = ctr; +} + + - m = _mm_xor_si128(m, iv); - iv = *data; - *data = m; +#ifdef USE_INTEL_VAES + +/* +GCC before 2013-Jun: + : + #ifdef __AVX__ + #include + #endif +GCC after 2013-Jun: + : + #include +CLANG 3.8+: +{ + : + #if !defined(_MSC_VER) || defined(__AVX__) + #include + #endif + + if (the compiler is clang for Windows and if global arch is not set for __AVX__) + [ if (defined(_MSC_VER) && !defined(__AVX__)) ] + { + doesn't include + and we have 2 ways to fix it: + 1) we can define required __AVX__ before + or + 2) we can include after } - *p = iv; } -void MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks) +If we include manually for GCC/CLANG, it's +required that must be included before . +*/ + +/* +#if defined(__clang__) && defined(_MSC_VER) +#define __AVX__ +#define __AVX2__ +#define __VAES__ +#endif +*/ + +#include +#if defined(__clang__) && defined(_MSC_VER) + #if !defined(__AVX__) + #include + #endif + #if !defined(__AVX2__) + #include + #endif + #if !defined(__VAES__) + #include + #endif +#endif // __clang__ && _MSC_VER + +#ifndef ATTRIB_VAES + #define ATTRIB_VAES +#endif + +#define VAES_FUNC_START2(name) \ +AES_FUNC_START (name); \ +ATTRIB_VAES \ +AES_FUNC_START (name) + +VAES_FUNC_START2 (AesCbc_Decode_HW_256) { - __m128i ctr = *p; - __m128i one; - one.m128i_u64[0] = 1; - one.m128i_u64[1] = 0; - for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS) + __m128i *p = (__m128i *)(void *)ivAes; + __m128i *data = (__m128i *)(void *)data8; + __m128i iv = *p; + const __m128i *dataEnd; + const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; + p += 2; + + WIDE_LOOP_START_AVX(;) { - UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; - const __m128i *w = p; - __m128i m0, m1, m2; + const __m256i *w = keys + numRounds - 2; + + WOP (AVX_DECLARE_VAR) + WOP (AVX_LOAD_data) + AVX_WOP_KEY (AVX_AES_XOR, 1) + + do { - const __m128i t = w[2]; - ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t); - ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t); - ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t); + AVX_WOP_KEY (AVX_AES_DEC, 0) + w--; } - w += 3; + while (w != keys); + AVX_WOP_KEY (AVX_AES_DEC_LAST, 0) + + AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0))) + WOP_M1 (AVX_XOR_data_M1) + LOAD_data (iv, NUM_WAYS * 2 - 1) + WOP (AVX_STORE_data) + } + WIDE_LOOP_END_AVX(;) + + SINGLE_LOOP + { + const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2; + __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); do { - AES_ENC(0) - AES_ENC(1) - w += 2; + MM_OP_m (_mm_aesdec_si128, w[1]) + MM_OP_m (_mm_aesdec_si128, w[0]) + w -= 2; + } + while (w != p); + MM_OP_m (_mm_aesdec_si128, w[1]) + MM_OP_m (_mm_aesdeclast_si128, w[0]) + + MM_XOR (m, iv) + LOAD_data(iv, 0) + STORE_data(m, 0) + } 
+ + p[-2] = iv; +} + + +/* +SSE2: _mm_cvtsi32_si128 : movd +AVX: _mm256_setr_m128i : vinsertf128 +AVX2: _mm256_add_epi64 : vpaddq ymm, ymm, ymm + _mm256_extracti128_si256 : vextracti128 + _mm256_broadcastsi128_si256 : vbroadcasti128 +*/ + +#define AVX_CTR_LOOP_START \ + ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \ + two = _mm256_setr_m128i(one, one); \ + two = _mm256_add_epi64(two, two); \ + +// two = _mm256_setr_epi64x(2, 0, 2, 0); + +#define AVX_CTR_LOOP_ENC \ + ctr = _mm256_extracti128_si256 (ctr2, 1); \ + +VAES_FUNC_START2 (AesCtr_Code_HW_256) +{ + __m128i *p = (__m128i *)(void *)ivAes; + __m128i *data = (__m128i *)(void *)data8; + __m128i ctr = *p; + const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; + const __m128i *dataEnd; + const __m128i one = _mm_cvtsi32_si128(1); + __m256i ctr2, two; + p += 2; + + WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START) + { + const __m256i *w = keys; + UInt32 r = numRounds - 2; + WOP (AVX_DECLARE_VAR) + AVX_WOP_KEY (AVX_CTR_START, 0) + + w += 1; + do + { + AVX_WOP_KEY (AVX_AES_ENC, 0) + w += 1; } - while (--numRounds2 != 0); - AES_ENC(0) - AES_ENC_LAST(1) - data[0] = _mm_xor_si128(data[0], m0); - data[1] = _mm_xor_si128(data[1], m1); - data[2] = _mm_xor_si128(data[2], m2); + while (--r); + AVX_WOP_KEY (AVX_AES_ENC_LAST, 0) + + WOP (AVX_CTR_END) } - for (; numBlocks != 0; numBlocks--, data++) + WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC) + + SINGLE_LOOP { - UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; + UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1; const __m128i *w = p; __m128i m; - ctr = _mm_add_epi64(ctr, one); - m = _mm_xor_si128(ctr, p[2]); - w += 3; + MM_OP (_mm_add_epi64, ctr, one) + m = _mm_xor_si128 (ctr, p[0]); + w += 1; do { - m = _mm_aesenc_si128(m, w[0]); - m = _mm_aesenc_si128(m, w[1]); + MM_OP_m (_mm_aesenc_si128, w[0]) + MM_OP_m (_mm_aesenc_si128, w[1]) w += 2; } - while (--numRounds2 != 0); - m = _mm_aesenc_si128(m, w[0]); - m = _mm_aesenclast_si128(m, w[1]); - *data = _mm_xor_si128(*data, m); + while (--numRounds2); + MM_OP_m (_mm_aesenc_si128, w[0]) + MM_OP_m (_mm_aesenclast_si128, w[1]) + CTR_END (m, 0) } - *p = ctr; + + p[-2] = ctr; } +#endif // USE_INTEL_VAES + +#else // USE_INTEL_AES + +/* no USE_INTEL_AES */ + +#if defined(Z7_USE_AES_HW_STUB) +// We can compile this file with another C compiler, +// or we can compile asm version. +// So we can generate real code instead of this stub function. +// #if defined(_MSC_VER) +#pragma message("AES HW_SW stub was used") +// #endif + +#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB) +#define AES_TYPE_keys UInt32 +#define AES_TYPE_data Byte +#endif + +#define AES_FUNC_START(name) \ + void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \ + +#define AES_COMPAT_STUB(name) \ + AES_FUNC_START(name); \ + AES_FUNC_START(name ## _HW) \ + { name(p, data, numBlocks); } + +AES_COMPAT_STUB (AesCbc_Encode) +AES_COMPAT_STUB (AesCbc_Decode) +AES_COMPAT_STUB (AesCtr_Code) +#endif // Z7_USE_AES_HW_STUB + +#endif // USE_INTEL_AES + + +#ifndef USE_INTEL_VAES +#if defined(Z7_USE_VAES_HW_STUB) +// #if defined(_MSC_VER) +#pragma message("VAES HW_SW stub was used") +// #endif + +#define VAES_COMPAT_STUB(name) \ + void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ + void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \ + { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); } + +VAES_COMPAT_STUB (AesCbc_Decode_HW) +VAES_COMPAT_STUB (AesCtr_Code_HW) +#endif +#endif // ! 
USE_INTEL_VAES + + + + +#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) + + #if defined(__ARM_FEATURE_AES) \ + || defined(__ARM_FEATURE_CRYPTO) + #define USE_HW_AES + #else + #if defined(MY_CPU_ARM64) \ + || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ + || defined(Z7_MSC_VER_ORIGINAL) + #if defined(__ARM_FP) && \ + ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) \ + ) \ + || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) + #if defined(MY_CPU_ARM64) \ + || !defined(Z7_CLANG_VERSION) \ + || defined(__ARM_NEON) && \ + (Z7_CLANG_VERSION < 170000 || \ + Z7_CLANG_VERSION > 170001) + #define USE_HW_AES + #endif + #endif + #endif + #endif + +#ifdef USE_HW_AES + +// #pragma message("=== AES HW === ") +// __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_AES + +#if defined(__clang__) || defined(__GNUC__) +#if !defined(__ARM_FEATURE_AES) && \ + !defined(__ARM_FEATURE_CRYPTO) + #ifdef MY_CPU_ARM64 +#if defined(__clang__) + #define ATTRIB_AES __attribute__((__target__("crypto"))) #else + #define ATTRIB_AES __attribute__((__target__("+crypto"))) +#endif + #else +#if defined(__clang__) + #define ATTRIB_AES __attribute__((__target__("armv8-a,aes"))) +#else + #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) +#endif + #endif +#endif +#else + // _MSC_VER + // for arm32 + #define _ARM_USE_NEW_NEON_INTRINSICS +#endif + +#ifndef ATTRIB_AES + #define ATTRIB_AES +#endif + +#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) +#include +#else +/* + clang-17.0.1: error : Cannot select: intrinsic %llvm.arm.neon.aese + clang + 3.8.1 : __ARM_NEON : defined(__ARM_FEATURE_CRYPTO) + 7.0.1 : __ARM_NEON : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO) + 11.?.0 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO) + 13.0.1 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES) + 16 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 +*/ +#if defined(__clang__) && __clang_major__ < 16 +#if !defined(__ARM_FEATURE_AES) && \ + !defined(__ARM_FEATURE_CRYPTO) +// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ") + Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1 +// #if defined(__clang__) && __clang_major__ < 13 + #define __ARM_FEATURE_CRYPTO 1 +// #else + #define __ARM_FEATURE_AES 1 +// #endif + Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif +#endif // clang + +#if defined(__clang__) + +#if defined(__ARM_ARCH) && __ARM_ARCH < 8 + Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +// #pragma message("#define __ARM_ARCH 8") + #undef __ARM_ARCH + #define __ARM_ARCH 8 + Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif + +#endif // clang + +#include + +#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \ + defined(__ARM_FEATURE_CRYPTO) && \ + defined(__ARM_FEATURE_AES) +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #undef __ARM_FEATURE_CRYPTO + #undef __ARM_FEATURE_AES + #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") +#endif + +#endif // Z7_MSC_VER_ORIGINAL + +typedef uint8x16_t v128; -void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks); -void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks); +#define AES_FUNC_START(name) \ + void Z7_FASTCALL 
name(UInt32 *ivAes, Byte *data8, size_t numBlocks) + // void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks) -void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks) +#define AES_FUNC_START2(name) \ +AES_FUNC_START (name); \ +ATTRIB_AES \ +AES_FUNC_START (name) + +#define MM_OP(op, dest, src) dest = op(dest, src); +#define MM_OP_m(op, src) MM_OP(op, m, src) +#define MM_OP1_m(op) m = op(m); + +#define MM_XOR( dest, src) MM_OP(veorq_u8, dest, src) +#define MM_XOR_m( src) MM_XOR(m, src) + +#define AES_E_m(k) MM_OP_m (vaeseq_u8, k) +#define AES_E_MC_m(k) AES_E_m (k) MM_OP1_m(vaesmcq_u8) + + +AES_FUNC_START2 (AesCbc_Encode_HW) { - AesCbc_Encode(p, data, numBlocks); + if (numBlocks == 0) + return; + { + v128 * const p = (v128 *)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; + v128 m = *p; + const UInt32 numRounds2 = *(const UInt32 *)(p + 1); + const v128 *w = p + (size_t)numRounds2 * 2; + const v128 k0 = p[2]; + const v128 k1 = p[3]; + const v128 k2 = p[4]; + const v128 k3 = p[5]; + const v128 k4 = p[6]; + const v128 k5 = p[7]; + const v128 k6 = p[8]; + const v128 k7 = p[9]; + const v128 k8 = p[10]; + const v128 k9 = p[11]; + const v128 k_z4 = w[-2]; + const v128 k_z3 = w[-1]; + const v128 k_z2 = w[0]; + const v128 k_z1 = w[1]; + const v128 k_z0 = w[2]; + // we don't use optimization veorq_u8(*data, k_z0) that can reduce one cycle, + // because gcc/clang compilers are not good for that optimization. + do + { + MM_XOR_m (*data) + AES_E_MC_m (k0) + AES_E_MC_m (k1) + AES_E_MC_m (k2) + AES_E_MC_m (k3) + AES_E_MC_m (k4) + AES_E_MC_m (k5) + if (numRounds2 >= 6) + { + AES_E_MC_m (k6) + AES_E_MC_m (k7) + if (numRounds2 != 6) + { + AES_E_MC_m (k8) + AES_E_MC_m (k9) + } + } + AES_E_MC_m (k_z4) + AES_E_MC_m (k_z3) + AES_E_MC_m (k_z2) + AES_E_m (k_z1) + MM_XOR_m (k_z0) + *data++ = m; + } + while (--numBlocks); + *p = m; + } } -void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks) + +#define WOP_1(op) +#define WOP_2(op) WOP_1 (op) op (m1, 1) +#define WOP_3(op) WOP_2 (op) op (m2, 2) +#define WOP_4(op) WOP_3 (op) op (m3, 3) +#define WOP_5(op) WOP_4 (op) op (m4, 4) +#define WOP_6(op) WOP_5 (op) op (m5, 5) +#define WOP_7(op) WOP_6 (op) op (m6, 6) +#define WOP_8(op) WOP_7 (op) op (m7, 7) + + #define NUM_WAYS 8 + #define WOP_M1 WOP_8 + +#define WOP(op) op (m0, 0) WOP_M1(op) + +#define DECLARE_VAR(reg, ii) v128 reg; +#define LOAD_data( reg, ii) reg = data[ii]; +#define STORE_data( reg, ii) data[ii] = reg; +#if (NUM_WAYS > 1) +#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) +#endif + +#define MM_OP_key(op, reg) MM_OP (op, reg, key) + +#define AES_D_m(k) MM_OP_m (vaesdq_u8, k) +#define AES_D_IMC_m(k) AES_D_m (k) MM_OP1_m (vaesimcq_u8) + +#define AES_XOR( reg, ii) MM_OP_key (veorq_u8, reg) +#define AES_D( reg, ii) MM_OP_key (vaesdq_u8, reg) +#define AES_E( reg, ii) MM_OP_key (vaeseq_u8, reg) + +#define AES_D_IMC( reg, ii) AES_D (reg, ii) reg = vaesimcq_u8(reg); +#define AES_E_MC( reg, ii) AES_E (reg, ii) reg = vaesmcq_u8(reg); + +#define CTR_START(reg, ii) MM_OP (vaddq_u64, ctr, one) reg = vreinterpretq_u8_u64(ctr); +#define CTR_END( reg, ii) MM_XOR (data[ii], reg) + +#define WOP_KEY(op, n) { \ + const v128 key = w[n]; \ + WOP(op) } + +#define WIDE_LOOP_START \ + dataEnd = data + numBlocks; \ + if (numBlocks >= NUM_WAYS) \ + { dataEnd -= NUM_WAYS; do { \ + +#define WIDE_LOOP_END \ + data += NUM_WAYS; \ + } while (data <= dataEnd); \ + dataEnd += NUM_WAYS; } \ + +#define SINGLE_LOOP \ + for (; data < dataEnd; data++) + + +AES_FUNC_START2 
(AesCbc_Decode_HW) { - AesCbc_Decode(p, data, numBlocks); + v128 *p = (v128 *)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; + v128 iv = *p; + const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2; + const v128 *dataEnd; + p += 2; + + WIDE_LOOP_START + { + const v128 *w = wStart; + WOP (DECLARE_VAR) + WOP (LOAD_data) + WOP_KEY (AES_D_IMC, 2) + do + { + WOP_KEY (AES_D_IMC, 1) + WOP_KEY (AES_D_IMC, 0) + w -= 2; + } + while (w != p); + WOP_KEY (AES_D, 1) + WOP_KEY (AES_XOR, 0) + MM_XOR (m0, iv) + WOP_M1 (XOR_data_M1) + LOAD_data(iv, NUM_WAYS - 1) + WOP (STORE_data) + } + WIDE_LOOP_END + + SINGLE_LOOP + { + const v128 *w = wStart; + v128 m; LOAD_data(m, 0) + AES_D_IMC_m (w[2]) + do + { + AES_D_IMC_m (w[1]) + AES_D_IMC_m (w[0]) + w -= 2; + } + while (w != p); + AES_D_m (w[1]) + MM_XOR_m (w[0]) + MM_XOR_m (iv) + LOAD_data(iv, 0) + STORE_data(m, 0) + } + + p[-2] = iv; } -void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks) + +AES_FUNC_START2 (AesCtr_Code_HW) { - AesCtr_Code(p, data, numBlocks); + v128 *p = (v128 *)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; + uint64x2_t ctr = vreinterpretq_u64_u8(*p); + const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2; + const v128 *dataEnd; +// the bug in clang: +// __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); +#if defined(__clang__) && (__clang_major__ <= 9) +#pragma GCC diagnostic ignored "-Wvector-conversion" +#endif + const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0); + p += 2; + + WIDE_LOOP_START + { + const v128 *w = p; + WOP (DECLARE_VAR) + WOP (CTR_START) + do + { + WOP_KEY (AES_E_MC, 0) + WOP_KEY (AES_E_MC, 1) + w += 2; + } + while (w != wEnd); + WOP_KEY (AES_E_MC, 0) + WOP_KEY (AES_E, 1) + WOP_KEY (AES_XOR, 2) + WOP (CTR_END) + } + WIDE_LOOP_END + + SINGLE_LOOP + { + const v128 *w = p; + v128 m; + CTR_START (m, 0) + do + { + AES_E_MC_m (w[0]) + AES_E_MC_m (w[1]) + w += 2; + } + while (w != wEnd); + AES_E_MC_m (w[0]) + AES_E_m (w[1]) + MM_XOR_m (w[2]) + CTR_END (m, 0) + } + + p[-2] = vreinterpretq_u8_u64(ctr); } -#endif +#endif // USE_HW_AES + +#endif // MY_CPU_ARM_OR_ARM64 + +#undef NUM_WAYS +#undef WOP_M1 +#undef WOP +#undef DECLARE_VAR +#undef LOAD_data +#undef STORE_data +#undef USE_INTEL_AES +#undef USE_HW_AES diff --git a/src/sdk/C/Alloc.c b/src/sdk/C/Alloc.c index bcede4b..63e1a12 100644 --- a/src/sdk/C/Alloc.c +++ b/src/sdk/C/Alloc.c @@ -1,38 +1,53 @@ /* Alloc.c -- Memory allocation functions -2018-04-27 : Igor Pavlov : Public domain */ +2024-02-18 : Igor Pavlov : Public domain */ #include "Precomp.h" -#include - #ifdef _WIN32 -#include +#include "7zWindows.h" #endif #include #include "Alloc.h" -/* #define _SZ_ALLOC_DEBUG */ +#if defined(Z7_LARGE_PAGES) && defined(_WIN32) && \ + (!defined(Z7_WIN32_WINNT_MIN) || Z7_WIN32_WINNT_MIN < 0x0502) // < Win2003 (xp-64) + #define Z7_USE_DYN_GetLargePageMinimum +#endif -/* use _SZ_ALLOC_DEBUG to debug alloc/free operations */ -#ifdef _SZ_ALLOC_DEBUG +// for debug: +#if 0 +#if defined(__CHERI__) && defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) +// #pragma message("=== Z7_ALLOC_NO_OFFSET_ALLOCATOR === ") +#define Z7_ALLOC_NO_OFFSET_ALLOCATOR +#endif +#endif +// #define SZ_ALLOC_DEBUG +/* #define SZ_ALLOC_DEBUG */ + +/* use SZ_ALLOC_DEBUG to debug alloc/free operations */ +#ifdef SZ_ALLOC_DEBUG + +#include #include -int g_allocCount = 0; -int g_allocCountMid = 0; -int g_allocCountBig = 0; +static int g_allocCount = 0; +#ifdef _WIN32 +static int g_allocCountMid = 0; +static int g_allocCountBig = 0; 
+#endif #define CONVERT_INT_TO_STR(charType, tempSize) \ - unsigned char temp[tempSize]; unsigned i = 0; \ - while (val >= 10) { temp[i++] = (unsigned char)('0' + (unsigned)(val % 10)); val /= 10; } \ + char temp[tempSize]; unsigned i = 0; \ + while (val >= 10) { temp[i++] = (char)('0' + (unsigned)(val % 10)); val /= 10; } \ *s++ = (charType)('0' + (unsigned)val); \ while (i != 0) { i--; *s++ = temp[i]; } \ *s = 0; static void ConvertUInt64ToString(UInt64 val, char *s) { - CONVERT_INT_TO_STR(char, 24); + CONVERT_INT_TO_STR(char, 24) } #define GET_HEX_CHAR(t) ((char)(((t < 10) ? ('0' + t) : ('A' + (t - 10))))) @@ -77,7 +92,7 @@ static void PrintAligned(const char *s, size_t align) Print(s); } -static void PrintLn() +static void PrintLn(void) { Print("\n"); } @@ -89,10 +104,10 @@ static void PrintHex(UInt64 v, size_t align) PrintAligned(s, align); } -static void PrintDec(UInt64 v, size_t align) +static void PrintDec(int v, size_t align) { char s[32]; - ConvertUInt64ToString(v, s); + ConvertUInt64ToString((unsigned)v, s); PrintAligned(s, align); } @@ -102,12 +117,19 @@ static void PrintAddr(void *p) } -#define PRINT_ALLOC(name, cnt, size, ptr) \ +#define PRINT_REALLOC(name, cnt, size, ptr) { \ + Print(name " "); \ + if (!ptr) PrintDec(cnt++, 10); \ + PrintHex(size, 10); \ + PrintAddr(ptr); \ + PrintLn(); } + +#define PRINT_ALLOC(name, cnt, size, ptr) { \ Print(name " "); \ PrintDec(cnt++, 10); \ PrintHex(size, 10); \ PrintAddr(ptr); \ - PrintLn(); + PrintLn(); } #define PRINT_FREE(name, cnt, ptr) if (ptr) { \ Print(name " "); \ @@ -117,26 +139,45 @@ static void PrintAddr(void *p) #else +#ifdef _WIN32 #define PRINT_ALLOC(name, cnt, size, ptr) +#endif #define PRINT_FREE(name, cnt, ptr) #define Print(s) #define PrintLn() +#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR #define PrintHex(v, align) -#define PrintDec(v, align) +#endif #define PrintAddr(p) #endif +/* +by specification: + malloc(non_NULL, 0) : returns NULL or a unique pointer value that can later be successfully passed to free() + realloc(NULL, size) : the call is equivalent to malloc(size) + realloc(non_NULL, 0) : the call is equivalent to free(ptr) + +in main compilers: + malloc(0) : returns non_NULL + realloc(NULL, 0) : returns non_NULL + realloc(non_NULL, 0) : returns NULL +*/ + void *MyAlloc(size_t size) { if (size == 0) return NULL; - #ifdef _SZ_ALLOC_DEBUG + // PRINT_ALLOC("Alloc ", g_allocCount, size, NULL) + #ifdef SZ_ALLOC_DEBUG { void *p = malloc(size); - PRINT_ALLOC("Alloc ", g_allocCount, size, p); + if (p) + { + PRINT_ALLOC("Alloc ", g_allocCount, size, p) + } return p; } #else @@ -146,65 +187,107 @@ void *MyAlloc(size_t size) void MyFree(void *address) { - PRINT_FREE("Free ", g_allocCount, address); + PRINT_FREE("Free ", g_allocCount, address) free(address); } +void *MyRealloc(void *address, size_t size) +{ + if (size == 0) + { + MyFree(address); + return NULL; + } + // PRINT_REALLOC("Realloc ", g_allocCount, size, address) + #ifdef SZ_ALLOC_DEBUG + { + void *p = realloc(address, size); + if (p) + { + PRINT_REALLOC("Realloc ", g_allocCount, size, address) + } + return p; + } + #else + return realloc(address, size); + #endif +} + + #ifdef _WIN32 void *MidAlloc(size_t size) { if (size == 0) return NULL; - - PRINT_ALLOC("Alloc-Mid", g_allocCountMid, size, NULL); - + #ifdef SZ_ALLOC_DEBUG + { + void *p = VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE); + if (p) + { + PRINT_ALLOC("Alloc-Mid", g_allocCountMid, size, p) + } + return p; + } + #else return VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE); + #endif } void 
MidFree(void *address) { - PRINT_FREE("Free-Mid", g_allocCountMid, address); + PRINT_FREE("Free-Mid", g_allocCountMid, address) if (!address) return; VirtualFree(address, 0, MEM_RELEASE); } -#ifndef MEM_LARGE_PAGES -#undef _7ZIP_LARGE_PAGES +#ifdef Z7_LARGE_PAGES + +#ifdef MEM_LARGE_PAGES + #define MY_MEM_LARGE_PAGES MEM_LARGE_PAGES +#else + #define MY_MEM_LARGE_PAGES 0x20000000 #endif -#ifdef _7ZIP_LARGE_PAGES +extern +SIZE_T g_LargePageSize; SIZE_T g_LargePageSize = 0; -typedef SIZE_T (WINAPI *GetLargePageMinimumP)(); -#endif +typedef SIZE_T (WINAPI *Func_GetLargePageMinimum)(VOID); -void SetLargePageSize() +void SetLargePageSize(void) { - #ifdef _7ZIP_LARGE_PAGES SIZE_T size; - GetLargePageMinimumP largePageMinimum = (GetLargePageMinimumP) - GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "GetLargePageMinimum"); - if (!largePageMinimum) +#ifdef Z7_USE_DYN_GetLargePageMinimum +Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION + + const + Func_GetLargePageMinimum fn = + (Func_GetLargePageMinimum) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "GetLargePageMinimum"); + if (!fn) return; - size = largePageMinimum(); + size = fn(); +#else + size = GetLargePageMinimum(); +#endif if (size == 0 || (size & (size - 1)) != 0) return; g_LargePageSize = size; - #endif } +#endif // Z7_LARGE_PAGES void *BigAlloc(size_t size) { if (size == 0) return NULL; - PRINT_ALLOC("Alloc-Big", g_allocCountBig, size, NULL); - - #ifdef _7ZIP_LARGE_PAGES + PRINT_ALLOC("Alloc-Big", g_allocCountBig, size, NULL) + + #ifdef Z7_LARGE_PAGES { SIZE_T ps = g_LargePageSize; if (ps != 0 && ps <= (1 << 30) && size > (ps / 2)) @@ -214,56 +297,43 @@ void *BigAlloc(size_t size) size2 = (size + ps) & ~ps; if (size2 >= size) { - void *res = VirtualAlloc(NULL, size2, MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE); - if (res) - return res; + void *p = VirtualAlloc(NULL, size2, MEM_COMMIT | MY_MEM_LARGE_PAGES, PAGE_READWRITE); + if (p) + { + PRINT_ALLOC("Alloc-BM ", g_allocCountMid, size2, p) + return p; + } } } } #endif - return VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE); + return MidAlloc(size); } void BigFree(void *address) { - PRINT_FREE("Free-Big", g_allocCountBig, address); - - if (!address) - return; - VirtualFree(address, 0, MEM_RELEASE); + PRINT_FREE("Free-Big", g_allocCountBig, address) + MidFree(address); } -#endif +#endif // _WIN32 -static void *SzAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return MyAlloc(size); } -static void SzFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); MyFree(address); } +static void *SzAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p) return MyAlloc(size); } +static void SzFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p) MyFree(address); } const ISzAlloc g_Alloc = { SzAlloc, SzFree }; -static void *SzMidAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return MidAlloc(size); } -static void SzMidFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); MidFree(address); } +#ifdef _WIN32 +static void *SzMidAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p) return MidAlloc(size); } +static void SzMidFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p) MidFree(address); } +static void *SzBigAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p) return BigAlloc(size); } +static void SzBigFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p) BigFree(address); } const ISzAlloc g_MidAlloc = { SzMidAlloc, SzMidFree }; - -static void *SzBigAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return BigAlloc(size); } -static void SzBigFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); 
BigFree(address); } const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; - - -/* - uintptr_t : C99 (optional) - : unsupported in VS6 -*/ - -#ifdef _WIN32 - typedef UINT_PTR UIntPtr; -#else - /* - typedef uintptr_t UIntPtr; - */ - typedef ptrdiff_t UIntPtr; #endif +#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR #define ADJUST_ALLOC_SIZE 0 /* @@ -274,19 +344,43 @@ const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; MyAlloc() can return address that is NOT multiple of sizeof(void *). */ - /* -#define MY_ALIGN_PTR_DOWN(p, align) ((void *)((char *)(p) - ((size_t)(UIntPtr)(p) & ((align) - 1)))) + uintptr_t : C99 (optional) + : unsupported in VS6 */ -#define MY_ALIGN_PTR_DOWN(p, align) ((void *)((((UIntPtr)(p)) & ~((UIntPtr)(align) - 1)))) - -#define MY_ALIGN_PTR_UP_PLUS(p, align) MY_ALIGN_PTR_DOWN(((char *)(p) + (align) + ADJUST_ALLOC_SIZE), align) +typedef + #ifdef _WIN32 + UINT_PTR + #elif 1 + uintptr_t + #else + ptrdiff_t + #endif + MY_uintptr_t; + +#if 0 \ + || (defined(__CHERI__) \ + || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ > 8)) +// for 128-bit pointers (cheri): +#define MY_ALIGN_PTR_DOWN(p, align) \ + ((void *)((char *)(p) - ((size_t)(MY_uintptr_t)(p) & ((align) - 1)))) +#else +#define MY_ALIGN_PTR_DOWN(p, align) \ + ((void *)((((MY_uintptr_t)(p)) & ~((MY_uintptr_t)(align) - 1)))) +#endif +#endif -#if (_POSIX_C_SOURCE >= 200112L) && !defined(_WIN32) +#if !defined(_WIN32) \ + && (defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) \ + || defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L)) #define USE_posix_memalign #endif +#ifndef USE_posix_memalign +#define MY_ALIGN_PTR_UP_PLUS(p, align) MY_ALIGN_PTR_DOWN(((char *)(p) + (align) + ADJUST_ALLOC_SIZE), align) +#endif + /* This posix_memalign() is for test purposes only. We also need special Free() function instead of free(), @@ -319,14 +413,13 @@ static int posix_memalign(void **ptr, size_t align, size_t size) #define ALLOC_ALIGN_SIZE ((size_t)1 << 7) -static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) +void *z7_AlignedAlloc(size_t size) { - #ifndef USE_posix_memalign +#ifndef USE_posix_memalign void *p; void *pAligned; size_t newSize; - UNUSED_VAR(pp); /* also we can allocate additional dummy ALLOC_ALIGN_SIZE bytes after aligned block to prevent cache line sharing with another allocated blocks */ @@ -351,10 +444,9 @@ static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) return pAligned; - #else +#else void *p; - UNUSED_VAR(pp); if (posix_memalign(&p, ALLOC_ALIGN_SIZE, size)) return NULL; @@ -363,19 +455,37 @@ static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) return p; - #endif +#endif +} + + +void z7_AlignedFree(void *address) +{ +#ifndef USE_posix_memalign + if (address) + MyFree(((void **)address)[-1]); +#else + free(address); +#endif +} + + +static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) +{ + UNUSED_VAR(pp) + return z7_AlignedAlloc(size); } static void SzAlignedFree(ISzAllocPtr pp, void *address) { - UNUSED_VAR(pp); - #ifndef USE_posix_memalign + UNUSED_VAR(pp) +#ifndef USE_posix_memalign if (address) MyFree(((void **)address)[-1]); - #else +#else free(address); - #endif +#endif } @@ -383,17 +493,45 @@ const ISzAlloc g_AlignedAlloc = { SzAlignedAlloc, SzAlignedFree }; -#define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *)) - /* we align ptr to support cases where CAlignOffsetAlloc::offset is not multiply of sizeof(void *) */ -#define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1] -/* -#define REAL_BLOCK_PTR_VAR(p) ((void **)(p))[-1] -*/ +#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR +#if 1 + 
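
The aligned allocator above (z7_AlignedAlloc / z7_AlignedFree on the path without posix_memalign) over-allocates, rounds the pointer up, and stores the original malloc() result just below the aligned block so the free path can recover it. A self-contained sketch of that scheme, using a fixed 128-byte alignment to match ALLOC_ALIGN_SIZE; the names below are illustrative, not the SDK's:

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define EXAMPLE_ALIGN 128u

    static void *example_aligned_alloc(size_t size)
    {
        /* room for the alignment slack plus one stored base pointer */
        void *base = malloc(size + EXAMPLE_ALIGN + sizeof(void *));
        uintptr_t raw;
        void *aligned;
        if (!base)
            return NULL;
        raw = (uintptr_t)base + sizeof(void *);
        aligned = (void *)((raw + EXAMPLE_ALIGN - 1) & ~(uintptr_t)(EXAMPLE_ALIGN - 1));
        ((void **)aligned)[-1] = base;  /* remember where the block really starts */
        return aligned;
    }

    static void example_aligned_free(void *aligned)
    {
        if (aligned)
            free(((void **)aligned)[-1]);
    }

    int main(void)
    {
        void *p = example_aligned_alloc(1000);
        assert(((uintptr_t)p & (EXAMPLE_ALIGN - 1)) == 0);
        example_aligned_free(p);
        return 0;
    }
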
#define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *)) + #define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1] +#else + // we can use this simplified code, + // if (CAlignOffsetAlloc::offset == (k * sizeof(void *)) + #define REAL_BLOCK_PTR_VAR(p) (((void **)(p))[-1]) +#endif +#endif + + +#if 0 +#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR +#include +static void PrintPtr(const char *s, const void *p) +{ + const Byte *p2 = (const Byte *)&p; + unsigned i; + printf("%s %p ", s, p); + for (i = sizeof(p); i != 0;) + { + i--; + printf("%02x", p2[i]); + } + printf("\n"); +} +#endif +#endif + static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) { - CAlignOffsetAlloc *p = CONTAINER_FROM_VTBL(pp, CAlignOffsetAlloc, vt); +#if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) + UNUSED_VAR(pp) + return z7_AlignedAlloc(size); +#else + const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); void *adr; void *pAligned; size_t newSize; @@ -421,6 +559,12 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) pAligned = (char *)MY_ALIGN_PTR_DOWN((char *)adr + alignSize - p->offset + extra + ADJUST_ALLOC_SIZE, alignSize) + p->offset; +#if 0 + printf("\nalignSize = %6x, offset=%6x, size=%8x \n", (unsigned)alignSize, (unsigned)p->offset, (unsigned)size); + PrintPtr("base", adr); + PrintPtr("alig", pAligned); +#endif + PrintLn(); Print("- Aligned: "); Print(" size="); PrintHex(size, 8); @@ -432,19 +576,25 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) REAL_BLOCK_PTR_VAR(pAligned) = adr; return pAligned; +#endif } static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address) { +#if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) + UNUSED_VAR(pp) + z7_AlignedFree(address); +#else if (address) { - CAlignOffsetAlloc *p = CONTAINER_FROM_VTBL(pp, CAlignOffsetAlloc, vt); + const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); PrintLn(); Print("- Aligned Free: "); PrintLn(); ISzAlloc_Free(p->baseAlloc, REAL_BLOCK_PTR_VAR(address)); } +#endif } diff --git a/src/sdk/C/Alloc.h b/src/sdk/C/Alloc.h index 6482376..01bf6b7 100644 --- a/src/sdk/C/Alloc.h +++ b/src/sdk/C/Alloc.h @@ -1,37 +1,62 @@ /* Alloc.h -- Memory allocation functions -2018-02-19 : Igor Pavlov : Public domain */ +2024-01-22 : Igor Pavlov : Public domain */ -#ifndef __COMMON_ALLOC_H -#define __COMMON_ALLOC_H +#ifndef ZIP7_INC_ALLOC_H +#define ZIP7_INC_ALLOC_H #include "7zTypes.h" EXTERN_C_BEGIN +/* + MyFree(NULL) : is allowed, as free(NULL) + MyAlloc(0) : returns NULL : but malloc(0) is allowed to return NULL or non_NULL + MyRealloc(NULL, 0) : returns NULL : but realloc(NULL, 0) is allowed to return NULL or non_NULL +MyRealloc() is similar to realloc() for the following cases: + MyRealloc(non_NULL, 0) : returns NULL and always calls MyFree(ptr) + MyRealloc(NULL, non_ZERO) : returns NULL, if allocation failed + MyRealloc(non_NULL, non_ZERO) : returns NULL, if reallocation failed +*/ + void *MyAlloc(size_t size); void MyFree(void *address); +void *MyRealloc(void *address, size_t size); + +void *z7_AlignedAlloc(size_t size); +void z7_AlignedFree(void *p); #ifdef _WIN32 -void SetLargePageSize(); +#ifdef Z7_LARGE_PAGES +void SetLargePageSize(void); +#endif void *MidAlloc(size_t size); void MidFree(void *address); void *BigAlloc(size_t size); void BigFree(void *address); +/* #define Z7_BIG_ALLOC_IS_ZERO_FILLED */ + #else -#define MidAlloc(size) MyAlloc(size) -#define MidFree(address) MyFree(address) -#define BigAlloc(size) MyAlloc(size) -#define BigFree(address) 
MyFree(address) +#define MidAlloc(size) z7_AlignedAlloc(size) +#define MidFree(address) z7_AlignedFree(address) +#define BigAlloc(size) z7_AlignedAlloc(size) +#define BigFree(address) z7_AlignedFree(address) #endif extern const ISzAlloc g_Alloc; + +#ifdef _WIN32 extern const ISzAlloc g_BigAlloc; extern const ISzAlloc g_MidAlloc; +#else +#define g_BigAlloc g_AlignedAlloc +#define g_MidAlloc g_AlignedAlloc +#endif + extern const ISzAlloc g_AlignedAlloc; diff --git a/src/sdk/C/Bcj2.c b/src/sdk/C/Bcj2.c index 9a0046a..7cb57ad 100644 --- a/src/sdk/C/Bcj2.c +++ b/src/sdk/C/Bcj2.c @@ -1,29 +1,24 @@ /* Bcj2.c -- BCJ2 Decoder (Converter for x86 code) -2018-04-28 : Igor Pavlov : Public domain */ +2023-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" #include "Bcj2.h" #include "CpuArch.h" -#define CProb UInt16 - #define kTopValue ((UInt32)1 << 24) -#define kNumModelBits 11 -#define kBitModelTotal (1 << kNumModelBits) +#define kNumBitModelTotalBits 11 +#define kBitModelTotal (1 << kNumBitModelTotalBits) #define kNumMoveBits 5 -#define _IF_BIT_0 ttt = *prob; bound = (p->range >> kNumModelBits) * ttt; if (p->code < bound) -#define _UPDATE_0 p->range = bound; *prob = (CProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); -#define _UPDATE_1 p->range -= bound; p->code -= bound; *prob = (CProb)(ttt - (ttt >> kNumMoveBits)); +// UInt32 bcj2_stats[256 + 2][2]; void Bcj2Dec_Init(CBcj2Dec *p) { unsigned i; - - p->state = BCJ2_DEC_STATE_OK; + p->state = BCJ2_STREAM_RC; // BCJ2_DEC_STATE_OK; p->ip = 0; - p->temp[3] = 0; + p->temp = 0; p->range = 0; p->code = 0; for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++) @@ -32,217 +27,248 @@ void Bcj2Dec_Init(CBcj2Dec *p) SRes Bcj2Dec_Decode(CBcj2Dec *p) { + UInt32 v = p->temp; + // const Byte *src; if (p->range <= 5) { - p->state = BCJ2_DEC_STATE_OK; + UInt32 code = p->code; + p->state = BCJ2_DEC_STATE_ERROR; /* for case if we return SZ_ERROR_DATA; */ for (; p->range != 5; p->range++) { - if (p->range == 1 && p->code != 0) + if (p->range == 1 && code != 0) return SZ_ERROR_DATA; - if (p->bufs[BCJ2_STREAM_RC] == p->lims[BCJ2_STREAM_RC]) { p->state = BCJ2_STREAM_RC; return SZ_OK; } - - p->code = (p->code << 8) | *(p->bufs[BCJ2_STREAM_RC])++; + code = (code << 8) | *(p->bufs[BCJ2_STREAM_RC])++; + p->code = code; } - - if (p->code == 0xFFFFFFFF) + if (code == 0xffffffff) return SZ_ERROR_DATA; - - p->range = 0xFFFFFFFF; + p->range = 0xffffffff; } - else if (p->state >= BCJ2_DEC_STATE_ORIG_0) + // else { - while (p->state <= BCJ2_DEC_STATE_ORIG_3) + unsigned state = p->state; + // we check BCJ2_IS_32BIT_STREAM() here instead of check in the main loop + if (BCJ2_IS_32BIT_STREAM(state)) { - Byte *dest = p->dest; - if (dest == p->destLim) + const Byte *cur = p->bufs[state]; + if (cur == p->lims[state]) return SZ_OK; - *dest = p->temp[(size_t)p->state - BCJ2_DEC_STATE_ORIG_0]; - p->state++; - p->dest = dest + 1; + p->bufs[state] = cur + 4; + { + const UInt32 ip = p->ip + 4; + v = GetBe32a(cur) - ip; + p->ip = ip; + } + state = BCJ2_DEC_STATE_ORIG_0; } - } - - /* - if (BCJ2_IS_32BIT_STREAM(p->state)) - { - const Byte *cur = p->bufs[p->state]; - if (cur == p->lims[p->state]) - return SZ_OK; - p->bufs[p->state] = cur + 4; - + if ((unsigned)(state - BCJ2_DEC_STATE_ORIG_0) < 4) { - UInt32 val; - Byte *dest; - SizeT rem; - - p->ip += 4; - val = GetBe32(cur) - p->ip; - dest = p->dest; - rem = p->destLim - dest; - if (rem < 4) + Byte *dest = p->dest; + for (;;) { - SizeT i; - SetUi32(p->temp, val); - for (i = 0; i < rem; i++) - dest[i] = p->temp[i]; - p->dest = dest 
+ rem; - p->state = BCJ2_DEC_STATE_ORIG_0 + (unsigned)rem; - return SZ_OK; + if (dest == p->destLim) + { + p->state = state; + p->temp = v; + return SZ_OK; + } + *dest++ = (Byte)v; + p->dest = dest; + if (++state == BCJ2_DEC_STATE_ORIG_3 + 1) + break; + v >>= 8; } - SetUi32(dest, val); - p->temp[3] = (Byte)(val >> 24); - p->dest = dest + 4; - p->state = BCJ2_DEC_STATE_OK; } } - */ + // src = p->bufs[BCJ2_STREAM_MAIN]; for (;;) { + /* if (BCJ2_IS_32BIT_STREAM(p->state)) p->state = BCJ2_DEC_STATE_OK; else + */ { if (p->range < kTopValue) { if (p->bufs[BCJ2_STREAM_RC] == p->lims[BCJ2_STREAM_RC]) { p->state = BCJ2_STREAM_RC; + p->temp = v; return SZ_OK; } p->range <<= 8; p->code = (p->code << 8) | *(p->bufs[BCJ2_STREAM_RC])++; } - { const Byte *src = p->bufs[BCJ2_STREAM_MAIN]; const Byte *srcLim; - Byte *dest; - SizeT num = p->lims[BCJ2_STREAM_MAIN] - src; - - if (num == 0) + Byte *dest = p->dest; { - p->state = BCJ2_STREAM_MAIN; - return SZ_OK; + const SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - src); + SizeT num = (SizeT)(p->destLim - dest); + if (num >= rem) + num = rem; + #define NUM_ITERS 4 + #if (NUM_ITERS & (NUM_ITERS - 1)) == 0 + num &= ~((SizeT)NUM_ITERS - 1); // if (NUM_ITERS == (1 << x)) + #else + num -= num % NUM_ITERS; // if (NUM_ITERS != (1 << x)) + #endif + srcLim = src + num; } - - dest = p->dest; - if (num > (SizeT)(p->destLim - dest)) + + #define NUM_SHIFT_BITS 24 + #define ONE_ITER(indx) { \ + const unsigned b = src[indx]; \ + *dest++ = (Byte)b; \ + v = (v << NUM_SHIFT_BITS) | b; \ + if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \ + if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \ + ((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \ + /* ++dest */; /* v = b; */ } + + if (src != srcLim) + for (;;) { - num = p->destLim - dest; - if (num == 0) - { - p->state = BCJ2_DEC_STATE_ORIG; - return SZ_OK; - } + /* The dependency chain of 2-cycle for (v) calculation is not big problem here. + But we can remove dependency chain with v = b in the end of loop. */ + ONE_ITER(0) + #if (NUM_ITERS > 1) + ONE_ITER(1) + #if (NUM_ITERS > 2) + ONE_ITER(2) + #if (NUM_ITERS > 3) + ONE_ITER(3) + #if (NUM_ITERS > 4) + ONE_ITER(4) + #if (NUM_ITERS > 5) + ONE_ITER(5) + #if (NUM_ITERS > 6) + ONE_ITER(6) + #if (NUM_ITERS > 7) + ONE_ITER(7) + #endif + #endif + #endif + #endif + #endif + #endif + #endif + + src += NUM_ITERS; + if (src == srcLim) + break; } - - srcLim = src + num; - if (p->temp[3] == 0x0F && (src[0] & 0xF0) == 0x80) - *dest = src[0]; - else for (;;) + if (src == srcLim) + #if (NUM_ITERS > 1) + for (;;) + #endif { - Byte b = *src; - *dest = b; - if (b != 0x0F) + #if (NUM_ITERS > 1) + if (src == p->lims[BCJ2_STREAM_MAIN] || dest == p->destLim) + #endif { - if ((b & 0xFE) == 0xE8) - break; - dest++; - if (++src != srcLim) - continue; - break; + const SizeT num = (SizeT)(src - p->bufs[BCJ2_STREAM_MAIN]); + p->bufs[BCJ2_STREAM_MAIN] = src; + p->dest = dest; + p->ip += (UInt32)num; + /* state BCJ2_STREAM_MAIN has more priority than BCJ2_STATE_ORIG */ + p->state = + src == p->lims[BCJ2_STREAM_MAIN] ? 
+ (unsigned)BCJ2_STREAM_MAIN : + (unsigned)BCJ2_DEC_STATE_ORIG; + p->temp = v; + return SZ_OK; } - dest++; - if (++src == srcLim) - break; - if ((*src & 0xF0) != 0x80) - continue; - *dest = *src; - break; + #if (NUM_ITERS > 1) + ONE_ITER(0) + src++; + #endif } - - num = src - p->bufs[BCJ2_STREAM_MAIN]; - - if (src == srcLim) + { - p->temp[3] = src[-1]; - p->bufs[BCJ2_STREAM_MAIN] = src; + const SizeT num = (SizeT)(dest - p->dest); + p->dest = dest; // p->dest += num; + p->bufs[BCJ2_STREAM_MAIN] += num; // = src; p->ip += (UInt32)num; - p->dest += num; - p->state = - p->bufs[BCJ2_STREAM_MAIN] == - p->lims[BCJ2_STREAM_MAIN] ? - (unsigned)BCJ2_STREAM_MAIN : - (unsigned)BCJ2_DEC_STATE_ORIG; - return SZ_OK; } - { UInt32 bound, ttt; - CProb *prob; - Byte b = src[0]; - Byte prev = (Byte)(num == 0 ? p->temp[3] : src[-1]); - - p->temp[3] = b; - p->bufs[BCJ2_STREAM_MAIN] = src + 1; - num++; - p->ip += (UInt32)num; - p->dest += num; - - prob = p->probs + (unsigned)(b == 0xE8 ? 2 + (unsigned)prev : (b == 0xE9 ? 1 : 0)); - - _IF_BIT_0 + CBcj2Prob *prob; // unsigned index; + /* + prob = p->probs + (unsigned)((Byte)v == 0xe8 ? + 2 + (Byte)(v >> 8) : + ((v >> 5) & 1)); // ((Byte)v < 0xe8 ? 0 : 1)); + */ { - _UPDATE_0 + const unsigned c = ((v + 0x17) >> 6) & 1; + prob = p->probs + (unsigned) + (((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1)); + // (Byte) + // 8x->0 : e9->1 : xxe8->xx+2 + // 8x->0x100 : e9->0x101 : xxe8->xx + // (((0x100 - (e & ~v)) & (0x100 | (v >> 8))) + (e & v)); + // (((0x101 + (~e | v)) & (0x100 | (v >> 8))) + (e & v)); + } + ttt = *prob; + bound = (p->range >> kNumBitModelTotalBits) * ttt; + if (p->code < bound) + { + // bcj2_stats[prob - p->probs][0]++; + p->range = bound; + *prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); continue; } - _UPDATE_1 - + { + // bcj2_stats[prob - p->probs][1]++; + p->range -= bound; + p->code -= bound; + *prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits)); + } } } } - { - UInt32 val; - unsigned cj = (p->temp[3] == 0xE8) ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP; + /* (v == 0xe8 ? 0 : 1) uses setcc instruction with additional zero register usage in x64 MSVC. */ + // const unsigned cj = ((Byte)v == 0xe8) ? 
BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP; + const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL; const Byte *cur = p->bufs[cj]; Byte *dest; SizeT rem; - if (cur == p->lims[cj]) { p->state = cj; break; } - - val = GetBe32(cur); + v = GetBe32a(cur); p->bufs[cj] = cur + 4; - - p->ip += 4; - val -= p->ip; + { + const UInt32 ip = p->ip + 4; + v -= ip; + p->ip = ip; + } dest = p->dest; - rem = p->destLim - dest; - + rem = (SizeT)(p->destLim - dest); if (rem < 4) { - p->temp[0] = (Byte)val; if (rem > 0) dest[0] = (Byte)val; val >>= 8; - p->temp[1] = (Byte)val; if (rem > 1) dest[1] = (Byte)val; val >>= 8; - p->temp[2] = (Byte)val; if (rem > 2) dest[2] = (Byte)val; val >>= 8; - p->temp[3] = (Byte)val; + if ((unsigned)rem > 0) { dest[0] = (Byte)v; v >>= 8; + if ((unsigned)rem > 1) { dest[1] = (Byte)v; v >>= 8; + if ((unsigned)rem > 2) { dest[2] = (Byte)v; v >>= 8; }}} + p->temp = v; p->dest = dest + rem; p->state = BCJ2_DEC_STATE_ORIG_0 + (unsigned)rem; break; } - - SetUi32(dest, val); - p->temp[3] = (Byte)(val >> 24); + SetUi32(dest, v) + v >>= 24; p->dest = dest + 4; } } @@ -252,6 +278,13 @@ SRes Bcj2Dec_Decode(CBcj2Dec *p) p->range <<= 8; p->code = (p->code << 8) | *(p->bufs[BCJ2_STREAM_RC])++; } - return SZ_OK; } + +#undef NUM_ITERS +#undef ONE_ITER +#undef NUM_SHIFT_BITS +#undef kTopValue +#undef kNumBitModelTotalBits +#undef kBitModelTotal +#undef kNumMoveBits diff --git a/src/sdk/C/Bcj2.h b/src/sdk/C/Bcj2.h index 8824080..4575545 100644 --- a/src/sdk/C/Bcj2.h +++ b/src/sdk/C/Bcj2.h @@ -1,8 +1,8 @@ -/* Bcj2.h -- BCJ2 Converter for x86 code -2014-11-10 : Igor Pavlov : Public domain */ +/* Bcj2.h -- BCJ2 converter for x86 code (Branch CALL/JUMP variant2) +2023-03-02 : Igor Pavlov : Public domain */ -#ifndef __BCJ2_H -#define __BCJ2_H +#ifndef ZIP7_INC_BCJ2_H +#define ZIP7_INC_BCJ2_H #include "7zTypes.h" @@ -26,37 +26,68 @@ enum BCJ2_DEC_STATE_ORIG_3, BCJ2_DEC_STATE_ORIG, - BCJ2_DEC_STATE_OK + BCJ2_DEC_STATE_ERROR /* after detected data error */ }; enum { BCJ2_ENC_STATE_ORIG = BCJ2_NUM_STREAMS, - BCJ2_ENC_STATE_OK + BCJ2_ENC_STATE_FINISHED /* it's state after fully encoded stream */ }; -#define BCJ2_IS_32BIT_STREAM(s) ((s) == BCJ2_STREAM_CALL || (s) == BCJ2_STREAM_JUMP) +/* #define BCJ2_IS_32BIT_STREAM(s) ((s) == BCJ2_STREAM_CALL || (s) == BCJ2_STREAM_JUMP) */ +#define BCJ2_IS_32BIT_STREAM(s) ((unsigned)((unsigned)(s) - (unsigned)BCJ2_STREAM_CALL) < 2) /* CBcj2Dec / CBcj2Enc bufs sizes: BUF_SIZE(n) = lims[n] - bufs[n] -bufs sizes for BCJ2_STREAM_CALL and BCJ2_STREAM_JUMP must be mutliply of 4: +bufs sizes for BCJ2_STREAM_CALL and BCJ2_STREAM_JUMP must be multiply of 4: (BUF_SIZE(BCJ2_STREAM_CALL) & 3) == 0 (BUF_SIZE(BCJ2_STREAM_JUMP) & 3) == 0 */ +// typedef UInt32 CBcj2Prob; +typedef UInt16 CBcj2Prob; + +/* +BCJ2 encoder / decoder internal requirements: + - If last bytes of stream contain marker (e8/e8/0f8x), then + there is also encoded symbol (0 : no conversion) in RC stream. + - One case of overlapped instructions is supported, + if last byte of converted instruction is (0f) and next byte is (8x): + marker [xx xx xx 0f] 8x + then the pair (0f 8x) is treated as marker. 
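
The markers referred to here are the x86 branch opcodes that BCJ2 rewrites: e8 (CALL rel32), e9 (JMP rel32), and the two-byte 0f 8x (Jcc rel32) forms. A standalone restatement of the detection test, to make the byte-level rule concrete (the e8/e9 trick is the same one used in the decoder loop; the function name is illustrative):

    #include <stdio.h>

    // does the byte pair (prev, b) end a convertible branch opcode?
    static int is_branch_marker(unsigned prev, unsigned b)
    {
        if (((b + (0x100 - 0xe8)) & 0xfe) == 0)     // b == 0xe8 or b == 0xe9
            return 1;
        return prev == 0x0f && (b & 0xf0) == 0x80;  // 0f 8x : two-byte Jcc
    }

    int main(void)
    {
        printf("%d %d %d %d\n",
            is_branch_marker(0x00, 0xe8),   // 1
            is_branch_marker(0x00, 0xe9),   // 1
            is_branch_marker(0x0f, 0x85),   // 1 (jnz rel32)
            is_branch_marker(0x00, 0x90));  // 0
        return 0;
    }
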
+*/ + +/* ---------- BCJ2 Decoder ---------- */ + /* CBcj2Dec: -dest is allowed to overlap with bufs[BCJ2_STREAM_MAIN], with the following conditions: +(dest) is allowed to overlap with bufs[BCJ2_STREAM_MAIN], with the following conditions: bufs[BCJ2_STREAM_MAIN] >= dest && - bufs[BCJ2_STREAM_MAIN] - dest >= tempReserv + + bufs[BCJ2_STREAM_MAIN] - dest >= BUF_SIZE(BCJ2_STREAM_CALL) + BUF_SIZE(BCJ2_STREAM_JUMP) - tempReserv = 0 : for first call of Bcj2Dec_Decode - tempReserv = 4 : for any other calls of Bcj2Dec_Decode - overlap with offset = 1 is not allowed + reserve = bufs[BCJ2_STREAM_MAIN] - dest - + ( BUF_SIZE(BCJ2_STREAM_CALL) + + BUF_SIZE(BCJ2_STREAM_JUMP) ) + and additional conditions: + if (it's first call of Bcj2Dec_Decode() after Bcj2Dec_Init()) + { + (reserve != 1) : if (ver < v23.00) + } + else // if there are more than one calls of Bcj2Dec_Decode() after Bcj2Dec_Init()) + { + (reserve >= 6) : if (ver < v23.00) + (reserve >= 4) : if (ver >= v23.00) + We need that (reserve) because after first call of Bcj2Dec_Decode(), + CBcj2Dec::temp can contain up to 4 bytes for writing to (dest). + } + (reserve == 0) is allowed, if we decode full stream via single call of Bcj2Dec_Decode(). + (reserve == 0) also is allowed in case of multi-call, if we use fixed buffers, + and (reserve) is calculated from full (final) sizes of all streams before first call. */ typedef struct @@ -68,21 +99,65 @@ typedef struct unsigned state; /* BCJ2_STREAM_MAIN has more priority than BCJ2_STATE_ORIG */ - UInt32 ip; - Byte temp[4]; + UInt32 ip; /* property of starting base for decoding */ + UInt32 temp; /* Byte temp[4]; */ UInt32 range; UInt32 code; - UInt16 probs[2 + 256]; + CBcj2Prob probs[2 + 256]; } CBcj2Dec; + +/* Note: + Bcj2Dec_Init() sets (CBcj2Dec::ip = 0) + if (ip != 0) property is required, the caller must set CBcj2Dec::ip after Bcj2Dec_Init() +*/ void Bcj2Dec_Init(CBcj2Dec *p); -/* Returns: SZ_OK or SZ_ERROR_DATA */ + +/* Bcj2Dec_Decode(): + returns: + SZ_OK + SZ_ERROR_DATA : if data in 5 starting bytes of BCJ2_STREAM_RC stream are not correct +*/ SRes Bcj2Dec_Decode(CBcj2Dec *p); -#define Bcj2Dec_IsFinished(_p_) ((_p_)->code == 0) +/* To check that decoding was finished you can compare + sizes of processed streams with sizes known from another sources. + You must do at least one mandatory check from the two following options: + - the check for size of processed output (ORIG) stream. + - the check for size of processed input (MAIN) stream. + additional optional checks: + - the checks for processed sizes of all input streams (MAIN, CALL, JUMP, RC) + - the checks Bcj2Dec_IsMaybeFinished*() + also before actual decoding you can check that the + following condition is met for stream sizes: + ( size(ORIG) == size(MAIN) + size(CALL) + size(JUMP) ) +*/ +/* (state == BCJ2_STREAM_MAIN) means that decoder is ready for + additional input data in BCJ2_STREAM_MAIN stream. + Note that (state == BCJ2_STREAM_MAIN) is allowed for non-finished decoding. +*/ +#define Bcj2Dec_IsMaybeFinished_state_MAIN(_p_) ((_p_)->state == BCJ2_STREAM_MAIN) +/* if the stream decoding was finished correctly, then range decoder + part of CBcj2Dec also was finished, and then (CBcj2Dec::code == 0). + Note that (CBcj2Dec::code == 0) is allowed for non-finished decoding. +*/ +#define Bcj2Dec_IsMaybeFinished_code(_p_) ((_p_)->code == 0) + +/* use Bcj2Dec_IsMaybeFinished() only as additional check + after at least one mandatory check from the two following options: + - the check for size of processed output (ORIG) stream. 
+ - the check for size of processed input (MAIN) stream. +*/ +#define Bcj2Dec_IsMaybeFinished(_p_) ( \ + Bcj2Dec_IsMaybeFinished_state_MAIN(_p_) && \ + Bcj2Dec_IsMaybeFinished_code(_p_)) + + + +/* ---------- BCJ2 Encoder ---------- */ typedef enum { @@ -91,6 +166,91 @@ typedef enum BCJ2_ENC_FINISH_MODE_END_STREAM } EBcj2Enc_FinishMode; +/* + BCJ2_ENC_FINISH_MODE_CONTINUE: + process non finished encoding. + It notifies the encoder that additional further calls + can provide more input data (src) than provided by current call. + In that case the CBcj2Enc encoder still can move (src) pointer + up to (srcLim), but CBcj2Enc encoder can store some of the last + processed bytes (up to 4 bytes) from src to internal CBcj2Enc::temp[] buffer. + at return: + (CBcj2Enc::src will point to position that includes + processed data and data copied to (temp[]) buffer) + That data from (temp[]) buffer will be used in further calls. + + BCJ2_ENC_FINISH_MODE_END_BLOCK: + finish encoding of current block (ended at srcLim) without RC flushing. + at return: if (CBcj2Enc::state == BCJ2_ENC_STATE_ORIG) && + CBcj2Enc::src == CBcj2Enc::srcLim) + : it shows that block encoding was finished. And the encoder is + ready for new (src) data or for stream finish operation. + finished block means + { + CBcj2Enc has completed block encoding up to (srcLim). + (1 + 4 bytes) or (2 + 4 bytes) CALL/JUMP cortages will + not cross block boundary at (srcLim). + temporary CBcj2Enc buffer for (ORIG) src data is empty. + 3 output uncompressed streams (MAIN, CALL, JUMP) were flushed. + RC stream was not flushed. And RC stream will cross block boundary. + } + Note: some possible implementation of BCJ2 encoder could + write branch marker (e8/e8/0f8x) in one call of Bcj2Enc_Encode(), + and it could calculate symbol for RC in another call of Bcj2Enc_Encode(). + BCJ2 encoder uses ip/fileIp/fileSize/relatLimit values to calculate RC symbol. + And these CBcj2Enc variables can have different values in different Bcj2Enc_Encode() calls. + So caller must finish each block with BCJ2_ENC_FINISH_MODE_END_BLOCK + to ensure that RC symbol is calculated and written in proper block. + + BCJ2_ENC_FINISH_MODE_END_STREAM + finish encoding of stream (ended at srcLim) fully including RC flushing. + at return: if (CBcj2Enc::state == BCJ2_ENC_STATE_FINISHED) + : it shows that stream encoding was finished fully, + and all output streams were flushed fully. + also Bcj2Enc_IsFinished() can be called. +*/ + + +/* + 32-bit relative offset in JUMP/CALL commands is + - (mod 4 GiB) for 32-bit x86 code + - signed Int32 for 64-bit x86-64 code + BCJ2 encoder also does internal relative to absolute address conversions. + And there are 2 possible ways to do it: + before v23: we used 32-bit variables and (mod 4 GiB) conversion + since v23: we use 64-bit variables and (signed Int32 offset) conversion. + The absolute address condition for conversion in v23: + ((UInt64)((Int64)ip64 - (Int64)fileIp64 + 5 + (Int32)offset) < (UInt64)fileSize64) + note that if (fileSize64 > 2 GiB). there is difference between + old (mod 4 GiB) way (v22) and new (signed Int32 offset) way (v23). + And new (v23) way is more suitable to encode 64-bit x86-64 code for (fileSize64 > 2 GiB) cases. 
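
To make that arithmetic concrete, the v23 condition can be restated as a tiny checker (the function name and sample numbers are made up; the formula is the one quoted above, with fileSize as the plain file size rather than the fileSize64_minus1 field):

    #include <stdint.h>
    #include <stdio.h>

    // would the encoder convert a CALL/JUMP at byte position ip, in a file
    // that starts at fileIp and is fileSize bytes long, whose signed 32-bit
    // relative offset is rel?
    static int would_convert(uint64_t ip, uint64_t fileIp, uint64_t fileSize, int32_t rel)
    {
        // +5 = length of the e8/e9 instruction: 1 opcode byte + 4 offset bytes
        return (uint64_t)((int64_t)ip - (int64_t)fileIp + 5 + rel) < fileSize;
    }

    int main(void)
    {
        printf("%d\n", would_convert(0x1000, 0, 0x100000, 0x200));     // target inside the file: 1
        printf("%d\n", would_convert(0x1000, 0, 0x100000, -0x20000));  // target before the file: 0
        return 0;
    }
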
+*/ + +/* +// for old (v22) way for conversion: +typedef UInt32 CBcj2Enc_ip_unsigned; +typedef Int32 CBcj2Enc_ip_signed; +#define BCJ2_ENC_FileSize_MAX ((UInt32)1 << 31) +*/ +typedef UInt64 CBcj2Enc_ip_unsigned; +typedef Int64 CBcj2Enc_ip_signed; + +/* maximum size of file that can be used for conversion condition */ +#define BCJ2_ENC_FileSize_MAX ((CBcj2Enc_ip_unsigned)0 - 2) + +/* default value of fileSize64_minus1 variable that means + that absolute address limitation will not be used */ +#define BCJ2_ENC_FileSizeField_UNLIMITED ((CBcj2Enc_ip_unsigned)0 - 1) + +/* calculate value that later can be set to CBcj2Enc::fileSize64_minus1 */ +#define BCJ2_ENC_GET_FileSizeField_VAL_FROM_FileSize(fileSize) \ + ((CBcj2Enc_ip_unsigned)(fileSize) - 1) + +/* set CBcj2Enc::fileSize64_minus1 variable from size of file */ +#define Bcj2Enc_SET_FileSize(p, fileSize) \ + (p)->fileSize64_minus1 = BCJ2_ENC_GET_FileSizeField_VAL_FROM_FileSize(fileSize); + + typedef struct { Byte *bufs[BCJ2_NUM_STREAMS]; @@ -101,45 +261,71 @@ typedef struct unsigned state; EBcj2Enc_FinishMode finishMode; - Byte prevByte; + Byte context; + Byte flushRem; + Byte isFlushState; Byte cache; UInt32 range; UInt64 low; UInt64 cacheSize; + + // UInt32 context; // for marker version, it can include marker flag. - UInt32 ip; - - /* 32-bit ralative offset in JUMP/CALL commands is - - (mod 4 GB) in 32-bit mode - - signed Int32 in 64-bit mode - We use (mod 4 GB) check for fileSize. - Use fileSize up to 2 GB, if you want to support 32-bit and 64-bit code conversion. */ - UInt32 fileIp; - UInt32 fileSize; /* (fileSize <= ((UInt32)1 << 31)), 0 means no_limit */ - UInt32 relatLimit; /* (relatLimit <= ((UInt32)1 << 31)), 0 means desable_conversion */ + /* (ip64) and (fileIp64) correspond to virtual source stream position + that doesn't include data in temp[] */ + CBcj2Enc_ip_unsigned ip64; /* current (ip) position */ + CBcj2Enc_ip_unsigned fileIp64; /* start (ip) position of current file */ + CBcj2Enc_ip_unsigned fileSize64_minus1; /* size of current file (for conversion limitation) */ + UInt32 relatLimit; /* (relatLimit <= ((UInt32)1 << 31)) : 0 means disable_conversion */ + // UInt32 relatExcludeBits; UInt32 tempTarget; - unsigned tempPos; - Byte temp[4 * 2]; - - unsigned flushPos; - - UInt16 probs[2 + 256]; + unsigned tempPos; /* the number of bytes that were copied to temp[] buffer + (tempPos <= 4) outside of Bcj2Enc_Encode() */ + // Byte temp[4]; // for marker version + Byte temp[8]; + CBcj2Prob probs[2 + 256]; } CBcj2Enc; void Bcj2Enc_Init(CBcj2Enc *p); -void Bcj2Enc_Encode(CBcj2Enc *p); -#define Bcj2Enc_Get_InputData_Size(p) ((SizeT)((p)->srcLim - (p)->src) + (p)->tempPos) -#define Bcj2Enc_IsFinished(p) ((p)->flushPos == 5) +/* +Bcj2Enc_Encode(): at exit: + p->State < BCJ2_NUM_STREAMS : we need more buffer space for output stream + (bufs[p->State] == lims[p->State]) + p->State == BCJ2_ENC_STATE_ORIG : we need more data in input src stream + (src == srcLim) + p->State == BCJ2_ENC_STATE_FINISHED : after fully encoded stream +*/ +void Bcj2Enc_Encode(CBcj2Enc *p); -#define BCJ2_RELAT_LIMIT_NUM_BITS 26 -#define BCJ2_RELAT_LIMIT ((UInt32)1 << BCJ2_RELAT_LIMIT_NUM_BITS) +/* Bcj2Enc encoder can look ahead for up 4 bytes of source stream. + CBcj2Enc::tempPos : is the number of bytes that were copied from input stream to temp[] buffer. + (CBcj2Enc::src) after Bcj2Enc_Encode() is starting position after + fully processed data and after data copied to temp buffer. 
+ So if the caller needs to get real number of fully processed input + bytes (without look ahead data in temp buffer), + the caller must subtruct (CBcj2Enc::tempPos) value from processed size + value that is calculated based on current (CBcj2Enc::src): + cur_processed_pos = Calc_Big_Processed_Pos(enc.src)) - + Bcj2Enc_Get_AvailInputSize_in_Temp(&enc); +*/ +/* get the size of input data that was stored in temp[] buffer: */ +#define Bcj2Enc_Get_AvailInputSize_in_Temp(p) ((p)->tempPos) -/* limit for CBcj2Enc::fileSize variable */ -#define BCJ2_FileSize_MAX ((UInt32)1 << 31) +#define Bcj2Enc_IsFinished(p) ((p)->flushRem == 0) + +/* Note : the decoder supports overlapping of marker (0f 80). + But we can eliminate such overlapping cases by setting + the limit for relative offset conversion as + CBcj2Enc::relatLimit <= (0x0f << 24) == (240 MiB) +*/ +/* default value for CBcj2Enc::relatLimit */ +#define BCJ2_ENC_RELAT_LIMIT_DEFAULT ((UInt32)0x0f << 24) +#define BCJ2_ENC_RELAT_LIMIT_MAX ((UInt32)1 << 31) +// #define BCJ2_RELAT_EXCLUDE_NUM_BITS 5 EXTERN_C_END diff --git a/src/sdk/C/Bcj2Enc.c b/src/sdk/C/Bcj2Enc.c index bfbeb8e..79460bb 100644 --- a/src/sdk/C/Bcj2Enc.c +++ b/src/sdk/C/Bcj2Enc.c @@ -1,60 +1,62 @@ -/* Bcj2Enc.c -- BCJ2 Encoder (Converter for x86 code) -2019-02-02 : Igor Pavlov : Public domain */ +/* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2) +2023-04-02 : Igor Pavlov : Public domain */ #include "Precomp.h" /* #define SHOW_STAT */ - #ifdef SHOW_STAT #include -#define PRF(x) x +#define PRF2(s) printf("%s ip=%8x tempPos=%d src= %8x\n", s, (unsigned)p->ip64, p->tempPos, (unsigned)(p->srcLim - p->src)); #else -#define PRF(x) +#define PRF2(s) #endif -#include - #include "Bcj2.h" #include "CpuArch.h" -#define CProb UInt16 - #define kTopValue ((UInt32)1 << 24) -#define kNumModelBits 11 -#define kBitModelTotal (1 << kNumModelBits) +#define kNumBitModelTotalBits 11 +#define kBitModelTotal (1 << kNumBitModelTotalBits) #define kNumMoveBits 5 void Bcj2Enc_Init(CBcj2Enc *p) { unsigned i; - - p->state = BCJ2_ENC_STATE_OK; + p->state = BCJ2_ENC_STATE_ORIG; p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE; - - p->prevByte = 0; - + p->context = 0; + p->flushRem = 5; + p->isFlushState = 0; p->cache = 0; - p->range = 0xFFFFFFFF; + p->range = 0xffffffff; p->low = 0; p->cacheSize = 1; - - p->ip = 0; - - p->fileIp = 0; - p->fileSize = 0; - p->relatLimit = BCJ2_RELAT_LIMIT; - + p->ip64 = 0; + p->fileIp64 = 0; + p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED; + p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT; + // p->relatExcludeBits = 0; p->tempPos = 0; - - p->flushPos = 0; - for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++) p->probs[i] = kBitModelTotal >> 1; } -static BoolInt MY_FAST_CALL RangeEnc_ShiftLow(CBcj2Enc *p) +// Z7_NO_INLINE +Z7_FORCE_INLINE +static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p) { - if ((UInt32)p->low < (UInt32)0xFF000000 || (UInt32)(p->low >> 32) != 0) + const UInt32 low = (UInt32)p->low; + const unsigned high = (unsigned) + #if defined(Z7_MSC_VER_ORIGINAL) \ + && defined(MY_CPU_X86) \ + && defined(MY_CPU_LE) \ + && !defined(MY_CPU_64BIT) + // we try to rid of __aullshr() call in MSVS-x86 + (((const UInt32 *)&p->low)[1]); // [1] : for little-endian only + #else + (p->low >> 32); + #endif + if (low < (UInt32)0xff000000 || high != 0) { Byte *buf = p->bufs[BCJ2_STREAM_RC]; do @@ -65,247 +67,440 @@ static BoolInt MY_FAST_CALL RangeEnc_ShiftLow(CBcj2Enc *p) p->bufs[BCJ2_STREAM_RC] = buf; return True; } - *buf++ = (Byte)(p->cache + 
(Byte)(p->low >> 32)); - p->cache = 0xFF; + *buf++ = (Byte)(p->cache + high); + p->cache = 0xff; } while (--p->cacheSize); p->bufs[BCJ2_STREAM_RC] = buf; - p->cache = (Byte)((UInt32)p->low >> 24); + p->cache = (Byte)(low >> 24); } p->cacheSize++; - p->low = (UInt32)p->low << 8; + p->low = low << 8; return False; } -static void Bcj2Enc_Encode_2(CBcj2Enc *p) -{ - if (BCJ2_IS_32BIT_STREAM(p->state)) + +/* +We can use 2 alternative versions of code: +1) non-marker version: + Byte CBcj2Enc::context + Byte temp[8]; + Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer. + Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction + with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call. + +2) marker version: + UInt32 CBcj2Enc::context + Byte CBcj2Enc::temp[4]; + MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains finded marker. + it's allowed that + one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest, + and another call of Bcj2Enc_Encode_2() does offset conversion. + So different values of (fileIp) and (fileSize) are possible + in these different Bcj2Enc_Encode_2() calls. + +Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop. +So we use non-marker version. +*/ + +/* + Corner cases with overlap in multi-block. + before v23: there was one corner case, where converted instruction + could start in one sub-stream and finish in next sub-stream. + If multi-block (solid) encoding is used, + and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream. + and (0f) is last byte of previous sub-stream + and (8x) is first byte of current sub-stream + then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder. + BCJ2 encoder can converts 32-bit offset for that (0f 8x) cortage, + if that offset meets limit requirements. + If encoder allows 32-bit offset conversion for such overlap case, + then the data in 3 uncompressed BCJ2 streams for some sub-stream + can depend from data of previous sub-stream. + That corner case is not big problem, and it's rare case. + Since v23.00 we do additional check to prevent conversions in such overlap cases. +*/ + +/* + Bcj2Enc_Encode_2() output variables at exit: { - Byte *cur = p->bufs[p->state]; - if (cur == p->lims[p->state]) - return; - SetBe32(cur, p->tempTarget); - p->bufs[p->state] = cur + 4; + if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG)) + { + it means that encoder needs more input data. + if (p->srcLim == p->src) at exit, then + { + (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM) + all input data were read and processed, and we are ready for + new input data. + } + else + { + (p->srcLim != p->src) + (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE) + The encoder have found e8/e9/0f_8x marker, + and p->src points to last byte of that marker, + Bcj2Enc_Encode_2() needs more input data to get totally + 5 bytes (last byte of marker and 32-bit branch offset) + as continuous array starting from p->src. + (p->srcLim - p->src < 5) requirement is met after exit. + So non-processed resedue from p->src to p->srcLim is always less than 5 bytes. 
+ } + } } +*/ - p->state = BCJ2_ENC_STATE_ORIG; - - for (;;) +Z7_NO_INLINE +static void Bcj2Enc_Encode_2(CBcj2Enc *p) +{ + if (!p->isFlushState) { - if (p->range < kTopValue) + const Byte *src; + UInt32 v; { - if (RangeEnc_ShiftLow(p)) - return; - p->range <<= 8; + const unsigned state = p->state; + if (BCJ2_IS_32BIT_STREAM(state)) + { + Byte *cur = p->bufs[state]; + if (cur == p->lims[state]) + return; + SetBe32a(cur, p->tempTarget) + p->bufs[state] = cur + 4; + } } + p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit + src = p->src; + v = p->context; + + // #define WRITE_CONTEXT p->context = v; // for marker version + #define WRITE_CONTEXT p->context = (Byte)v; + #define WRITE_CONTEXT_AND_SRC p->src = src; WRITE_CONTEXT + for (;;) { + // const Byte *src; + // UInt32 v; + CBcj2Enc_ip_unsigned ip; + if (p->range < kTopValue) + { + // to reduce register pressure and code size: we save and restore local variables. + WRITE_CONTEXT_AND_SRC + if (Bcj2_RangeEnc_ShiftLow(p)) + return; + p->range <<= 8; + src = p->src; + v = p->context; + } + // src = p->src; + // #define MARKER_FLAG ((UInt32)1 << 17) + // if ((v & MARKER_FLAG) == 0) // for marker version { - const Byte *src = p->src; const Byte *srcLim; - Byte *dest; - SizeT num = p->srcLim - src; - - if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE) + Byte *dest = p->bufs[BCJ2_STREAM_MAIN]; { - if (num <= 4) - return; - num -= 4; + const SizeT remSrc = (SizeT)(p->srcLim - src); + SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest); + if (rem >= remSrc) + rem = remSrc; + srcLim = src + rem; } - else if (num == 0) - break; - - dest = p->bufs[BCJ2_STREAM_MAIN]; - if (num > (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest)) + /* p->context contains context of previous byte: + bits [0 : 7] : src[-1], if (src) was changed in this call + bits [8 : 31] : are undefined for non-marker version + */ + // v = p->context; + #define NUM_SHIFT_BITS 24 + #define CONV_FLAG ((UInt32)1 << 16) + #define ONE_ITER { \ + b = src[0]; \ + *dest++ = (Byte)b; \ + v = (v << NUM_SHIFT_BITS) | b; \ + if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \ + if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \ + ((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \ + src++; if (src == srcLim) { break; } } + + if (src != srcLim) + for (;;) { - num = p->lims[BCJ2_STREAM_MAIN] - dest; - if (num == 0) - { - p->state = BCJ2_STREAM_MAIN; - return; - } + /* clang can generate ineffective code with setne instead of two jcc instructions. + we can use 2 iterations and external (unsigned b) to avoid that ineffective code genaration. 
*/ + unsigned b; + ONE_ITER + ONE_ITER } - - srcLim = src + num; + + ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]); + p->bufs[BCJ2_STREAM_MAIN] = dest; + p->ip64 = ip; - if (p->prevByte == 0x0F && (src[0] & 0xF0) == 0x80) - *dest = src[0]; - else for (;;) + if (src == srcLim) { - Byte b = *src; - *dest = b; - if (b != 0x0F) + WRITE_CONTEXT_AND_SRC + if (src != p->srcLim) { - if ((b & 0xFE) == 0xE8) - break; - dest++; - if (++src != srcLim) - continue; - break; + p->state = BCJ2_STREAM_MAIN; + return; } - dest++; - if (++src == srcLim) - break; - if ((*src & 0xF0) != 0x80) - continue; - *dest = *src; + /* (p->src == p->srcLim) + (p->state == BCJ2_ENC_STATE_ORIG) */ + if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM) + return; + /* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM */ + // (p->flushRem == 5); + p->isFlushState = 1; break; } - - num = src - p->src; - - if (src == srcLim) - { - p->prevByte = src[-1]; - p->bufs[BCJ2_STREAM_MAIN] = dest; - p->src = src; - p->ip += (UInt32)num; - continue; - } - + src++; + // p->src = src; + } + // ip = p->ip; // for marker version + /* marker was found */ + /* (v) contains marker that was found: + bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7] + : value of src[-2] : xx/xx/0f + bits [0 : 7] : value of src[-1] : e8/e9/8x + */ + { { - Byte context = (Byte)(num == 0 ? p->prevByte : src[-1]); - BoolInt needConvert; - - p->bufs[BCJ2_STREAM_MAIN] = dest + 1; - p->ip += (UInt32)num + 1; - src++; - - needConvert = False; - + #if NUM_SHIFT_BITS != 24 + v &= ~(UInt32)CONV_FLAG; + #endif + // UInt32 relat = 0; if ((SizeT)(p->srcLim - src) >= 4) { - UInt32 relatVal = GetUi32(src); - if ((p->fileSize == 0 || (UInt32)(p->ip + 4 + relatVal - p->fileIp) < p->fileSize) - && ((relatVal + p->relatLimit) >> 1) < p->relatLimit) - needConvert = True; + /* + if (relat != 0 || (Byte)v != 0xe8) + BoolInt isBigOffset = True; + */ + const UInt32 relat = GetUi32(src); + /* + #define EXCLUDE_FLAG ((UInt32)1 << 4) + #define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0) + if (p->relatExcludeBits != 0) + { + const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1); + isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0); + } + // isBigOffset = False; // for debug + */ + ip -= p->fileIp64; + // Use the following if check, if (ip) is 64-bit: + if (ip > (((v + 0x20) >> 5) & 1)) // 23.00 : we eliminate milti-block overlap for (Of 80) and (e8/e9) + if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1) + if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit) + v |= CONV_FLAG; } - + else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE) { - UInt32 bound; - unsigned ttt; - Byte b = src[-1]; - CProb *prob = p->probs + (unsigned)(b == 0xE8 ? 2 + (unsigned)context : (b == 0xE9 ? 1 : 0)); - - ttt = *prob; - bound = (p->range >> kNumModelBits) * ttt; - - if (!needConvert) + // (p->srcLim - src < 4) + // /* + // for non-marker version + p->ip64--; // p->ip = ip - 1; + p->bufs[BCJ2_STREAM_MAIN]--; + src--; + v >>= NUM_SHIFT_BITS; + // (0 < p->srcLim - p->src <= 4) + // */ + // v |= MARKER_FLAG; // for marker version + /* (p->state == BCJ2_ENC_STATE_ORIG) */ + WRITE_CONTEXT_AND_SRC + return; + } + { + const unsigned c = ((v + 0x17) >> 6) & 1; + CBcj2Prob *prob = p->probs + (unsigned) + (((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1)); + /* + ((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) : + ((Byte)v < 0xe8 ? 
0 : 1)); // ((v >> 5) & 1)); + */ + const unsigned ttt = *prob; + const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt; + if ((v & CONV_FLAG) == 0) { + // static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy); + // v = (Byte)v; // for marker version p->range = bound; - *prob = (CProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); - p->src = src; - p->prevByte = b; + *prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); + // WRITE_CONTEXT_AND_SRC continue; } - p->low += bound; p->range -= bound; - *prob = (CProb)(ttt - (ttt >> kNumMoveBits)); - + *prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits)); + } + // p->context = src[3]; + { + // const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP); + const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL; + ip = p->ip64; + v = GetUi32(src); // relat + ip += 4; + p->ip64 = ip; + src += 4; + // p->src = src; { - UInt32 relatVal = GetUi32(src); - UInt32 absVal; - p->ip += 4; - absVal = p->ip + relatVal; - p->prevByte = src[3]; - src += 4; - p->src = src; + const UInt32 absol = (UInt32)ip + v; + Byte *cur = p->bufs[cj]; + v >>= 24; + // WRITE_CONTEXT + if (cur == p->lims[cj]) { - unsigned cj = (b == 0xE8) ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP; - Byte *cur = p->bufs[cj]; - if (cur == p->lims[cj]) - { - p->state = cj; - p->tempTarget = absVal; - return; - } - SetBe32(cur, absVal); - p->bufs[cj] = cur + 4; + p->state = cj; + p->tempTarget = absol; + WRITE_CONTEXT_AND_SRC + return; } + SetBe32a(cur, absol) + p->bufs[cj] = cur + 4; } } } } - } + } // end of loop } - if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM) - return; - - for (; p->flushPos < 5; p->flushPos++) - if (RangeEnc_ShiftLow(p)) + for (; p->flushRem != 0; p->flushRem--) + if (Bcj2_RangeEnc_ShiftLow(p)) return; - p->state = BCJ2_ENC_STATE_OK; + p->state = BCJ2_ENC_STATE_FINISHED; } +/* +BCJ2 encoder needs look ahead for up to 4 bytes in (src) buffer. +So base function Bcj2Enc_Encode_2() + in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with + (p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim) +Bcj2Enc_Encode() solves that look ahead problem by using p->temp[] buffer. + so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(), + then (p->src == p->srcLim). + And the caller's code is simpler with Bcj2Enc_Encode(). +*/ + +Z7_NO_INLINE void Bcj2Enc_Encode(CBcj2Enc *p) { - PRF(printf("\n")); - PRF(printf("---- ip = %8d tempPos = %8d src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src)); - + PRF2("\n----") if (p->tempPos != 0) { + /* extra: number of bytes that were copied from (src) to (temp) buffer in this call */ unsigned extra = 0; - + /* We will touch only minimal required number of bytes in input (src) stream. + So we will add input bytes from (src) stream to temp[] with step of 1 byte. + We don't add new bytes to temp[] before Bcj2Enc_Encode_2() call + in first loop iteration because + - previous call of Bcj2Enc_Encode() could use another (finishMode), + - previous call could finish with (p->state != BCJ2_ENC_STATE_ORIG). + the case with full temp[] buffer (p->tempPos == 4) is possible here. 
+ */ for (;;) { + // (0 < p->tempPos <= 5) // in non-marker version + /* p->src : the current src data position including extra bytes + that were copied to temp[] buffer in this call */ const Byte *src = p->src; const Byte *srcLim = p->srcLim; - EBcj2Enc_FinishMode finishMode = p->finishMode; - - p->src = p->temp; - p->srcLim = p->temp + p->tempPos; + const EBcj2Enc_FinishMode finishMode = p->finishMode; if (src != srcLim) + { + /* if there are some src data after the data copied to temp[], + then we use MODE_CONTINUE for temp data */ p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE; - - PRF(printf(" ip = %8d tempPos = %8d src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src)); - + } + p->src = p->temp; + p->srcLim = p->temp + p->tempPos; + PRF2(" ") Bcj2Enc_Encode_2(p); - { - unsigned num = (unsigned)(p->src - p->temp); - unsigned tempPos = p->tempPos - num; + const unsigned num = (unsigned)(p->src - p->temp); + const unsigned tempPos = p->tempPos - num; unsigned i; p->tempPos = tempPos; for (i = 0; i < tempPos; i++) - p->temp[i] = p->temp[(size_t)i + num]; - + p->temp[i] = p->temp[(SizeT)i + num]; + // tempPos : number of bytes in temp buffer p->src = src; p->srcLim = srcLim; p->finishMode = finishMode; - - if (p->state != BCJ2_ENC_STATE_ORIG || src == srcLim) + if (p->state != BCJ2_ENC_STATE_ORIG) + { + // (p->tempPos <= 4) // in non-marker version + /* if (the reason of exit from Bcj2Enc_Encode_2() + is not BCJ2_ENC_STATE_ORIG), + then we exit from Bcj2Enc_Encode() with same reason */ + // optional code begin : we rollback (src) and tempPos, if it's possible: + if (extra >= tempPos) + extra = tempPos; + p->src = src - extra; + p->tempPos = tempPos - extra; + // optional code end : rollback of (src) and tempPos return; - + } + /* (p->tempPos <= 4) + (p->state == BCJ2_ENC_STATE_ORIG) + so encoder needs more data than in temp[] */ + if (src == srcLim) + return; // src buffer has no more input data. + /* (src != srcLim) + so we can provide more input data from src for Bcj2Enc_Encode_2() */ if (extra >= tempPos) { - p->src = src - tempPos; + /* (extra >= tempPos) means that temp buffer contains + only data from src buffer of this call. + So now we can encode without temp buffer */ + p->src = src - tempPos; // rollback (src) p->tempPos = 0; break; } - - p->temp[tempPos] = src[0]; + // we append one additional extra byte from (src) to temp[] buffer: + p->temp[tempPos] = *src; p->tempPos = tempPos + 1; + // (0 < p->tempPos <= 5) // in non-marker version p->src = src + 1; extra++; } } } - PRF(printf("++++ ip = %8d tempPos = %8d src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src)); - + PRF2("++++") + // (p->tempPos == 0) Bcj2Enc_Encode_2(p); + PRF2("====") if (p->state == BCJ2_ENC_STATE_ORIG) { const Byte *src = p->src; - unsigned rem = (unsigned)(p->srcLim - src); - unsigned i; - for (i = 0; i < rem; i++) - p->temp[i] = src[i]; - p->tempPos = rem; - p->src = src + rem; + const Byte *srcLim = p->srcLim; + const unsigned rem = (unsigned)(srcLim - src); + /* (rem <= 4) here. + if (p->src != p->srcLim), then + - we copy non-processed bytes from (p->src) to temp[] buffer, + - we set p->src equal to p->srcLim. 
+ */ + if (rem) + { + unsigned i = 0; + p->src = srcLim; + p->tempPos = rem; + // (0 < p->tempPos <= 4) + do + p->temp[i] = src[i]; + while (++i != rem); + } + // (p->tempPos <= 4) + // (p->src == p->srcLim) } } + +#undef PRF2 +#undef CONV_FLAG +#undef MARKER_FLAG +#undef WRITE_CONTEXT +#undef WRITE_CONTEXT_AND_SRC +#undef ONE_ITER +#undef NUM_SHIFT_BITS +#undef kTopValue +#undef kNumBitModelTotalBits +#undef kBitModelTotal +#undef kNumMoveBits diff --git a/src/sdk/C/Bra.c b/src/sdk/C/Bra.c index aed17e3..e61edf8 100644 --- a/src/sdk/C/Bra.c +++ b/src/sdk/C/Bra.c @@ -1,230 +1,709 @@ -/* Bra.c -- Converters for RISC code -2017-04-04 : Igor Pavlov : Public domain */ +/* Bra.c -- Branch converters for RISC code +2024-01-20 : Igor Pavlov : Public domain */ #include "Precomp.h" -#include "CpuArch.h" #include "Bra.h" +#include "RotateDefs.h" +#include "CpuArch.h" + +#if defined(MY_CPU_SIZEOF_POINTER) \ + && ( MY_CPU_SIZEOF_POINTER == 4 \ + || MY_CPU_SIZEOF_POINTER == 8) + #define BR_CONV_USE_OPT_PC_PTR +#endif + +#ifdef BR_CONV_USE_OPT_PC_PTR +#define BR_PC_INIT pc -= (UInt32)(SizeT)p; +#define BR_PC_GET (pc + (UInt32)(SizeT)p) +#else +#define BR_PC_INIT pc += (UInt32)size; +#define BR_PC_GET (pc - (UInt32)(SizeT)(lim - p)) +// #define BR_PC_INIT +// #define BR_PC_GET (pc + (UInt32)(SizeT)(p - data)) +#endif + +#define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c; +// #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c; + +#define Z7_BRANCH_CONV(name) z7_ ## name + +#define Z7_BRANCH_FUNC_MAIN(name) \ +static \ +Z7_FORCE_INLINE \ +Z7_ATTRIB_NO_VECTOR \ +Byte *Z7_BRANCH_CONV(name)(Byte *p, SizeT size, UInt32 pc, int encoding) + +#define Z7_BRANCH_FUNC_IMP(name, m, encoding) \ +Z7_NO_INLINE \ +Z7_ATTRIB_NO_VECTOR \ +Byte *m(name)(Byte *data, SizeT size, UInt32 pc) \ + { return Z7_BRANCH_CONV(name)(data, size, pc, encoding); } \ + +#ifdef Z7_EXTRACT_ONLY +#define Z7_BRANCH_FUNCS_IMP(name) \ + Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0) +#else +#define Z7_BRANCH_FUNCS_IMP(name) \ + Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0) \ + Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_ENC_2, 1) +#endif -SizeT ARM_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) +#if defined(__clang__) +#define BR_EXTERNAL_FOR +#define BR_NEXT_ITERATION continue; +#else +#define BR_EXTERNAL_FOR for (;;) +#define BR_NEXT_ITERATION break; +#endif + +#if defined(__clang__) && (__clang_major__ >= 8) \ + || defined(__GNUC__) && (__GNUC__ >= 1000) \ + // GCC is not good for __builtin_expect() here + /* || defined(_MSC_VER) && (_MSC_VER >= 1920) */ + // #define Z7_unlikely [[unlikely]] + // #define Z7_LIKELY(x) (__builtin_expect((x), 1)) + #define Z7_UNLIKELY(x) (__builtin_expect((x), 0)) + // #define Z7_likely [[likely]] +#else + // #define Z7_LIKELY(x) (x) + #define Z7_UNLIKELY(x) (x) + // #define Z7_likely +#endif + + +Z7_BRANCH_FUNC_MAIN(BranchConv_ARM64) { - Byte *p; + // Byte *p = data; const Byte *lim; - size &= ~(size_t)3; - ip += 4; - p = data; - lim = data + size; + const UInt32 flag = (UInt32)1 << (24 - 4); + const UInt32 mask = ((UInt32)1 << 24) - (flag << 1); + size &= ~(SizeT)3; + // if (size == 0) return p; + lim = p + size; + BR_PC_INIT + pc -= 4; // because (p) will point to next instruction + + BR_EXTERNAL_FOR + { + // Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (;;) + { + UInt32 v; + if Z7_UNLIKELY(p == lim) + return p; + v = GetUi32a(p); + p += 4; + if Z7_UNLIKELY(((v - 0x94000000) & 0xfc000000) == 0) + { + UInt32 c = BR_PC_GET >> 2; + BR_CONVERT_VAL(v, c) + v &= 
0x03ffffff; + v |= 0x94000000; + SetUi32a(p - 4, v) + BR_NEXT_ITERATION + } + // v = rotlFixed(v, 8); v += (flag << 8) - 0x90; if Z7_UNLIKELY((v & ((mask << 8) + 0x9f)) == 0) + v -= 0x90000000; if Z7_UNLIKELY((v & 0x9f000000) == 0) + { + UInt32 z, c; + // v = rotrFixed(v, 8); + v += flag; if Z7_UNLIKELY(v & mask) continue; + z = (v & 0xffffffe0) | (v >> 26); + c = (BR_PC_GET >> (12 - 3)) & ~(UInt32)7; + BR_CONVERT_VAL(z, c) + v &= 0x1f; + v |= 0x90000000; + v |= z << 26; + v |= 0x00ffffe0 & ((z & (((flag << 1) - 1))) - flag); + SetUi32a(p - 4, v) + } + } + } +} +Z7_BRANCH_FUNCS_IMP(BranchConv_ARM64) - if (encoding) +Z7_BRANCH_FUNC_MAIN(BranchConv_ARM) +{ + // Byte *p = data; + const Byte *lim; + size &= ~(SizeT)3; + lim = p + size; + BR_PC_INIT + /* in ARM: branch offset is relative to the +2 instructions from current instruction. + (p) will point to next instruction */ + pc += 8 - 4; + for (;;) { for (;;) { - if (p >= lim) - return p - data; - p += 4; - if (p[-1] == 0xEB) - break; + if Z7_UNLIKELY(p >= lim) { return p; } p += 4; if Z7_UNLIKELY(p[-1] == 0xeb) break; + if Z7_UNLIKELY(p >= lim) { return p; } p += 4; if Z7_UNLIKELY(p[-1] == 0xeb) break; } { - UInt32 v = GetUi32(p - 4); - v <<= 2; - v += ip + (UInt32)(p - data); - v >>= 2; - v &= 0x00FFFFFF; - v |= 0xEB000000; - SetUi32(p - 4, v); + UInt32 v = GetUi32a(p - 4); + UInt32 c = BR_PC_GET >> 2; + BR_CONVERT_VAL(v, c) + v &= 0x00ffffff; + v |= 0xeb000000; + SetUi32a(p - 4, v) } } +} +Z7_BRANCH_FUNCS_IMP(BranchConv_ARM) + +Z7_BRANCH_FUNC_MAIN(BranchConv_PPC) +{ + // Byte *p = data; + const Byte *lim; + size &= ~(SizeT)3; + lim = p + size; + BR_PC_INIT + pc -= 4; // because (p) will point to next instruction + for (;;) { + UInt32 v; for (;;) { - if (p >= lim) - return p - data; + if Z7_UNLIKELY(p == lim) + return p; + // v = GetBe32a(p); + v = *(UInt32 *)(void *)p; p += 4; - if (p[-1] == 0xEB) - break; + // if ((v & 0xfc000003) == 0x48000001) break; + // if ((p[-4] & 0xFC) == 0x48 && (p[-1] & 3) == 1) break; + if Z7_UNLIKELY( + ((v - Z7_CONV_BE_TO_NATIVE_CONST32(0x48000001)) + & Z7_CONV_BE_TO_NATIVE_CONST32(0xfc000003)) == 0) break; } { - UInt32 v = GetUi32(p - 4); - v <<= 2; - v -= ip + (UInt32)(p - data); - v >>= 2; - v &= 0x00FFFFFF; - v |= 0xEB000000; - SetUi32(p - 4, v); + v = Z7_CONV_NATIVE_TO_BE_32(v); + { + UInt32 c = BR_PC_GET; + BR_CONVERT_VAL(v, c) + } + v &= 0x03ffffff; + v |= 0x48000000; + SetBe32a(p - 4, v) } } } +Z7_BRANCH_FUNCS_IMP(BranchConv_PPC) + +#ifdef Z7_CPU_FAST_ROTATE_SUPPORTED +#define BR_SPARC_USE_ROTATE +#endif -SizeT ARMT_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) +Z7_BRANCH_FUNC_MAIN(BranchConv_SPARC) { - Byte *p; + // Byte *p = data; const Byte *lim; - size &= ~(size_t)1; - p = data; - lim = data + size - 4; - - if (encoding) - + const UInt32 flag = (UInt32)1 << 22; + size &= ~(SizeT)3; + lim = p + size; + BR_PC_INIT + pc -= 4; // because (p) will point to next instruction for (;;) { - UInt32 b1; + UInt32 v; for (;;) { - UInt32 b3; - if (p > lim) - return p - data; - b1 = p[1]; - b3 = p[3]; - p += 2; - b1 ^= 8; - if ((b3 & b1) >= 0xF8) + if Z7_UNLIKELY(p == lim) + return p; + /* // the code without GetBe32a(): + { const UInt32 v = GetUi16a(p) & 0xc0ff; p += 4; if (v == 0x40 || v == 0xc07f) break; } + */ + v = GetBe32a(p); + p += 4; + #ifdef BR_SPARC_USE_ROTATE + v = rotlFixed(v, 2); + v += (flag << 2) - 1; + if Z7_UNLIKELY((v & (3 - (flag << 3))) == 0) + #else + v += (UInt32)5 << 29; + v ^= (UInt32)7 << 29; + v += flag; + if Z7_UNLIKELY((v & (0 - (flag << 1))) == 0) + #endif break; } { - 
UInt32 v = - ((UInt32)b1 << 19) - + (((UInt32)p[1] & 0x7) << 8) - + (((UInt32)p[-2] << 11)) - + (p[0]); - - p += 2; + // UInt32 v = GetBe32a(p - 4); + #ifndef BR_SPARC_USE_ROTATE + v <<= 2; + #endif { - UInt32 cur = (ip + (UInt32)(p - data)) >> 1; - v += cur; + UInt32 c = BR_PC_GET; + BR_CONVERT_VAL(v, c) } - - p[-4] = (Byte)(v >> 11); - p[-3] = (Byte)(0xF0 | ((v >> 19) & 0x7)); - p[-2] = (Byte)v; - p[-1] = (Byte)(0xF8 | (v >> 8)); + v &= (flag << 3) - 1; + #ifdef BR_SPARC_USE_ROTATE + v -= (flag << 2) - 1; + v = rotrFixed(v, 2); + #else + v -= (flag << 2); + v >>= 2; + v |= (UInt32)1 << 30; + #endif + SetBe32a(p - 4, v) } } +} +Z7_BRANCH_FUNCS_IMP(BranchConv_SPARC) + + +Z7_BRANCH_FUNC_MAIN(BranchConv_ARMT) +{ + // Byte *p = data; + Byte *lim; + size &= ~(SizeT)1; + // if (size == 0) return p; + if (size <= 2) return p; + size -= 2; + lim = p + size; + BR_PC_INIT + /* in ARM: branch offset is relative to the +2 instructions from current instruction. + (p) will point to the +2 instructions from current instruction */ + // pc += 4 - 4; + // if (encoding) pc -= 0xf800 << 1; else pc += 0xf800 << 1; + // #define ARMT_TAIL_PROC { goto armt_tail; } + #define ARMT_TAIL_PROC { return p; } - for (;;) + do { - UInt32 b1; + /* in MSVC 32-bit x86 compilers: + UInt32 version : it loads value from memory with movzx + Byte version : it loads value to 8-bit register (AL/CL) + movzx version is slightly faster in some cpus + */ + unsigned b1; + // Byte / unsigned + b1 = p[1]; + // optimized version to reduce one (p >= lim) check: + // unsigned a1 = p[1]; b1 = p[3]; p += 2; if Z7_LIKELY((b1 & (a1 ^ 8)) < 0xf8) for (;;) { - UInt32 b3; - if (p > lim) - return p - data; - b1 = p[1]; - b3 = p[3]; - p += 2; - b1 ^= 8; - if ((b3 & b1) >= 0xF8) - break; + unsigned b3; // Byte / UInt32 + /* (Byte)(b3) normalization can use low byte computations in MSVC. + It gives smaller code, and no loss of speed in some compilers/cpus. + But new MSVC 32-bit x86 compilers use more slow load + from memory to low byte register in that case. + So we try to use full 32-bit computations for faster code. + */ + // if (p >= lim) { ARMT_TAIL_PROC } b3 = b1 + 8; b1 = p[3]; p += 2; if ((b3 & b1) >= 0xf8) break; + if Z7_UNLIKELY(p >= lim) { ARMT_TAIL_PROC } b3 = p[3]; p += 2; if Z7_UNLIKELY((b3 & (b1 ^ 8)) >= 0xf8) break; + if Z7_UNLIKELY(p >= lim) { ARMT_TAIL_PROC } b1 = p[3]; p += 2; if Z7_UNLIKELY((b1 & (b3 ^ 8)) >= 0xf8) break; } { + /* we can adjust pc for (0xf800) to rid of (& 0x7FF) operation. + But gcc/clang for arm64 can use bfi instruction for full code here */ UInt32 v = - ((UInt32)b1 << 19) + ((UInt32)GetUi16a(p - 2) << 11) | + ((UInt32)GetUi16a(p) & 0x7FF); + /* + UInt32 v = + ((UInt32)p[1 - 2] << 19) + (((UInt32)p[1] & 0x7) << 8) + (((UInt32)p[-2] << 11)) + (p[0]); - + */ p += 2; { - UInt32 cur = (ip + (UInt32)(p - data)) >> 1; - v -= cur; + UInt32 c = BR_PC_GET >> 1; + BR_CONVERT_VAL(v, c) } - + SetUi16a(p - 4, (UInt16)(((v >> 11) & 0x7ff) | 0xf000)) + SetUi16a(p - 2, (UInt16)(v | 0xf800)) /* - SetUi16(p - 4, (UInt16)(((v >> 11) & 0x7FF) | 0xF000)); - SetUi16(p - 2, (UInt16)(v | 0xF800)); - */ - p[-4] = (Byte)(v >> 11); - p[-3] = (Byte)(0xF0 | ((v >> 19) & 0x7)); + p[-3] = (Byte)(0xf0 | ((v >> 19) & 0x7)); p[-2] = (Byte)v; - p[-1] = (Byte)(0xF8 | (v >> 8)); + p[-1] = (Byte)(0xf8 | (v >> 8)); + */ } } + while (p < lim); + return p; + // armt_tail: + // if ((Byte)((lim[1] & 0xf8)) != 0xf0) { lim += 2; } return lim; + // return (Byte *)(lim + ((Byte)((lim[1] ^ 0xf0) & 0xf8) == 0 ? 
0 : 2)); + // return (Byte *)(lim + (((lim[1] ^ ~0xfu) & ~7u) == 0 ? 0 : 2)); + // return (Byte *)(lim + 2 - (((((unsigned)lim[1] ^ 8) + 8) >> 7) & 2)); } +Z7_BRANCH_FUNCS_IMP(BranchConv_ARMT) -SizeT PPC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) +// #define BR_IA64_NO_INLINE + +Z7_BRANCH_FUNC_MAIN(BranchConv_IA64) { - Byte *p; + // Byte *p = data; const Byte *lim; - size &= ~(size_t)3; - ip -= 4; - p = data; - lim = data + size; - + size &= ~(SizeT)15; + lim = p + size; + pc -= 1 << 4; + pc >>= 4 - 1; + // pc -= 1 << 1; + for (;;) { + unsigned m; for (;;) { - if (p >= lim) - return p - data; - p += 4; - /* if ((v & 0xFC000003) == 0x48000001) */ - if ((p[-4] & 0xFC) == 0x48 && (p[-1] & 3) == 1) + if Z7_UNLIKELY(p == lim) + return p; + m = (unsigned)((UInt32)0x334b0000 >> (*p & 0x1e)); + p += 16; + pc += 1 << 1; + if (m &= 3) break; } { - UInt32 v = GetBe32(p - 4); - if (encoding) - v += ip + (UInt32)(p - data); - else - v -= ip + (UInt32)(p - data); - v &= 0x03FFFFFF; - v |= 0x48000000; - SetBe32(p - 4, v); + p += (ptrdiff_t)m * 5 - 20; // negative value is expected here. + do + { + const UInt32 t = + #if defined(MY_CPU_X86_OR_AMD64) + // we use 32-bit load here to reduce code size on x86: + GetUi32(p); + #else + GetUi16(p); + #endif + UInt32 z = GetUi32(p + 1) >> m; + p += 5; + if (((t >> m) & (0x70 << 1)) == 0 + && ((z - (0x5000000 << 1)) & (0xf000000 << 1)) == 0) + { + UInt32 v = (UInt32)((0x8fffff << 1) | 1) & z; + z ^= v; + #ifdef BR_IA64_NO_INLINE + v |= (v & ((UInt32)1 << (23 + 1))) >> 3; + { + UInt32 c = pc; + BR_CONVERT_VAL(v, c) + } + v &= (0x1fffff << 1) | 1; + #else + { + if (encoding) + { + // pc &= ~(0xc00000 << 1); // we just need to clear at least 2 bits + pc &= (0x1fffff << 1) | 1; + v += pc; + } + else + { + // pc |= 0xc00000 << 1; // we need to set at least 2 bits + pc |= ~(UInt32)((0x1fffff << 1) | 1); + v -= pc; + } + } + v &= ~(UInt32)(0x600000 << 1); + #endif + v += (0x700000 << 1); + v &= (0x8fffff << 1) | 1; + z |= v; + z <<= m; + SetUi32(p + 1 - 5, z) + } + m++; + } + while (m &= 3); // while (m < 4); } } } +Z7_BRANCH_FUNCS_IMP(BranchConv_IA64) + + +#define BR_CONVERT_VAL_ENC(v) v += BR_PC_GET; +#define BR_CONVERT_VAL_DEC(v) v -= BR_PC_GET; +#if 1 && defined(MY_CPU_LE_UNALIGN) + #define RISCV_USE_UNALIGNED_LOAD +#endif -SizeT SPARC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) +#ifdef RISCV_USE_UNALIGNED_LOAD + #define RISCV_GET_UI32(p) GetUi32(p) + #define RISCV_SET_UI32(p, v) { SetUi32(p, v) } +#else + #define RISCV_GET_UI32(p) \ + ((UInt32)GetUi16a(p) + \ + ((UInt32)GetUi16a((p) + 2) << 16)) + #define RISCV_SET_UI32(p, v) { \ + SetUi16a(p, (UInt16)(v)) \ + SetUi16a((p) + 2, (UInt16)(v >> 16)) } +#endif + +#if 1 && defined(MY_CPU_LE) + #define RISCV_USE_16BIT_LOAD +#endif + +#ifdef RISCV_USE_16BIT_LOAD + #define RISCV_LOAD_VAL(p) GetUi16a(p) +#else + #define RISCV_LOAD_VAL(p) (*(p)) +#endif + +#define RISCV_INSTR_SIZE 2 +#define RISCV_STEP_1 (4 + RISCV_INSTR_SIZE) +#define RISCV_STEP_2 4 +#define RISCV_REG_VAL (2 << 7) +#define RISCV_CMD_VAL 3 +#if 1 + // for code size optimization: + #define RISCV_DELTA_7F 0x7f +#else + #define RISCV_DELTA_7F 0 +#endif + +#define RISCV_CHECK_1(v, b) \ + (((((b) - RISCV_CMD_VAL) ^ ((v) << 8)) & (0xf8000 + RISCV_CMD_VAL)) == 0) + +#if 1 + #define RISCV_CHECK_2(v, r) \ + ((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL | 8)) \ + << 18) \ + < ((r) & 0x1d)) +#else + // this branch gives larger code, because + // compilers generate larger code for big constants. 
+ #define RISCV_CHECK_2(v, r) \ + ((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \ + & ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \ + < ((r) & 0x1d)) +#endif + + +#define RISCV_SCAN_LOOP \ + Byte *lim; \ + size &= ~(SizeT)(RISCV_INSTR_SIZE - 1); \ + if (size <= 6) return p; \ + size -= 6; \ + lim = p + size; \ + BR_PC_INIT \ + for (;;) \ + { \ + UInt32 a, v; \ + /* Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE */ \ + for (;;) \ + { \ + if Z7_UNLIKELY(p >= lim) { return p; } \ + a = (RISCV_LOAD_VAL(p) ^ 0x10u) + 1; \ + if ((a & 0x77) == 0) break; \ + a = (RISCV_LOAD_VAL(p + RISCV_INSTR_SIZE) ^ 0x10u) + 1; \ + p += RISCV_INSTR_SIZE * 2; \ + if ((a & 0x77) == 0) \ + { \ + p -= RISCV_INSTR_SIZE; \ + if Z7_UNLIKELY(p >= lim) { return p; } \ + break; \ + } \ + } +// (xx6f ^ 10) + 1 = xx7f + 1 = xx80 : JAL +// (xxef ^ 10) + 1 = xxff + 1 = xx00 + 100 : JAL +// (xx17 ^ 10) + 1 = xx07 + 1 = xx08 : AUIPC +// (xx97 ^ 10) + 1 = xx87 + 1 = xx88 : AUIPC + +Byte * Z7_BRANCH_CONV_ENC(RISCV)(Byte *p, SizeT size, UInt32 pc) { - Byte *p; - const Byte *lim; - size &= ~(size_t)3; - ip -= 4; - p = data; - lim = data + size; + RISCV_SCAN_LOOP + v = a; + a = RISCV_GET_UI32(p); +#ifndef RISCV_USE_16BIT_LOAD + v += (UInt32)p[1] << 8; +#endif - for (;;) - { - for (;;) + if ((v & 8) == 0) // JAL { - if (p >= lim) - return p - data; - /* - v = GetBe32(p); - p += 4; - m = v + ((UInt32)5 << 29); - m ^= (UInt32)7 << 29; - m += (UInt32)1 << 22; - if ((m & ((UInt32)0x1FF << 23)) == 0) - break; - */ + if ((v - (0x100 /* - RISCV_DELTA_7F */)) & 0xd80) + { + p += RISCV_INSTR_SIZE; + continue; + } + { + v = ((a & 1u << 31) >> 11) + | ((a & 0x3ff << 21) >> 20) + | ((a & 1 << 20) >> 9) + | (a & 0xff << 12); + BR_CONVERT_VAL_ENC(v) + // ((v & 1) == 0) + // v: bits [1 : 20] contain offset bits +#if 0 && defined(RISCV_USE_UNALIGNED_LOAD) + a &= 0xfff; + a |= ((UInt32)(v << 23)) + | ((UInt32)(v << 7) & ((UInt32)0xff << 16)) + | ((UInt32)(v >> 5) & ((UInt32)0xf0 << 8)); + RISCV_SET_UI32(p, a) +#else // aligned +#if 0 + SetUi16a(p, (UInt16)(((v >> 5) & 0xf000) | (a & 0xfff))) +#else + p[1] = (Byte)(((v >> 13) & 0xf0) | ((a >> 8) & 0xf)); +#endif + +#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) + v <<= 15; + v = Z7_BSWAP32(v); + SetUi16a(p + 2, (UInt16)v) +#else + p[2] = (Byte)(v >> 9); + p[3] = (Byte)(v >> 1); +#endif +#endif // aligned + } p += 4; - if ((p[-4] == 0x40 && (p[-3] & 0xC0) == 0) || - (p[-4] == 0x7F && (p[-3] >= 0xC0))) - break; + continue; + } // JAL + + { + // AUIPC + if (v & 0xe80) // (not x0) and (not x2) + { + const UInt32 b = RISCV_GET_UI32(p + 4); + if (RISCV_CHECK_1(v, b)) + { + { + const UInt32 temp = (b << 12) | (0x17 + RISCV_REG_VAL); + RISCV_SET_UI32(p, temp) + } + a &= 0xfffff000; + { +#if 1 + const int t = -1 >> 1; + if (t != -1) + a += (b >> 20) - ((b >> 19) & 0x1000); // arithmetic right shift emulation + else +#endif + a += (UInt32)((Int32)b >> 20); // arithmetic right shift (sign-extension). 
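          /* Both branches add the same sign-extended top-12-bit value of (b);
             the first one only emulates the arithmetic shift for compilers where
             (-1 >> 1) is not -1. Worked example for b = 0xfff00000:
               (b >> 20) - ((b >> 19) & 0x1000) = 0xfff - 0x1000 = 0xffffffff
               (UInt32)((Int32)b >> 20)         = (UInt32)(-1)   = 0xffffffff */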
+ } + BR_CONVERT_VAL_ENC(a) +#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) + a = Z7_BSWAP32(a); + RISCV_SET_UI32(p + 4, a) +#else + SetBe32(p + 4, a) +#endif + p += 8; + } + else + p += RISCV_STEP_1; + } + else + { + UInt32 r = a >> 27; + if (RISCV_CHECK_2(v, r)) + { + v = RISCV_GET_UI32(p + 4); + r = (r << 7) + 0x17 + (v & 0xfffff000); + a = (a >> 12) | (v << 20); + RISCV_SET_UI32(p, r) + RISCV_SET_UI32(p + 4, a) + p += 8; + } + else + p += RISCV_STEP_2; + } } + } // for +} + + +Byte * Z7_BRANCH_CONV_DEC(RISCV)(Byte *p, SizeT size, UInt32 pc) +{ + RISCV_SCAN_LOOP +#ifdef RISCV_USE_16BIT_LOAD + if ((a & 8) == 0) { - UInt32 v = GetBe32(p - 4); - v <<= 2; - if (encoding) - v += ip + (UInt32)(p - data); +#else + v = a; + a += (UInt32)p[1] << 8; + if ((v & 8) == 0) + { +#endif + // JAL + a -= 0x100 - RISCV_DELTA_7F; + if (a & 0xd80) + { + p += RISCV_INSTR_SIZE; + continue; + } + { + const UInt32 a_old = (a + (0xef - RISCV_DELTA_7F)) & 0xfff; +#if 0 // unaligned + a = GetUi32(p); + v = (UInt32)(a >> 23) & ((UInt32)0xff << 1) + | (UInt32)(a >> 7) & ((UInt32)0xff << 9) +#elif 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) + v = GetUi16a(p + 2); + v = Z7_BSWAP32(v) >> 15 +#else + v = (UInt32)p[3] << 1 + | (UInt32)p[2] << 9 +#endif + | (UInt32)((a & 0xf000) << 5); + BR_CONVERT_VAL_DEC(v) + a = a_old + | (v << 11 & 1u << 31) + | (v << 20 & 0x3ff << 21) + | (v << 9 & 1 << 20) + | (v & 0xff << 12); + RISCV_SET_UI32(p, a) + } + p += 4; + continue; + } // JAL + + { + // AUIPC + v = a; +#if 1 && defined(RISCV_USE_UNALIGNED_LOAD) + a = GetUi32(p); +#else + a |= (UInt32)GetUi16a(p + 2) << 16; +#endif + if ((v & 0xe80) == 0) // x0/x2 + { + const UInt32 r = a >> 27; + if (RISCV_CHECK_2(v, r)) + { + UInt32 b; +#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) + b = RISCV_GET_UI32(p + 4); + b = Z7_BSWAP32(b); +#else + b = GetBe32(p + 4); +#endif + v = a >> 12; + BR_CONVERT_VAL_DEC(b) + a = (r << 7) + 0x17; + a += (b + 0x800) & 0xfffff000; + v |= b << 20; + RISCV_SET_UI32(p, a) + RISCV_SET_UI32(p + 4, v) + p += 8; + } + else + p += RISCV_STEP_2; + } else - v -= ip + (UInt32)(p - data); - - v &= 0x01FFFFFF; - v -= (UInt32)1 << 24; - v ^= 0xFF000000; - v >>= 2; - v |= 0x40000000; - SetBe32(p - 4, v); + { + const UInt32 b = RISCV_GET_UI32(p + 4); + if (!RISCV_CHECK_1(v, b)) + p += RISCV_STEP_1; + else + { + v = (a & 0xfffff000) | (b >> 20); + a = (b << 12) | (0x17 + RISCV_REG_VAL); + RISCV_SET_UI32(p, a) + RISCV_SET_UI32(p + 4, v) + p += 8; + } + } } - } + } // for } diff --git a/src/sdk/C/Bra.h b/src/sdk/C/Bra.h index 855e37a..b47112c 100644 --- a/src/sdk/C/Bra.h +++ b/src/sdk/C/Bra.h @@ -1,64 +1,105 @@ /* Bra.h -- Branch converters for executables -2013-01-18 : Igor Pavlov : Public domain */ +2024-01-20 : Igor Pavlov : Public domain */ -#ifndef __BRA_H -#define __BRA_H +#ifndef ZIP7_INC_BRA_H +#define ZIP7_INC_BRA_H #include "7zTypes.h" EXTERN_C_BEGIN +/* #define PPC BAD_PPC_11 // for debug */ + +#define Z7_BRANCH_CONV_DEC_2(name) z7_ ## name ## _Dec +#define Z7_BRANCH_CONV_ENC_2(name) z7_ ## name ## _Enc +#define Z7_BRANCH_CONV_DEC(name) Z7_BRANCH_CONV_DEC_2(BranchConv_ ## name) +#define Z7_BRANCH_CONV_ENC(name) Z7_BRANCH_CONV_ENC_2(BranchConv_ ## name) +#define Z7_BRANCH_CONV_ST_DEC(name) z7_BranchConvSt_ ## name ## _Dec +#define Z7_BRANCH_CONV_ST_ENC(name) z7_BranchConvSt_ ## name ## _Enc + +#define Z7_BRANCH_CONV_DECL(name) Byte * name(Byte *data, SizeT size, UInt32 pc) +#define Z7_BRANCH_CONV_ST_DECL(name) Byte * name(Byte *data, SizeT size, UInt32 
pc, UInt32 *state) + +typedef Z7_BRANCH_CONV_DECL( (*z7_Func_BranchConv)); +typedef Z7_BRANCH_CONV_ST_DECL((*z7_Func_BranchConvSt)); + +#define Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL 0 +Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_DEC(X86)); +Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_ENC(X86)); + +#define Z7_BRANCH_FUNCS_DECL(name) \ +Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_DEC_2(name)); \ +Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_ENC_2(name)); + +Z7_BRANCH_FUNCS_DECL (BranchConv_ARM64) +Z7_BRANCH_FUNCS_DECL (BranchConv_ARM) +Z7_BRANCH_FUNCS_DECL (BranchConv_ARMT) +Z7_BRANCH_FUNCS_DECL (BranchConv_PPC) +Z7_BRANCH_FUNCS_DECL (BranchConv_SPARC) +Z7_BRANCH_FUNCS_DECL (BranchConv_IA64) +Z7_BRANCH_FUNCS_DECL (BranchConv_RISCV) + /* -These functions convert relative addresses to absolute addresses -in CALL instructions to increase the compression ratio. - - In: - data - data buffer - size - size of data - ip - current virtual Instruction Pinter (IP) value - state - state variable for x86 converter - encoding - 0 (for decoding), 1 (for encoding) - - Out: - state - state variable for x86 converter +These functions convert data that contain CPU instructions. +Each such function converts relative addresses to absolute addresses in some +branch instructions: CALL (in all converters) and JUMP (X86 converter only). +Such conversion allows to increase compression ratio, if we compress that data. + +There are 2 types of converters: + Byte * Conv_RISC (Byte *data, SizeT size, UInt32 pc); + Byte * ConvSt_X86(Byte *data, SizeT size, UInt32 pc, UInt32 *state); +Each Converter supports 2 versions: one for encoding +and one for decoding (_Enc/_Dec postfixes in function name). - Returns: - The number of processed bytes. If you call these functions with multiple calls, - you must start next call with first byte after block of processed bytes. +In params: + data : data buffer + size : size of data + pc : current virtual Program Counter (Instruction Pointer) value +In/Out param: + state : pointer to state variable (for X86 converter only) + +Return: + The pointer to position in (data) buffer after last byte that was processed. + If the caller calls converter again, it must call it starting with that position. + But the caller is allowed to move data in buffer. So pointer to + current processed position also will be changed for next call. + Also the caller must increase internal (pc) value for next call. +Each converter has some characteristics: Endian, Alignment, LookAhead. Type Endian Alignment LookAhead - x86 little 1 4 + X86 little 1 4 ARMT little 2 2 + RISCV little 2 6 ARM little 4 0 + ARM64 little 4 0 PPC big 4 0 SPARC big 4 0 IA64 little 16 0 - size must be >= Alignment + LookAhead, if it's not last block. - If (size < Alignment + LookAhead), converter returns 0. - - Example: + (data) must be aligned for (Alignment). + processed size can be calculated as: + SizeT processed = Conv(data, size, pc) - data; + if (processed == 0) + it means that converter needs more data for processing. + If (size < Alignment + LookAhead) + then (processed == 0) is allowed. 
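A minimal caller-side sketch of the pointer-returning interface documented here, assuming Bra.h is on the include path and the whole filtered block is in memory at once. z7_BranchConv_ARM_Enc and z7_BranchConv_ARM_Dec are the names produced by the Z7_BRANCH_FUNCS_DECL macros above; FilterArmCalls is only an illustrative wrapper name. Applying the _Dec variant with the same pc to the encoder's output restores the original bytes, which is how the decompressor undoes the filter; the in-header loop example that follows covers the streaming case where input arrives in pieces.

#include "Bra.h"

static SizeT FilterArmCalls(Byte *data, SizeT size, UInt32 pc, int encoding)
{
  /* the converters return a pointer just past the last processed byte */
  Byte *end = encoding
      ? z7_BranchConv_ARM_Enc(data, size, pc)
      : z7_BranchConv_ARM_Dec(data, size, pc);
  return (SizeT)(end - data);   /* number of processed bytes, as described above */
}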
- UInt32 ip = 0; - for () - { - ; size must be >= Alignment + LookAhead, if it's not last block - SizeT processed = Convert(data, size, ip, 1); - data += processed; - size -= processed; - ip += processed; - } +Example code for conversion in loop: + UInt32 pc = 0; + size = 0; + for (;;) + { + size += Load_more_input_data(data + size); + SizeT processed = Conv(data, size, pc) - data; + if (processed == 0 && no_more_input_data_after_size) + break; // we stop convert loop + data += processed; + size -= processed; + pc += processed; + } */ -#define x86_Convert_Init(state) { state = 0; } -SizeT x86_Convert(Byte *data, SizeT size, UInt32 ip, UInt32 *state, int encoding); -SizeT ARM_Convert(Byte *data, SizeT size, UInt32 ip, int encoding); -SizeT ARMT_Convert(Byte *data, SizeT size, UInt32 ip, int encoding); -SizeT PPC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding); -SizeT SPARC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding); -SizeT IA64_Convert(Byte *data, SizeT size, UInt32 ip, int encoding); - EXTERN_C_END #endif diff --git a/src/sdk/C/Bra86.c b/src/sdk/C/Bra86.c index 93ed4d7..d81f392 100644 --- a/src/sdk/C/Bra86.c +++ b/src/sdk/C/Bra86.c @@ -1,82 +1,187 @@ -/* Bra86.c -- Converter for x86 code (BCJ) -2017-04-03 : Igor Pavlov : Public domain */ +/* Bra86.c -- Branch converter for X86 code (BCJ) +2023-04-02 : Igor Pavlov : Public domain */ #include "Precomp.h" #include "Bra.h" +#include "CpuArch.h" -#define Test86MSByte(b) ((((b) + 1) & 0xFE) == 0) -SizeT x86_Convert(Byte *data, SizeT size, UInt32 ip, UInt32 *state, int encoding) +#if defined(MY_CPU_SIZEOF_POINTER) \ + && ( MY_CPU_SIZEOF_POINTER == 4 \ + || MY_CPU_SIZEOF_POINTER == 8) + #define BR_CONV_USE_OPT_PC_PTR +#endif + +#ifdef BR_CONV_USE_OPT_PC_PTR +#define BR_PC_INIT pc -= (UInt32)(SizeT)p; // (MY_uintptr_t) +#define BR_PC_GET (pc + (UInt32)(SizeT)p) +#else +#define BR_PC_INIT pc += (UInt32)size; +#define BR_PC_GET (pc - (UInt32)(SizeT)(lim - p)) +// #define BR_PC_INIT +// #define BR_PC_GET (pc + (UInt32)(SizeT)(p - data)) +#endif + +#define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c; +// #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c; + +#define Z7_BRANCH_CONV_ST(name) z7_BranchConvSt_ ## name + +#define BR86_NEED_CONV_FOR_MS_BYTE(b) ((((b) + 1) & 0xfe) == 0) + +#ifdef MY_CPU_LE_UNALIGN + #define BR86_PREPARE_BCJ_SCAN const UInt32 v = GetUi32(p) ^ 0xe8e8e8e8; + #define BR86_IS_BCJ_BYTE(n) ((v & ((UInt32)0xfe << (n) * 8)) == 0) +#else + #define BR86_PREPARE_BCJ_SCAN + // bad for MSVC X86 (partial write to byte reg): + #define BR86_IS_BCJ_BYTE(n) ((p[n - 4] & 0xfe) == 0xe8) + // bad for old MSVC (partial write to byte reg): + // #define BR86_IS_BCJ_BYTE(n) (((*p ^ 0xe8) & 0xfe) == 0) +#endif + +static +Z7_FORCE_INLINE +Z7_ATTRIB_NO_VECTOR +Byte *Z7_BRANCH_CONV_ST(X86)(Byte *p, SizeT size, UInt32 pc, UInt32 *state, int encoding) { - SizeT pos = 0; - UInt32 mask = *state & 7; if (size < 5) - return 0; - size -= 4; - ip += 5; + return p; + { + // Byte *p = data; + const Byte *lim = p + size - 4; + unsigned mask = (unsigned)*state; // & 7; +#ifdef BR_CONV_USE_OPT_PC_PTR + /* if BR_CONV_USE_OPT_PC_PTR is defined: we need to adjust (pc) for (+4), + because call/jump offset is relative to the next instruction. + if BR_CONV_USE_OPT_PC_PTR is not defined : we don't need to adjust (pc) for (+4), + because BR_PC_GET uses (pc - (lim - p)), and lim was adjusted for (-4) before. 
+ */ + pc += 4; +#endif + BR_PC_INIT + goto start; - for (;;) + for (;; mask |= 4) { - Byte *p = data + pos; - const Byte *limit = data + size; - for (; p < limit; p++) - if ((*p & 0xFE) == 0xE8) - break; - + // cont: mask |= 4; + start: + if (p >= lim) + goto fin; { - SizeT d = (SizeT)(p - data - pos); - pos = (SizeT)(p - data); - if (p >= limit) - { - *state = (d > 2 ? 0 : mask >> (unsigned)d); - return pos; - } - if (d > 2) - mask = 0; - else - { - mask >>= (unsigned)d; - if (mask != 0 && (mask > 4 || mask == 3 || Test86MSByte(p[(size_t)(mask >> 1) + 1]))) - { - mask = (mask >> 1) | 4; - pos++; - continue; - } - } + BR86_PREPARE_BCJ_SCAN + p += 4; + if (BR86_IS_BCJ_BYTE(0)) { goto m0; } mask >>= 1; + if (BR86_IS_BCJ_BYTE(1)) { goto m1; } mask >>= 1; + if (BR86_IS_BCJ_BYTE(2)) { goto m2; } mask = 0; + if (BR86_IS_BCJ_BYTE(3)) { goto a3; } } + goto main_loop; - if (Test86MSByte(p[4])) + m0: p--; + m1: p--; + m2: p--; + if (mask == 0) + goto a3; + if (p > lim) + goto fin_p; + + // if (((0x17u >> mask) & 1) == 0) + if (mask > 4 || mask == 3) + { + mask >>= 1; + continue; // goto cont; + } + mask >>= 1; + if (BR86_NEED_CONV_FOR_MS_BYTE(p[mask])) + continue; // goto cont; + // if (!BR86_NEED_CONV_FOR_MS_BYTE(p[3])) continue; // goto cont; { - UInt32 v = ((UInt32)p[4] << 24) | ((UInt32)p[3] << 16) | ((UInt32)p[2] << 8) | ((UInt32)p[1]); - UInt32 cur = ip + (UInt32)pos; - pos += 5; - if (encoding) - v += cur; - else - v -= cur; - if (mask != 0) + UInt32 v = GetUi32(p); + UInt32 c; + v += (1 << 24); if (v & 0xfe000000) continue; // goto cont; + c = BR_PC_GET; + BR_CONVERT_VAL(v, c) { - unsigned sh = (mask & 6) << 2; - if (Test86MSByte((Byte)(v >> sh))) + mask <<= 3; + if (BR86_NEED_CONV_FOR_MS_BYTE(v >> mask)) { - v ^= (((UInt32)0x100 << sh) - 1); - if (encoding) - v += cur; - else - v -= cur; + v ^= (((UInt32)0x100 << mask) - 1); + #ifdef MY_CPU_X86 + // for X86 : we can recalculate (c) to reduce register pressure + c = BR_PC_GET; + #endif + BR_CONVERT_VAL(v, c) } mask = 0; } - p[1] = (Byte)v; - p[2] = (Byte)(v >> 8); - p[3] = (Byte)(v >> 16); - p[4] = (Byte)(0 - ((v >> 24) & 1)); + // v = (v & ((1 << 24) - 1)) - (v & (1 << 24)); + v &= (1 << 25) - 1; v -= (1 << 24); + SetUi32(p, v) + p += 4; + goto main_loop; } - else + + main_loop: + if (p >= lim) + goto fin; + for (;;) { - mask = (mask >> 1) | 4; - pos++; + BR86_PREPARE_BCJ_SCAN + p += 4; + if (BR86_IS_BCJ_BYTE(0)) { goto a0; } + if (BR86_IS_BCJ_BYTE(1)) { goto a1; } + if (BR86_IS_BCJ_BYTE(2)) { goto a2; } + if (BR86_IS_BCJ_BYTE(3)) { goto a3; } + if (p >= lim) + goto fin; + } + + a0: p--; + a1: p--; + a2: p--; + a3: + if (p > lim) + goto fin_p; + // if (!BR86_NEED_CONV_FOR_MS_BYTE(p[3])) continue; // goto cont; + { + UInt32 v = GetUi32(p); + UInt32 c; + v += (1 << 24); if (v & 0xfe000000) continue; // goto cont; + c = BR_PC_GET; + BR_CONVERT_VAL(v, c) + // v = (v & ((1 << 24) - 1)) - (v & (1 << 24)); + v &= (1 << 25) - 1; v -= (1 << 24); + SetUi32(p, v) + p += 4; + goto main_loop; } } + +fin_p: + p--; +fin: + // the following processing for tail is optional and can be commented + /* + lim += 4; + for (; p < lim; p++, mask >>= 1) + if ((*p & 0xfe) == 0xe8) + break; + */ + *state = (UInt32)mask; + return p; + } } + + +#define Z7_BRANCH_CONV_ST_FUNC_IMP(name, m, encoding) \ +Z7_NO_INLINE \ +Z7_ATTRIB_NO_VECTOR \ +Byte *m(name)(Byte *data, SizeT size, UInt32 pc, UInt32 *state) \ + { return Z7_BRANCH_CONV_ST(name)(data, size, pc, state, encoding); } + +Z7_BRANCH_CONV_ST_FUNC_IMP(X86, Z7_BRANCH_CONV_ST_DEC, 0) +#ifndef Z7_EXTRACT_ONLY 
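The x86 converter above is heavily optimized; here is a deliberately simplified, self-contained sketch of the same BCJ idea in portable C: scan for the 0xE8 near-CALL opcode and convert the 32-bit little-endian displacement that follows it between relative and absolute form. It omits the state mask, the most-significant-byte heuristics and the four-bytes-at-a-time scan that the real converter uses, so it illustrates the principle rather than replacing the SDK code; toy_bcj_x86 is not an SDK function.

#include <stddef.h>
#include <stdint.h>

static void toy_bcj_x86(uint8_t *data, size_t size, uint32_t pc, int encoding)
{
  size_t i = 0;
  while (i + 5 <= size)
  {
    if (data[i] != 0xE8)            /* 0xE8 = near CALL with 32-bit displacement */
    {
      i++;
      continue;
    }
    {
      uint32_t v = (uint32_t)data[i + 1]
                 | ((uint32_t)data[i + 2] << 8)
                 | ((uint32_t)data[i + 3] << 16)
                 | ((uint32_t)data[i + 4] << 24);
      /* the displacement is relative to the next instruction */
      uint32_t next = pc + (uint32_t)i + 5;
      if (encoding) v += next;      /* relative -> absolute before compression */
      else          v -= next;      /* absolute -> relative after decompression */
      data[i + 1] = (uint8_t)v;
      data[i + 2] = (uint8_t)(v >> 8);
      data[i + 3] = (uint8_t)(v >> 16);
      data[i + 4] = (uint8_t)(v >> 24);
      i += 5;
    }
  }
}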
+Z7_BRANCH_CONV_ST_FUNC_IMP(X86, Z7_BRANCH_CONV_ST_ENC, 1) +#endif diff --git a/src/sdk/C/BraIA64.c b/src/sdk/C/BraIA64.c index d1dbc62..9dfe3e2 100644 --- a/src/sdk/C/BraIA64.c +++ b/src/sdk/C/BraIA64.c @@ -1,53 +1,14 @@ /* BraIA64.c -- Converter for IA-64 code -2017-01-26 : Igor Pavlov : Public domain */ +2023-02-20 : Igor Pavlov : Public domain */ #include "Precomp.h" -#include "CpuArch.h" -#include "Bra.h" +// the code was moved to Bra.c -SizeT IA64_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) -{ - SizeT i; - if (size < 16) - return 0; - size -= 16; - i = 0; - do - { - unsigned m = ((UInt32)0x334B0000 >> (data[i] & 0x1E)) & 3; - if (m) - { - m++; - do - { - Byte *p = data + (i + (size_t)m * 5 - 8); - if (((p[3] >> m) & 15) == 5 - && (((p[-1] | ((UInt32)p[0] << 8)) >> m) & 0x70) == 0) - { - unsigned raw = GetUi32(p); - unsigned v = raw >> m; - v = (v & 0xFFFFF) | ((v & (1 << 23)) >> 3); - - v <<= 4; - if (encoding) - v += ip + (UInt32)i; - else - v -= ip + (UInt32)i; - v >>= 4; - - v &= 0x1FFFFF; - v += 0x700000; - v &= 0x8FFFFF; - raw &= ~((UInt32)0x8FFFFF << m); - raw |= (v << m); - SetUi32(p, raw); - } - } - while (++m <= 4); - } - i += 16; - } - while (i <= size); - return i; -} +#ifdef _MSC_VER +#pragma warning(disable : 4206) // nonstandard extension used : translation unit is empty +#endif + +#if defined(__clang__) +#pragma GCC diagnostic ignored "-Wempty-translation-unit" +#endif diff --git a/src/sdk/C/Compiler.h b/src/sdk/C/Compiler.h index 0cc409d..b266b27 100644 --- a/src/sdk/C/Compiler.h +++ b/src/sdk/C/Compiler.h @@ -1,8 +1,105 @@ -/* Compiler.h -2017-04-03 : Igor Pavlov : Public domain */ +/* Compiler.h : Compiler specific defines and pragmas +: Igor Pavlov : Public domain */ + +#ifndef ZIP7_INC_COMPILER_H +#define ZIP7_INC_COMPILER_H + +#if defined(__clang__) +# define Z7_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) +#endif +#if defined(__clang__) && defined(__apple_build_version__) +# define Z7_APPLE_CLANG_VERSION Z7_CLANG_VERSION +#elif defined(__clang__) +# define Z7_LLVM_CLANG_VERSION Z7_CLANG_VERSION +#elif defined(__GNUC__) +# define Z7_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#endif + +#ifdef _MSC_VER +#if !defined(__clang__) && !defined(__GNUC__) +#define Z7_MSC_VER_ORIGINAL _MSC_VER +#endif +#endif + +#if defined(__MINGW32__) || defined(__MINGW64__) +#define Z7_MINGW +#endif + +#if defined(__LCC__) && (defined(__MCST__) || defined(__e2k__)) +#define Z7_MCST_LCC +#define Z7_MCST_LCC_VERSION (__LCC__ * 100 + __LCC_MINOR__) +#endif + +/* +#if defined(__AVX2__) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ + || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \ + || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \ + || defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \ + || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400) + #define Z7_COMPILER_AVX2_SUPPORTED + #endif +#endif +*/ + +// #pragma GCC diagnostic ignored "-Wunknown-pragmas" + +#ifdef __clang__ +// padding size of '' with 4 bytes to alignment boundary +#pragma GCC diagnostic ignored "-Wpadded" + +#if defined(Z7_LLVM_CLANG_VERSION) && (__clang_major__ == 13) \ + && defined(__FreeBSD__) +// freebsd: +#pragma GCC diagnostic ignored "-Wexcess-padding" +#endif + +#if __clang_major__ >= 16 +#pragma GCC diagnostic ignored "-Wunsafe-buffer-usage" +#endif + +#if __clang_major__ == 13 +#if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) +// 
cheri +#pragma GCC diagnostic ignored "-Wcapability-to-integer-cast" +#endif +#endif + +#if __clang_major__ == 13 + // for + #pragma GCC diagnostic ignored "-Wreserved-identifier" +#endif + +#endif // __clang__ + +#if defined(_WIN32) && defined(__clang__) && __clang_major__ >= 16 +// #pragma GCC diagnostic ignored "-Wcast-function-type-strict" +#define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION \ + _Pragma("GCC diagnostic ignored \"-Wcast-function-type-strict\"") +#else +#define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION +#endif + +typedef void (*Z7_void_Function)(void); +#if defined(__clang__) || defined(__GNUC__) +#define Z7_CAST_FUNC_C (Z7_void_Function) +#elif defined(_MSC_VER) && _MSC_VER > 1920 +#define Z7_CAST_FUNC_C (void *) +// #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' +#else +#define Z7_CAST_FUNC_C +#endif +/* +#if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__) + // #pragma GCC diagnostic ignored "-Wcast-function-type" +#endif +*/ +#ifdef __GNUC__ +#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40000) && (Z7_GCC_VERSION < 70000) +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif +#endif -#ifndef __7Z_COMPILER_H -#define __7Z_COMPILER_H #ifdef _MSC_VER @@ -13,18 +110,134 @@ #pragma warning(disable : 4214) // nonstandard extension used : bit field types other than int #endif - #if _MSC_VER >= 1300 - #pragma warning(disable : 4996) // This function or variable may be unsafe - #else - #pragma warning(disable : 4511) // copy constructor could not be generated - #pragma warning(disable : 4512) // assignment operator could not be generated - #pragma warning(disable : 4514) // unreferenced inline function has been removed - #pragma warning(disable : 4702) // unreachable code - #pragma warning(disable : 4710) // not inlined - #pragma warning(disable : 4714) // function marked as __forceinline not inlined - #pragma warning(disable : 4786) // identifier was truncated to '255' characters in the debug information - #endif +#if defined(_MSC_VER) && _MSC_VER >= 1800 +#pragma warning(disable : 4464) // relative include path contains '..' 
+#endif + +// == 1200 : -O1 : for __forceinline +// >= 1900 : -O1 : for printf +#pragma warning(disable : 4710) // function not inlined + +#if _MSC_VER < 1900 +// winnt.h: 'Int64ShllMod32' +#pragma warning(disable : 4514) // unreferenced inline function has been removed +#endif + +#if _MSC_VER < 1300 +// #pragma warning(disable : 4702) // unreachable code +// Bra.c : -O1: +#pragma warning(disable : 4714) // function marked as __forceinline not inlined +#endif + +/* +#if _MSC_VER > 1400 && _MSC_VER <= 1900 +// strcat: This function or variable may be unsafe +// sysinfoapi.h: kit10: GetVersion was declared deprecated +#pragma warning(disable : 4996) +#endif +*/ + +#if _MSC_VER > 1200 +// -Wall warnings + +#pragma warning(disable : 4711) // function selected for automatic inline expansion +#pragma warning(disable : 4820) // '2' bytes padding added after data member + +#if _MSC_VER >= 1400 && _MSC_VER < 1920 +// 1400: string.h: _DBG_MEMCPY_INLINE_ +// 1600 - 191x : smmintrin.h __cplusplus' +// is not defined as a preprocessor macro, replacing with '0' for '#if/#elif' +#pragma warning(disable : 4668) + +// 1400 - 1600 : WinDef.h : 'FARPROC' : +// 1900 - 191x : immintrin.h: _readfsbase_u32 +// no function prototype given : converting '()' to '(void)' +#pragma warning(disable : 4255) +#endif + +#if _MSC_VER >= 1914 +// Compiler will insert Spectre mitigation for memory load if /Qspectre switch specified +#pragma warning(disable : 5045) +#endif + +#endif // _MSC_VER > 1200 +#endif // _MSC_VER + + +#if defined(__clang__) && (__clang_major__ >= 4) + #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \ + _Pragma("clang loop unroll(disable)") \ + _Pragma("clang loop vectorize(disable)") + #define Z7_ATTRIB_NO_VECTORIZE +#elif defined(__GNUC__) && (__GNUC__ >= 5) \ + && (!defined(Z7_MCST_LCC_VERSION) || (Z7_MCST_LCC_VERSION >= 12610)) + #define Z7_ATTRIB_NO_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) + // __attribute__((optimize("no-unroll-loops"))); + #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE +#elif defined(_MSC_VER) && (_MSC_VER >= 1920) + #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \ + _Pragma("loop( no_vector )") + #define Z7_ATTRIB_NO_VECTORIZE +#else + #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + #define Z7_ATTRIB_NO_VECTORIZE +#endif + +#if defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1920) + #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE _Pragma("optimize ( \"s\", on )") + #define Z7_PRAGMA_OPTIMIZE_DEFAULT _Pragma("optimize ( \"\", on )") +#else + #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE + #define Z7_PRAGMA_OPTIMIZE_DEFAULT +#endif + + + +#if defined(MY_CPU_X86_OR_AMD64) && ( \ + defined(__clang__) && (__clang_major__ >= 4) \ + || defined(__GNUC__) && (__GNUC__ >= 5)) + #define Z7_ATTRIB_NO_SSE __attribute__((__target__("no-sse"))) +#else + #define Z7_ATTRIB_NO_SSE +#endif + +#define Z7_ATTRIB_NO_VECTOR \ + Z7_ATTRIB_NO_VECTORIZE \ + Z7_ATTRIB_NO_SSE + + +#if defined(__clang__) && (__clang_major__ >= 8) \ + || defined(__GNUC__) && (__GNUC__ >= 1000) \ + /* || defined(_MSC_VER) && (_MSC_VER >= 1920) */ + // GCC is not good for __builtin_expect() + #define Z7_LIKELY(x) (__builtin_expect((x), 1)) + #define Z7_UNLIKELY(x) (__builtin_expect((x), 0)) + // #define Z7_unlikely [[unlikely]] + // #define Z7_likely [[likely]] +#else + #define Z7_LIKELY(x) (x) + #define Z7_UNLIKELY(x) (x) + // #define Z7_likely +#endif + + +#if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30600)) + +#if (Z7_CLANG_VERSION < 130000) +#define 
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wreserved-id-macro\"") +#else +#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wreserved-macro-identifier\"") +#endif +#define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER \ + _Pragma("GCC diagnostic pop") +#else +#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +#define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER #endif #define UNUSED_VAR(x) (void)x; diff --git a/src/sdk/C/CpuArch.c b/src/sdk/C/CpuArch.c index 02e482e..6e02551 100644 --- a/src/sdk/C/CpuArch.c +++ b/src/sdk/C/CpuArch.c @@ -1,144 +1,357 @@ /* CpuArch.c -- CPU specific code -2018-02-18: Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #include "Precomp.h" +// #include + #include "CpuArch.h" #ifdef MY_CPU_X86_OR_AMD64 -#if (defined(_MSC_VER) && !defined(MY_CPU_AMD64)) || defined(__GNUC__) -#define USE_ASM +#undef NEED_CHECK_FOR_CPUID +#if !defined(MY_CPU_AMD64) +#define NEED_CHECK_FOR_CPUID #endif -#if !defined(USE_ASM) && _MSC_VER >= 1500 -#include +/* + cpuid instruction supports (subFunction) parameter in ECX, + that is used only with some specific (function) parameter values. + most functions use only (subFunction==0). +*/ +/* + __cpuid(): MSVC and GCC/CLANG use same function/macro name + but parameters are different. + We use MSVC __cpuid() parameters style for our z7_x86_cpuid() function. +*/ + +#if defined(__GNUC__) /* && (__GNUC__ >= 10) */ \ + || defined(__clang__) /* && (__clang_major__ >= 10) */ + +/* there was some CLANG/GCC compilers that have issues with + rbx(ebx) handling in asm blocks in -fPIC mode (__PIC__ is defined). + compiler's contains the macro __cpuid() that is similar to our code. + The history of __cpuid() changes in CLANG/GCC: + GCC: + 2007: it preserved ebx for (__PIC__ && __i386__) + 2013: it preserved rbx and ebx for __PIC__ + 2014: it doesn't preserves rbx and ebx anymore + we suppose that (__GNUC__ >= 5) fixed that __PIC__ ebx/rbx problem. + CLANG: + 2014+: it preserves rbx, but only for 64-bit code. No __PIC__ check. + Why CLANG cares about 64-bit mode only, and doesn't care about ebx (in 32-bit)? + Do we need __PIC__ test for CLANG or we must care about rbx even if + __PIC__ is not defined? +*/ + +#define ASM_LN "\n" + +#if defined(MY_CPU_AMD64) && defined(__PIC__) \ + && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) + + /* "=&r" selects free register. It can select even rbx, if that register is free. + "=&D" for (RDI) also works, but the code can be larger with "=&D" + "2"(subFun) : 2 is (zero-based) index in the output constraint list "=c" (ECX). 
*/ + +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ + __asm__ __volatile__ ( \ + ASM_LN "mov %%rbx, %q1" \ + ASM_LN "cpuid" \ + ASM_LN "xchg %%rbx, %q1" \ + : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } + +#elif defined(MY_CPU_X86) && defined(__PIC__) \ + && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) + +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ + __asm__ __volatile__ ( \ + ASM_LN "mov %%ebx, %k1" \ + ASM_LN "cpuid" \ + ASM_LN "xchg %%ebx, %k1" \ + : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } + +#else + +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ + __asm__ __volatile__ ( \ + ASM_LN "cpuid" \ + : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } + #endif -#if defined(USE_ASM) && !defined(MY_CPU_AMD64) -static UInt32 CheckFlag(UInt32 flag) +#define x86_cpuid_MACRO(p, func) x86_cpuid_MACRO_2(p, func, 0) + +void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) { - #ifdef _MSC_VER - __asm pushfd; - __asm pop EAX; - __asm mov EDX, EAX; - __asm xor EAX, flag; - __asm push EAX; - __asm popfd; - __asm pushfd; - __asm pop EAX; - __asm xor EAX, EDX; - __asm push EDX; - __asm popfd; - __asm and flag, EAX; - #else - __asm__ __volatile__ ( - "pushf\n\t" - "pop %%EAX\n\t" - "movl %%EAX,%%EDX\n\t" - "xorl %0,%%EAX\n\t" - "push %%EAX\n\t" - "popf\n\t" - "pushf\n\t" - "pop %%EAX\n\t" - "xorl %%EDX,%%EAX\n\t" - "push %%EDX\n\t" - "popf\n\t" - "andl %%EAX, %0\n\t": - "=c" (flag) : "c" (flag) : - "%eax", "%edx"); - #endif - return flag; + x86_cpuid_MACRO(p, func) } -#define CHECK_CPUID_IS_SUPPORTED if (CheckFlag(1 << 18) == 0 || CheckFlag(1 << 21) == 0) return False; -#else -#define CHECK_CPUID_IS_SUPPORTED -#endif -void MyCPUID(UInt32 function, UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d) +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) { - #ifdef USE_ASM + x86_cpuid_MACRO_2(p, func, subFunc) +} - #ifdef _MSC_VER - UInt32 a2, b2, c2, d2; - __asm xor EBX, EBX; - __asm xor ECX, ECX; - __asm xor EDX, EDX; - __asm mov EAX, function; - __asm cpuid; - __asm mov a2, EAX; - __asm mov b2, EBX; - __asm mov c2, ECX; - __asm mov d2, EDX; +Z7_NO_INLINE +UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) +{ + #if defined(NEED_CHECK_FOR_CPUID) + #define EFALGS_CPUID_BIT 21 + UInt32 a; + __asm__ __volatile__ ( + ASM_LN "pushf" + ASM_LN "pushf" + ASM_LN "pop %0" + // ASM_LN "movl %0, %1" + // ASM_LN "xorl $0x200000, %0" + ASM_LN "btc %1, %0" + ASM_LN "push %0" + ASM_LN "popf" + ASM_LN "pushf" + ASM_LN "pop %0" + ASM_LN "xorl (%%esp), %0" - *a = a2; - *b = b2; - *c = c2; - *d = d2; + ASM_LN "popf" + ASM_LN + : "=&r" (a) // "=a" + : "i" (EFALGS_CPUID_BIT) + ); + if ((a & (1 << EFALGS_CPUID_BIT)) == 0) + return 0; + #endif + { + UInt32 p[4]; + x86_cpuid_MACRO(p, 0) + return p[0]; + } +} - #else +#undef ASM_LN - __asm__ __volatile__ ( - #if defined(MY_CPU_AMD64) && defined(__PIC__) - "mov %%rbx, %%rdi;" - "cpuid;" - "xchg %%rbx, %%rdi;" - : "=a" (*a) , - "=D" (*b) , - #elif defined(MY_CPU_X86) && defined(__PIC__) - "mov %%ebx, %%edi;" - "cpuid;" - "xchgl %%ebx, %%edi;" - : "=a" (*a) , - "=D" (*b) , - #else - "cpuid" - : "=a" (*a) , - "=b" (*b) , - #endif - "=c" (*c) , - "=d" (*d) - : "0" (function)) ; +#elif !defined(_MSC_VER) - #endif - - #else +/* +// for gcc/clang and other: we can try to use __cpuid macro: +#include +void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) +{ + __cpuid(func, p[0], p[1], p[2], p[3]); +} 
+UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) +{ + return (UInt32)__get_cpuid_max(0, NULL); +} +*/ +// for unsupported cpuid: +void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) +{ + UNUSED_VAR(func) + p[0] = p[1] = p[2] = p[3] = 0; +} +UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) +{ + return 0; +} + +#else // _MSC_VER - int CPUInfo[4]; - __cpuid(CPUInfo, function); - *a = CPUInfo[0]; - *b = CPUInfo[1]; - *c = CPUInfo[2]; - *d = CPUInfo[3]; +#if !defined(MY_CPU_AMD64) +UInt32 __declspec(naked) Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) +{ + #if defined(NEED_CHECK_FOR_CPUID) + #define EFALGS_CPUID_BIT 21 + __asm pushfd + __asm pushfd + /* + __asm pop eax + // __asm mov edx, eax + __asm btc eax, EFALGS_CPUID_BIT + __asm push eax + */ + __asm btc dword ptr [esp], EFALGS_CPUID_BIT + __asm popfd + __asm pushfd + __asm pop eax + // __asm xor eax, edx + __asm xor eax, [esp] + // __asm push edx + __asm popfd + __asm and eax, (1 shl EFALGS_CPUID_BIT) + __asm jz end_func + #endif + __asm push ebx + __asm xor eax, eax // func + __asm xor ecx, ecx // subFunction (optional) for (func == 0) + __asm cpuid + __asm pop ebx + #if defined(NEED_CHECK_FOR_CPUID) + end_func: #endif + __asm ret 0 } -BoolInt x86cpuid_CheckAndRead(Cx86cpuid *p) +void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) +{ + UNUSED_VAR(p) + UNUSED_VAR(func) + __asm push ebx + __asm push edi + __asm mov edi, ecx // p + __asm mov eax, edx // func + __asm xor ecx, ecx // subfunction (optional) for (func == 0) + __asm cpuid + __asm mov [edi ], eax + __asm mov [edi + 4], ebx + __asm mov [edi + 8], ecx + __asm mov [edi + 12], edx + __asm pop edi + __asm pop ebx + __asm ret 0 +} + +static +void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + UNUSED_VAR(p) + UNUSED_VAR(func) + UNUSED_VAR(subFunc) + __asm push ebx + __asm push edi + __asm mov edi, ecx // p + __asm mov eax, edx // func + __asm mov ecx, [esp + 12] // subFunc + __asm cpuid + __asm mov [edi ], eax + __asm mov [edi + 4], ebx + __asm mov [edi + 8], ecx + __asm mov [edi + 12], edx + __asm pop edi + __asm pop ebx + __asm ret 4 +} + +#else // MY_CPU_AMD64 + + #if _MSC_VER >= 1600 + #include + #define MY_cpuidex __cpuidex + +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + __cpuidex((int *)p, func, subFunc); +} + + #else +/* + __cpuid (func == (0 or 7)) requires subfunction number in ECX. + MSDN: The __cpuid intrinsic clears the ECX register before calling the cpuid instruction. + __cpuid() in new MSVC clears ECX. + __cpuid() in old MSVC (14.00) x64 doesn't clear ECX + We still can use __cpuid for low (func) values that don't require ECX, + but __cpuid() in old MSVC will be incorrect for some func values: (func == 7). + So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction, + where ECX value is first parameter for FASTCALL / NO_INLINE func. + So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and + old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value. + +DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!! 
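A small usage sketch: because z7_x86_cpuid() follows the MSVC register layout (p[0..3] = EAX, EBX, ECX, EDX), the leaf-0 vendor string is assembled from p[1], p[3], p[2], and leaf-1 feature flags are read from p[2]/p[3] (the SSE2 test further down, for instance, checks EDX bit 26). This assumes the z7_x86_cpuid* prototypes are exposed through CpuArch.h, as in the SDK headers; print_cpu_vendor_and_sse2 is only an illustrative name.

#include <stdio.h>
#include <string.h>
#include "CpuArch.h"

static void print_cpu_vendor_and_sse2(void)
{
  UInt32 p[4];
  char vendor[13];
  if (z7_x86_cpuid_GetMaxFunc() == 0)
    return;                        /* CPUID instruction not available */
  z7_x86_cpuid(p, 0);
  memcpy(vendor + 0, &p[1], 4);    /* EBX : e.g. "Genu" */
  memcpy(vendor + 4, &p[3], 4);    /* EDX : e.g. "ineI" */
  memcpy(vendor + 8, &p[2], 4);    /* ECX : e.g. "ntel" */
  vendor[12] = 0;
  z7_x86_cpuid(p, 1);
  printf("vendor=%s sse2=%u\n", vendor, (unsigned)((p[3] >> 26) & 1));
}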
+*/ +static +Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int32 *CPUInfo) +{ + UNUSED_VAR(subFunction) + __cpuid(CPUInfo, func); +} + #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info) + #pragma message("======== MY_cpuidex_HACK WAS USED ========") +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + MY_cpuidex_HACK(subFunc, func, (Int32 *)p); +} + #endif // _MSC_VER >= 1600 + +#if !defined(MY_CPU_AMD64) +/* inlining for __cpuid() in MSVC x86 (32-bit) produces big ineffective code, + so we disable inlining here */ +Z7_NO_INLINE +#endif +void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) +{ + MY_cpuidex((Int32 *)p, (Int32)func, 0); +} + +Z7_NO_INLINE +UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) +{ + Int32 a[4]; + MY_cpuidex(a, 0, 0); + return a[0]; +} + +#endif // MY_CPU_AMD64 +#endif // _MSC_VER + +#if defined(NEED_CHECK_FOR_CPUID) +#define CHECK_CPUID_IS_SUPPORTED { if (z7_x86_cpuid_GetMaxFunc() == 0) return 0; } +#else +#define CHECK_CPUID_IS_SUPPORTED +#endif +#undef NEED_CHECK_FOR_CPUID + + +static +BoolInt x86cpuid_Func_1(UInt32 *p) { CHECK_CPUID_IS_SUPPORTED - MyCPUID(0, &p->maxFunc, &p->vendor[0], &p->vendor[2], &p->vendor[1]); - MyCPUID(1, &p->ver, &p->b, &p->c, &p->d); + z7_x86_cpuid(p, 1); return True; } -static const UInt32 kVendors[][3] = +/* +static const UInt32 kVendors[][1] = { - { 0x756E6547, 0x49656E69, 0x6C65746E}, - { 0x68747541, 0x69746E65, 0x444D4163}, - { 0x746E6543, 0x48727561, 0x736C7561} + { 0x756E6547 }, // , 0x49656E69, 0x6C65746E }, + { 0x68747541 }, // , 0x69746E65, 0x444D4163 }, + { 0x746E6543 } // , 0x48727561, 0x736C7561 } }; +*/ + +/* +typedef struct +{ + UInt32 maxFunc; + UInt32 vendor[3]; + UInt32 ver; + UInt32 b; + UInt32 c; + UInt32 d; +} Cx86cpuid; + +enum +{ + CPU_FIRM_INTEL, + CPU_FIRM_AMD, + CPU_FIRM_VIA +}; +int x86cpuid_GetFirm(const Cx86cpuid *p); +#define x86cpuid_ver_GetFamily(ver) (((ver >> 16) & 0xff0) | ((ver >> 8) & 0xf)) +#define x86cpuid_ver_GetModel(ver) (((ver >> 12) & 0xf0) | ((ver >> 4) & 0xf)) +#define x86cpuid_ver_GetStepping(ver) (ver & 0xf) int x86cpuid_GetFirm(const Cx86cpuid *p) { unsigned i; - for (i = 0; i < sizeof(kVendors) / sizeof(kVendors[i]); i++) + for (i = 0; i < sizeof(kVendors) / sizeof(kVendors[0]); i++) { const UInt32 *v = kVendors[i]; - if (v[0] == p->vendor[0] && - v[1] == p->vendor[1] && - v[2] == p->vendor[2]) + if (v[0] == p->vendor[0] + // && v[1] == p->vendor[1] + // && v[2] == p->vendor[2] + ) return (int)i; } return -1; @@ -147,72 +360,611 @@ int x86cpuid_GetFirm(const Cx86cpuid *p) BoolInt CPU_Is_InOrder() { Cx86cpuid p; - int firm; UInt32 family, model; if (!x86cpuid_CheckAndRead(&p)) return True; - family = x86cpuid_GetFamily(p.ver); - model = x86cpuid_GetModel(p.ver); - - firm = x86cpuid_GetFirm(&p); + family = x86cpuid_ver_GetFamily(p.ver); + model = x86cpuid_ver_GetModel(p.ver); - switch (firm) + switch (x86cpuid_GetFirm(&p)) { case CPU_FIRM_INTEL: return (family < 6 || (family == 6 && ( - /* In-Order Atom CPU */ - model == 0x1C /* 45 nm, N4xx, D4xx, N5xx, D5xx, 230, 330 */ - || model == 0x26 /* 45 nm, Z6xx */ - || model == 0x27 /* 32 nm, Z2460 */ - || model == 0x35 /* 32 nm, Z2760 */ - || model == 0x36 /* 32 nm, N2xxx, D2xxx */ + // In-Order Atom CPU + model == 0x1C // 45 nm, N4xx, D4xx, N5xx, D5xx, 230, 330 + || model == 0x26 // 45 nm, Z6xx + || model == 0x27 // 32 nm, Z2460 + || model == 0x35 // 32 nm, Z2760 + || model == 0x36 // 32 nm, N2xxx, D2xxx ))); case CPU_FIRM_AMD: return (family < 5 || 
(family == 5 && (model < 6 || model == 0xA))); case CPU_FIRM_VIA: return (family < 6 || (family == 6 && model < 0xF)); } - return True; + return False; // v23 : unknown processors are not In-Order } +*/ + +#ifdef _WIN32 +#include "7zWindows.h" +#endif #if !defined(MY_CPU_AMD64) && defined(_WIN32) -#include -static BoolInt CPU_Sys_Is_SSE_Supported() + +/* for legacy SSE ia32: there is no user-space cpu instruction to check + that OS supports SSE register storing/restoring on context switches. + So we need some OS-specific function to check that it's safe to use SSE registers. +*/ + +Z7_FORCE_INLINE +static BoolInt CPU_Sys_Is_SSE_Supported(void) { - OSVERSIONINFO vi; - vi.dwOSVersionInfoSize = sizeof(vi); - if (!GetVersionEx(&vi)) - return False; - return (vi.dwMajorVersion >= 5); +#ifdef _MSC_VER + #pragma warning(push) + #pragma warning(disable : 4996) // `GetVersion': was declared deprecated +#endif + /* low byte is major version of Windows + We suppose that any Windows version since + Windows2000 (major == 5) supports SSE registers */ + return (Byte)GetVersion() >= 5; +#if defined(_MSC_VER) + #pragma warning(pop) +#endif } #define CHECK_SYS_SSE_SUPPORT if (!CPU_Sys_Is_SSE_Supported()) return False; #else #define CHECK_SYS_SSE_SUPPORT #endif -BoolInt CPU_Is_Aes_Supported() + +#if !defined(MY_CPU_AMD64) + +BoolInt CPU_IsSupported_CMOV(void) +{ + UInt32 a[4]; + if (!x86cpuid_Func_1(&a[0])) + return 0; + return (BoolInt)(a[3] >> 15) & 1; +} + +BoolInt CPU_IsSupported_SSE(void) +{ + UInt32 a[4]; + CHECK_SYS_SSE_SUPPORT + if (!x86cpuid_Func_1(&a[0])) + return 0; + return (BoolInt)(a[3] >> 25) & 1; +} + +BoolInt CPU_IsSupported_SSE2(void) +{ + UInt32 a[4]; + CHECK_SYS_SSE_SUPPORT + if (!x86cpuid_Func_1(&a[0])) + return 0; + return (BoolInt)(a[3] >> 26) & 1; +} + +#endif + + +static UInt32 x86cpuid_Func_1_ECX(void) { - Cx86cpuid p; + UInt32 a[4]; CHECK_SYS_SSE_SUPPORT - if (!x86cpuid_CheckAndRead(&p)) + if (!x86cpuid_Func_1(&a[0])) + return 0; + return a[2]; +} + +BoolInt CPU_IsSupported_AES(void) +{ + return (BoolInt)(x86cpuid_Func_1_ECX() >> 25) & 1; +} + +BoolInt CPU_IsSupported_SSSE3(void) +{ + return (BoolInt)(x86cpuid_Func_1_ECX() >> 9) & 1; +} + +BoolInt CPU_IsSupported_SSE41(void) +{ + return (BoolInt)(x86cpuid_Func_1_ECX() >> 19) & 1; +} + +BoolInt CPU_IsSupported_SHA(void) +{ + CHECK_SYS_SSE_SUPPORT + + if (z7_x86_cpuid_GetMaxFunc() < 7) return False; - return (p.c >> 25) & 1; + { + UInt32 d[4]; + z7_x86_cpuid(d, 7); + return (BoolInt)(d[1] >> 29) & 1; + } } -BoolInt CPU_IsSupported_PageGB() + +BoolInt CPU_IsSupported_SHA512(void) { - Cx86cpuid cpuid; - if (!x86cpuid_CheckAndRead(&cpuid)) + if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here + + if (z7_x86_cpuid_GetMaxFunc() < 7) return False; { - UInt32 d[4] = { 0 }; - MyCPUID(0x80000000, &d[0], &d[1], &d[2], &d[3]); + UInt32 d[4]; + z7_x86_cpuid_subFunc(d, 7, 0); + if (d[0] < 1) // d[0] - is max supported subleaf value + return False; + z7_x86_cpuid_subFunc(d, 7, 1); + return (BoolInt)(d[0]) & 1; + } +} + +/* +MSVC: _xgetbv() intrinsic is available since VS2010SP1. + MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in + that we can use or check. + For any 32-bit x86 we can use asm code in MSVC, + but MSVC asm code is huge after compilation. + So _xgetbv() is better + +ICC: _xgetbv() intrinsic is available (in what version of ICC?) + ICC defines (__GNUC___) and it supports gnu assembler + also ICC supports MASM style code with -use-msasm switch. 
+ but ICC doesn't support __attribute__((__target__)) + +GCC/CLANG 9: + _xgetbv() is macro that works via __builtin_ia32_xgetbv() + and we need __attribute__((__target__("xsave")). + But with __target__("xsave") the function will be not + inlined to function that has no __target__("xsave") attribute. + If we want _xgetbv() call inlining, then we should use asm version + instead of calling _xgetbv(). + Note:intrinsic is broke before GCC 8.2: + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85684 +*/ + +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1100) \ + || defined(_MSC_VER) && (_MSC_VER >= 1600) && (_MSC_FULL_VER >= 160040219) \ + || defined(__GNUC__) && (__GNUC__ >= 9) \ + || defined(__clang__) && (__clang_major__ >= 9) +// we define ATTRIB_XGETBV, if we want to use predefined _xgetbv() from compiler +#if defined(__INTEL_COMPILER) +#define ATTRIB_XGETBV +#elif defined(__GNUC__) || defined(__clang__) +// we don't define ATTRIB_XGETBV here, because asm version is better for inlining. +// #define ATTRIB_XGETBV __attribute__((__target__("xsave"))) +#else +#define ATTRIB_XGETBV +#endif +#endif + +#if defined(ATTRIB_XGETBV) +#include +#endif + + +// XFEATURE_ENABLED_MASK/XCR0 +#define MY_XCR_XFEATURE_ENABLED_MASK 0 + +#if defined(ATTRIB_XGETBV) +ATTRIB_XGETBV +#endif +static UInt64 x86_xgetbv_0(UInt32 num) +{ +#if defined(ATTRIB_XGETBV) + { + return + #if (defined(_MSC_VER)) + _xgetbv(num); + #else + __builtin_ia32_xgetbv( + #if !defined(__clang__) + (int) + #endif + num); + #endif + } + +#elif defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_CC) + + UInt32 a, d; + #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) + __asm__ + ( + "xgetbv" + : "=a"(a), "=d"(d) : "c"(num) : "cc" + ); + #else // is old gcc + __asm__ + ( + ".byte 0x0f, 0x01, 0xd0" "\n\t" + : "=a"(a), "=d"(d) : "c"(num) : "cc" + ); + #endif + return ((UInt64)d << 32) | a; + // return a; + +#elif defined(_MSC_VER) && !defined(MY_CPU_AMD64) + + UInt32 a, d; + __asm { + push eax + push edx + push ecx + mov ecx, num; + // xor ecx, ecx // = MY_XCR_XFEATURE_ENABLED_MASK + _emit 0x0f + _emit 0x01 + _emit 0xd0 + mov a, eax + mov d, edx + pop ecx + pop edx + pop eax + } + return ((UInt64)d << 32) | a; + // return a; + +#else // it's unknown compiler + // #error "Need xgetbv function" + UNUSED_VAR(num) + // for MSVC-X64 we could call external function from external file. + /* Actually we had checked OSXSAVE/AVX in cpuid before. + So it's expected that OS supports at least AVX and below. */ + // if (num != MY_XCR_XFEATURE_ENABLED_MASK) return 0; // if not XCR0 + return + // (1 << 0) | // x87 + (1 << 1) // SSE + | (1 << 2); // AVX + +#endif +} + +#ifdef _WIN32 +/* + Windows versions do not know about new ISA extensions that + can be introduced. 
But we still can use new extensions, + even if Windows doesn't report about supporting them, + But we can use new extensions, only if Windows knows about new ISA extension + that changes the number or size of registers: SSE, AVX/XSAVE, AVX512 + So it's enough to check + MY_PF_AVX_INSTRUCTIONS_AVAILABLE + instead of + MY_PF_AVX2_INSTRUCTIONS_AVAILABLE +*/ +#define MY_PF_XSAVE_ENABLED 17 +// #define MY_PF_SSSE3_INSTRUCTIONS_AVAILABLE 36 +// #define MY_PF_SSE4_1_INSTRUCTIONS_AVAILABLE 37 +// #define MY_PF_SSE4_2_INSTRUCTIONS_AVAILABLE 38 +// #define MY_PF_AVX_INSTRUCTIONS_AVAILABLE 39 +// #define MY_PF_AVX2_INSTRUCTIONS_AVAILABLE 40 +// #define MY_PF_AVX512F_INSTRUCTIONS_AVAILABLE 41 +#endif + +BoolInt CPU_IsSupported_AVX(void) +{ + #ifdef _WIN32 + if (!IsProcessorFeaturePresent(MY_PF_XSAVE_ENABLED)) + return False; + /* PF_AVX_INSTRUCTIONS_AVAILABLE probably is supported starting from + some latest Win10 revisions. But we need AVX in older Windows also. + So we don't use the following check: */ + /* + if (!IsProcessorFeaturePresent(MY_PF_AVX_INSTRUCTIONS_AVAILABLE)) + return False; + */ + #endif + + /* + OS must use new special XSAVE/XRSTOR instructions to save + AVX registers when it required for context switching. + At OS statring: + OS sets CR4.OSXSAVE flag to signal the processor that OS supports the XSAVE extensions. + Also OS sets bitmask in XCR0 register that defines what + registers will be processed by XSAVE instruction: + XCR0.SSE[bit 0] - x87 registers and state + XCR0.SSE[bit 1] - SSE registers and state + XCR0.AVX[bit 2] - AVX registers and state + CR4.OSXSAVE is reflected to CPUID.1:ECX.OSXSAVE[bit 27]. + So we can read that bit in user-space. + XCR0 is available for reading in user-space by new XGETBV instruction. + */ + { + const UInt32 c = x86cpuid_Func_1_ECX(); + if (0 == (1 + & (c >> 28) // AVX instructions are supported by hardware + & (c >> 27))) // OSXSAVE bit: XSAVE and related instructions are enabled by OS. + return False; + } + + /* also we can check + CPUID.1:ECX.XSAVE [bit 26] : that shows that + XSAVE, XRESTOR, XSETBV, XGETBV instructions are supported by hardware. + But that check is redundant, because if OSXSAVE bit is set, then XSAVE is also set */ + + /* If OS have enabled XSAVE extension instructions (OSXSAVE == 1), + in most cases we expect that OS also will support storing/restoring + for AVX and SSE states at least. + But to be ensure for that we call user-space instruction + XGETBV(0) to get XCR0 value that contains bitmask that defines + what exact states(registers) OS have enabled for storing/restoring. 
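These CPU_IsSupported_* helpers are meant to be called once and used to pick a code path at run time, along these lines. A sketch only: Sha256_Blocks_AVX2 and Sha256_Blocks_Scalar stand for a caller's own implementations, and the CPU_IsSupported_* and MY_CPU_* declarations are assumed to come from CpuArch.h as in the SDK headers.

#include <stddef.h>
#include "CpuArch.h"

/* placeholder implementations provided elsewhere by the caller */
extern void Sha256_Blocks_AVX2(UInt32 *state, const Byte *data, size_t numBlocks);
extern void Sha256_Blocks_Scalar(UInt32 *state, const Byte *data, size_t numBlocks);

static void Sha256_Blocks(UInt32 *state, const Byte *data, size_t numBlocks)
{
#ifdef MY_CPU_X86_OR_AMD64
  /* the check is cheap, but callers normally cache the result once at startup */
  if (CPU_IsSupported_AVX2())
  {
    Sha256_Blocks_AVX2(state, data, numBlocks);
    return;
  }
#endif
  Sha256_Blocks_Scalar(state, data, numBlocks);
}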
+ */ + + { + const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK); + // printf("\n=== XGetBV=0x%x\n", bm); + return 1 + & (BoolInt)(bm >> 1) // SSE state is supported (set by OS) for storing/restoring + & (BoolInt)(bm >> 2); // AVX state is supported (set by OS) for storing/restoring + } + // since Win7SP1: we can use GetEnabledXStateFeatures(); +} + + +BoolInt CPU_IsSupported_AVX2(void) +{ + if (!CPU_IsSupported_AVX()) + return False; + if (z7_x86_cpuid_GetMaxFunc() < 7) + return False; + { + UInt32 d[4]; + z7_x86_cpuid(d, 7); + // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); + return 1 + & (BoolInt)(d[1] >> 5); // avx2 + } +} + +#if 0 +BoolInt CPU_IsSupported_AVX512F_AVX512VL(void) +{ + if (!CPU_IsSupported_AVX()) + return False; + if (z7_x86_cpuid_GetMaxFunc() < 7) + return False; + { + UInt32 d[4]; + BoolInt v; + z7_x86_cpuid(d, 7); + // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); + v = 1 + & (BoolInt)(d[1] >> 16) // avx512f + & (BoolInt)(d[1] >> 31); // avx512vl + if (!v) + return False; + } + { + const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK); + // printf("\n=== XGetBV=0x%x\n", bm); + return 1 + & (BoolInt)(bm >> 5) // OPMASK + & (BoolInt)(bm >> 6) // ZMM upper 256-bit + & (BoolInt)(bm >> 7); // ZMM16 ... ZMM31 + } +} +#endif + +BoolInt CPU_IsSupported_VAES_AVX2(void) +{ + if (!CPU_IsSupported_AVX()) + return False; + if (z7_x86_cpuid_GetMaxFunc() < 7) + return False; + { + UInt32 d[4]; + z7_x86_cpuid(d, 7); + // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); + return 1 + & (BoolInt)(d[1] >> 5) // avx2 + // & (d[1] >> 31) // avx512vl + & (BoolInt)(d[2] >> 9); // vaes // VEX-256/EVEX + } +} + +BoolInt CPU_IsSupported_PageGB(void) +{ + CHECK_CPUID_IS_SUPPORTED + { + UInt32 d[4]; + z7_x86_cpuid(d, 0x80000000); if (d[0] < 0x80000001) return False; + z7_x86_cpuid(d, 0x80000001); + return (BoolInt)(d[3] >> 26) & 1; } +} + + +#elif defined(MY_CPU_ARM_OR_ARM64) + +#ifdef _WIN32 + +#include "7zWindows.h" + +BoolInt CPU_IsSupported_CRC32(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; } +BoolInt CPU_IsSupported_CRYPTO(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; } +BoolInt CPU_IsSupported_NEON(void) { return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) ? 
1 : 0; } + +#else + +#if defined(__APPLE__) + +/* +#include +#include +static void Print_sysctlbyname(const char *name) +{ + size_t bufSize = 256; + char buf[256]; + int res = sysctlbyname(name, &buf, &bufSize, NULL, 0); { - UInt32 d[4] = { 0 }; - MyCPUID(0x80000001, &d[0], &d[1], &d[2], &d[3]); - return (d[3] >> 26) & 1; + int i; + printf("\nres = %d : %s : '%s' : bufSize = %d, numeric", res, name, buf, (unsigned)bufSize); + for (i = 0; i < 20; i++) + printf(" %2x", (unsigned)(Byte)buf[i]); + } } +*/ +/* + Print_sysctlbyname("hw.pagesize"); + Print_sysctlbyname("machdep.cpu.brand_string"); +*/ + +static BoolInt z7_sysctlbyname_Get_BoolInt(const char *name) +{ + UInt32 val = 0; + if (z7_sysctlbyname_Get_UInt32(name, &val) == 0 && val == 1) + return 1; + return 0; +} + +BoolInt CPU_IsSupported_CRC32(void) +{ + return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_crc32"); +} + +BoolInt CPU_IsSupported_NEON(void) +{ + return z7_sysctlbyname_Get_BoolInt("hw.optional.neon"); +} + +BoolInt CPU_IsSupported_SHA512(void) +{ + return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512"); +} + +/* +BoolInt CPU_IsSupported_SHA3(void) +{ + return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3"); +} +*/ + +#ifdef MY_CPU_ARM64 +#define APPLE_CRYPTO_SUPPORT_VAL 1 +#else +#define APPLE_CRYPTO_SUPPORT_VAL 0 +#endif + +BoolInt CPU_IsSupported_SHA1(void) { return APPLE_CRYPTO_SUPPORT_VAL; } +BoolInt CPU_IsSupported_SHA2(void) { return APPLE_CRYPTO_SUPPORT_VAL; } +BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; } + + +#else // __APPLE__ + +#if defined(__GLIBC__) && (__GLIBC__ * 100 + __GLIBC_MINOR__ >= 216) + #define Z7_GETAUXV_AVAILABLE +#else +// #pragma message("=== is not NEW GLIBC === ") + #if defined __has_include + #if __has_include () +// #pragma message("=== sys/auxv.h is avail=== ") + #define Z7_GETAUXV_AVAILABLE + #endif + #endif +#endif + +#ifdef Z7_GETAUXV_AVAILABLE +// #pragma message("=== Z7_GETAUXV_AVAILABLE === ") +#include +#define USE_HWCAP +#endif + +#ifdef USE_HWCAP + +#if defined(__FreeBSD__) +static unsigned long MY_getauxval(int aux) +{ + unsigned long val; + if (elf_aux_info(aux, &val, sizeof(val))) + return 0; + return val; +} +#else +#define MY_getauxval getauxval + #if defined __has_include + #if __has_include () +#include + #endif + #endif +#endif + + #define MY_HWCAP_CHECK_FUNC_2(name1, name2) \ + BoolInt CPU_IsSupported_ ## name1(void) { return (MY_getauxval(AT_HWCAP) & (HWCAP_ ## name2)); } + +#ifdef MY_CPU_ARM64 + #define MY_HWCAP_CHECK_FUNC(name) \ + MY_HWCAP_CHECK_FUNC_2(name, name) +#if 1 || defined(__ARM_NEON) + BoolInt CPU_IsSupported_NEON(void) { return True; } +#else + MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD) +#endif +// MY_HWCAP_CHECK_FUNC (ASIMD) +#elif defined(MY_CPU_ARM) + #define MY_HWCAP_CHECK_FUNC(name) \ + BoolInt CPU_IsSupported_ ## name(void) { return (MY_getauxval(AT_HWCAP2) & (HWCAP2_ ## name)); } + MY_HWCAP_CHECK_FUNC_2(NEON, NEON) +#endif + +#else // USE_HWCAP + + #define MY_HWCAP_CHECK_FUNC(name) \ + BoolInt CPU_IsSupported_ ## name(void) { return 0; } +#if defined(__ARM_NEON) + BoolInt CPU_IsSupported_NEON(void) { return True; } +#else + MY_HWCAP_CHECK_FUNC(NEON) +#endif + +#endif // USE_HWCAP + +MY_HWCAP_CHECK_FUNC (CRC32) +MY_HWCAP_CHECK_FUNC (SHA1) +MY_HWCAP_CHECK_FUNC (SHA2) +MY_HWCAP_CHECK_FUNC (AES) +#ifdef MY_CPU_ARM64 +// supports HWCAP_SHA512 and HWCAP_SHA3 since 2017. 
+// we define them here, if they are not defined +#ifndef HWCAP_SHA3 +// #define HWCAP_SHA3 (1 << 17) +#endif +#ifndef HWCAP_SHA512 +// #pragma message("=== HWCAP_SHA512 define === ") +#define HWCAP_SHA512 (1 << 21) +#endif +MY_HWCAP_CHECK_FUNC (SHA512) +// MY_HWCAP_CHECK_FUNC (SHA3) +#endif + +#endif // __APPLE__ +#endif // _WIN32 + +#endif // MY_CPU_ARM_OR_ARM64 + + + +#ifdef __APPLE__ + +#include + +int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize) +{ + return sysctlbyname(name, buf, bufSize, NULL, 0); +} + +int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val) +{ + size_t bufSize = sizeof(*val); + const int res = z7_sysctlbyname_Get(name, val, &bufSize); + if (res == 0 && bufSize != sizeof(*val)) + return EFAULT; + return res; +} #endif diff --git a/src/sdk/C/CpuArch.h b/src/sdk/C/CpuArch.h index f1edae3..1690a5b 100644 --- a/src/sdk/C/CpuArch.h +++ b/src/sdk/C/CpuArch.h @@ -1,8 +1,8 @@ /* CpuArch.h -- CPU specific code -2018-02-18 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ -#ifndef __CPU_ARCH_H -#define __CPU_ARCH_H +#ifndef ZIP7_INC_CPU_ARCH_H +#define ZIP7_INC_CPU_ARCH_H #include "7zTypes.h" @@ -14,8 +14,13 @@ MY_CPU_BE means that CPU is BIG ENDIAN. If MY_CPU_LE and MY_CPU_BE are not defined, we don't know about ENDIANNESS of platform. MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned memory accesses. + +MY_CPU_64BIT means that processor can work with 64-bit registers. + MY_CPU_64BIT can be used to select fast code branch + MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8) */ +#if !defined(_M_ARM64EC) #if defined(_M_X64) \ || defined(_M_AMD64) \ || defined(__x86_64__) \ @@ -24,27 +29,52 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #define MY_CPU_AMD64 #ifdef __ILP32__ #define MY_CPU_NAME "x32" + #define MY_CPU_SIZEOF_POINTER 4 #else #define MY_CPU_NAME "x64" + #define MY_CPU_SIZEOF_POINTER 8 #endif #define MY_CPU_64BIT #endif +#endif #if defined(_M_IX86) \ || defined(__i386__) #define MY_CPU_X86 #define MY_CPU_NAME "x86" - #define MY_CPU_32BIT + /* #define MY_CPU_32BIT */ + #define MY_CPU_SIZEOF_POINTER 4 +#endif + +#if defined(__SSE2__) \ + || defined(MY_CPU_AMD64) \ + || defined(_M_IX86_FP) && (_M_IX86_FP >= 2) +#define MY_CPU_SSE2 #endif #if defined(_M_ARM64) \ + || defined(_M_ARM64EC) \ || defined(__AARCH64EL__) \ || defined(__AARCH64EB__) \ || defined(__aarch64__) #define MY_CPU_ARM64 - #define MY_CPU_NAME "arm64" +#if defined(__ILP32__) \ + || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) + #define MY_CPU_NAME "arm64-32" + #define MY_CPU_SIZEOF_POINTER 4 +#elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) + #define MY_CPU_NAME "arm64-128" + #define MY_CPU_SIZEOF_POINTER 16 +#else +#if defined(_M_ARM64EC) + #define MY_CPU_NAME "arm64ec" +#else + #define MY_CPU_NAME "arm64" +#endif + #define MY_CPU_SIZEOF_POINTER 8 +#endif #define MY_CPU_64BIT #endif @@ -59,8 +89,16 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem || defined(__THUMBEL__) \ || defined(__THUMBEB__) #define MY_CPU_ARM - #define MY_CPU_NAME "arm" - #define MY_CPU_32BIT + + #if defined(__thumb__) || defined(__THUMBEL__) || defined(_M_ARMT) + #define MY_CPU_ARMT + #define MY_CPU_NAME "armt" + #else + #define MY_CPU_ARM32 + #define MY_CPU_NAME "arm" + #endif + /* #define MY_CPU_32BIT */ + #define MY_CPU_SIZEOF_POINTER 4 #endif @@ -84,26 +122,104 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #if 
defined(__ppc64__) \ - || defined(__powerpc64__) + || defined(__powerpc64__) \ + || defined(__ppc__) \ + || defined(__powerpc__) \ + || defined(__PPC__) \ + || defined(_POWER) + +#define MY_CPU_PPC_OR_PPC64 + +#if defined(__ppc64__) \ + || defined(__powerpc64__) \ + || defined(_LP64) \ + || defined(__64BIT__) #ifdef __ILP32__ #define MY_CPU_NAME "ppc64-32" + #define MY_CPU_SIZEOF_POINTER 4 #else #define MY_CPU_NAME "ppc64" + #define MY_CPU_SIZEOF_POINTER 8 #endif #define MY_CPU_64BIT -#elif defined(__ppc__) \ - || defined(__powerpc__) +#else #define MY_CPU_NAME "ppc" - #define MY_CPU_32BIT + #define MY_CPU_SIZEOF_POINTER 4 + /* #define MY_CPU_32BIT */ +#endif #endif -#if defined(__sparc64__) - #define MY_CPU_NAME "sparc64" +#if defined(__sparc__) \ + || defined(__sparc) + #define MY_CPU_SPARC + #if defined(__LP64__) \ + || defined(_LP64) \ + || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8) + #define MY_CPU_NAME "sparcv9" + #define MY_CPU_SIZEOF_POINTER 8 + #define MY_CPU_64BIT + #elif defined(__sparc_v9__) \ + || defined(__sparcv9) + #define MY_CPU_64BIT + #if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) + #define MY_CPU_NAME "sparcv9-32" + #else + #define MY_CPU_NAME "sparcv9m" + #endif + #elif defined(__sparc_v8__) \ + || defined(__sparcv8) + #define MY_CPU_NAME "sparcv8" + #define MY_CPU_SIZEOF_POINTER 4 + #else + #define MY_CPU_NAME "sparc" + #endif +#endif + + +#if defined(__riscv) \ + || defined(__riscv__) + #define MY_CPU_RISCV + #if __riscv_xlen == 32 + #define MY_CPU_NAME "riscv32" + #elif __riscv_xlen == 64 + #define MY_CPU_NAME "riscv64" + #else + #define MY_CPU_NAME "riscv" + #endif +#endif + + +#if defined(__loongarch__) + #define MY_CPU_LOONGARCH + #if defined(__loongarch64) || defined(__loongarch_grlen) && (__loongarch_grlen == 64) + #define MY_CPU_64BIT + #endif + #if defined(__loongarch64) + #define MY_CPU_NAME "loongarch64" + #define MY_CPU_LOONGARCH64 + #else + #define MY_CPU_NAME "loongarch" + #endif +#endif + + +// #undef MY_CPU_NAME +// #undef MY_CPU_SIZEOF_POINTER +// #define __e2k__ +// #define __SIZEOF_POINTER__ 4 +#if defined(__e2k__) + #define MY_CPU_E2K + #if defined(__ILP32__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) + #define MY_CPU_NAME "e2k-32" + #define MY_CPU_SIZEOF_POINTER 4 + #else + #define MY_CPU_NAME "e2k" + #if defined(__LP64__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8) + #define MY_CPU_SIZEOF_POINTER 8 + #endif + #endif #define MY_CPU_64BIT -#elif defined(__sparc__) - #define MY_CPU_NAME "sparc" - /* #define MY_CPU_32BIT */ #endif @@ -111,6 +227,10 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #define MY_CPU_X86_OR_AMD64 #endif +#if defined(MY_CPU_ARM) || defined(MY_CPU_ARM64) +#define MY_CPU_ARM_OR_ARM64 +#endif + #ifdef _WIN32 @@ -133,6 +253,7 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem || defined(MY_CPU_ARM_LE) \ || defined(MY_CPU_ARM64_LE) \ || defined(MY_CPU_IA64_LE) \ + || defined(_LITTLE_ENDIAN) \ || defined(__LITTLE_ENDIAN__) \ || defined(__ARMEL__) \ || defined(__THUMBEL__) \ @@ -165,13 +286,51 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #error Stop_Compiling_Bad_Endian #endif +#if !defined(MY_CPU_LE) && !defined(MY_CPU_BE) + #error Stop_Compiling_CPU_ENDIAN_must_be_detected_at_compile_time +#endif #if defined(MY_CPU_32BIT) && defined(MY_CPU_64BIT) #error Stop_Compiling_Bad_32_64_BIT #endif +#ifdef __SIZEOF_POINTER__ + #ifdef MY_CPU_SIZEOF_POINTER + #if 
MY_CPU_SIZEOF_POINTER != __SIZEOF_POINTER__ + #error Stop_Compiling_Bad_MY_CPU_PTR_SIZE + #endif + #else + #define MY_CPU_SIZEOF_POINTER __SIZEOF_POINTER__ + #endif +#endif + +#if defined(MY_CPU_SIZEOF_POINTER) && (MY_CPU_SIZEOF_POINTER == 4) +#if defined (_LP64) + #error Stop_Compiling_Bad_MY_CPU_PTR_SIZE +#endif +#endif + +#ifdef _MSC_VER + #if _MSC_VER >= 1300 + #define MY_CPU_pragma_pack_push_1 __pragma(pack(push, 1)) + #define MY_CPU_pragma_pop __pragma(pack(pop)) + #else + #define MY_CPU_pragma_pack_push_1 + #define MY_CPU_pragma_pop + #endif +#else + #ifdef __xlC__ + #define MY_CPU_pragma_pack_push_1 _Pragma("pack(1)") + #define MY_CPU_pragma_pop _Pragma("pack()") + #else + #define MY_CPU_pragma_pack_push_1 _Pragma("pack(push, 1)") + #define MY_CPU_pragma_pop _Pragma("pack(pop)") + #endif +#endif + #ifndef MY_CPU_NAME + // #define MY_CPU_IS_UNKNOWN #ifdef MY_CPU_LE #define MY_CPU_NAME "LE" #elif defined(MY_CPU_BE) @@ -187,11 +346,121 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem +#ifdef __has_builtin + #define Z7_has_builtin(x) __has_builtin(x) +#else + #define Z7_has_builtin(x) 0 +#endif + + +#define Z7_BSWAP32_CONST(v) \ + ( (((UInt32)(v) << 24) ) \ + | (((UInt32)(v) << 8) & (UInt32)0xff0000) \ + | (((UInt32)(v) >> 8) & (UInt32)0xff00 ) \ + | (((UInt32)(v) >> 24) )) + + +#if defined(_MSC_VER) && (_MSC_VER >= 1300) + +#include + +/* Note: these macros will use bswap instruction (486), that is unsupported in 386 cpu */ + +#pragma intrinsic(_byteswap_ushort) +#pragma intrinsic(_byteswap_ulong) +#pragma intrinsic(_byteswap_uint64) + +#define Z7_BSWAP16(v) _byteswap_ushort(v) +#define Z7_BSWAP32(v) _byteswap_ulong (v) +#define Z7_BSWAP64(v) _byteswap_uint64(v) +#define Z7_CPU_FAST_BSWAP_SUPPORTED + +/* GCC can generate slow code that calls function for __builtin_bswap32() for: + - GCC for RISCV, if Zbb/XTHeadBb extension is not used. + - GCC for SPARC. + The code from CLANG for SPARC also is not fastest. + So we don't define Z7_CPU_FAST_BSWAP_SUPPORTED in some cases. +*/ +#elif (!defined(MY_CPU_RISCV) || defined (__riscv_zbb) || defined(__riscv_xtheadbb)) \ + && !defined(MY_CPU_SPARC) \ + && ( \ + (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \ + || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) \ + ) + +#define Z7_BSWAP16(v) __builtin_bswap16(v) +#define Z7_BSWAP32(v) __builtin_bswap32(v) +#define Z7_BSWAP64(v) __builtin_bswap64(v) +#define Z7_CPU_FAST_BSWAP_SUPPORTED + +#else + +#define Z7_BSWAP16(v) ((UInt16) \ + ( ((UInt32)(v) << 8) \ + | ((UInt32)(v) >> 8) \ + )) + +#define Z7_BSWAP32(v) Z7_BSWAP32_CONST(v) + +#define Z7_BSWAP64(v) \ + ( ( ( (UInt64)(v) ) << 8 * 7 ) \ + | ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 1) ) << 8 * 5 ) \ + | ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 2) ) << 8 * 3 ) \ + | ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 3) ) << 8 * 1 ) \ + | ( ( (UInt64)(v) >> 8 * 1 ) & ((UInt32)0xff << 8 * 3) ) \ + | ( ( (UInt64)(v) >> 8 * 3 ) & ((UInt32)0xff << 8 * 2) ) \ + | ( ( (UInt64)(v) >> 8 * 5 ) & ((UInt32)0xff << 8 * 1) ) \ + | ( ( (UInt64)(v) >> 8 * 7 ) ) \ + ) + +#endif + + + #ifdef MY_CPU_LE #if defined(MY_CPU_X86_OR_AMD64) \ || defined(MY_CPU_ARM64) \ - || defined(__ARM_FEATURE_UNALIGNED) + || defined(MY_CPU_RISCV) && defined(__riscv_misaligned_fast) \ + || defined(MY_CPU_E2K) && defined(__iset__) && (__iset__ >= 6) #define MY_CPU_LE_UNALIGN + #define MY_CPU_LE_UNALIGN_64 + #elif defined(__ARM_FEATURE_UNALIGNED) +/* === ALIGNMENT on 32-bit arm and LDRD/STRD/LDM/STM instructions. 
+ Description of problems: +problem-1 : 32-bit ARM architecture: + multi-access (pair of 32-bit accesses) instructions (LDRD/STRD/LDM/STM) + require 32-bit (WORD) alignment (by 32-bit ARM architecture). + So there is "Alignment fault exception", if data is not aligned for 32-bit. + +problem-2 : 32-bit kernels and arm64 kernels: + 32-bit linux kernels provide fixup for these "paired" instruction "Alignment fault exception". + So unaligned paired-access instructions work via exception handler in kernel in 32-bit linux. + + But some arm64 kernels do not handle these faults in 32-bit programs. + So we have unhandled exception for such instructions. + Probably some new arm64 kernels have fixed it, and unaligned + paired-access instructions work in new kernels? + +problem-3 : compiler for 32-bit arm: + Compilers use LDRD/STRD/LDM/STM for UInt64 accesses + and for another cases where two 32-bit accesses are fused + to one multi-access instruction. + So UInt64 variables must be aligned for 32-bit, and each + 32-bit access must be aligned for 32-bit, if we want to + avoid "Alignment fault" exception (handled or unhandled). + +problem-4 : performace: + Even if unaligned access is handled by kernel, it will be slow. + So if we allow unaligned access, we can get fast unaligned + single-access, and slow unaligned paired-access. + + We don't allow unaligned access on 32-bit arm, because compiler + genarates paired-access instructions that require 32-bit alignment, + and some arm64 kernels have no handler for these instructions. + Also unaligned paired-access instructions will be slow, if kernel handles them. +*/ + // it must be disabled: + // #define MY_CPU_LE_UNALIGN #endif #endif @@ -200,11 +469,13 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #define GetUi16(p) (*(const UInt16 *)(const void *)(p)) #define GetUi32(p) (*(const UInt32 *)(const void *)(p)) +#ifdef MY_CPU_LE_UNALIGN_64 #define GetUi64(p) (*(const UInt64 *)(const void *)(p)) +#define SetUi64(p, v) { *(UInt64 *)(void *)(p) = (v); } +#endif -#define SetUi16(p, v) { *(UInt16 *)(p) = (v); } -#define SetUi32(p, v) { *(UInt32 *)(p) = (v); } -#define SetUi64(p, v) { *(UInt64 *)(p) = (v); } +#define SetUi16(p, v) { *(UInt16 *)(void *)(p) = (v); } +#define SetUi32(p, v) { *(UInt32 *)(void *)(p) = (v); } #else @@ -218,8 +489,6 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem ((UInt32)((const Byte *)(p))[2] << 16) | \ ((UInt32)((const Byte *)(p))[3] << 24)) -#define GetUi64(p) (GetUi32(p) | ((UInt64)GetUi32(((const Byte *)(p)) + 4) << 32)) - #define SetUi16(p, v) { Byte *_ppp_ = (Byte *)(p); UInt32 _vvv_ = (v); \ _ppp_[0] = (Byte)_vvv_; \ _ppp_[1] = (Byte)(_vvv_ >> 8); } @@ -230,43 +499,36 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem _ppp_[2] = (Byte)(_vvv_ >> 16); \ _ppp_[3] = (Byte)(_vvv_ >> 24); } -#define SetUi64(p, v) { Byte *_ppp2_ = (Byte *)(p); UInt64 _vvv2_ = (v); \ - SetUi32(_ppp2_ , (UInt32)_vvv2_); \ - SetUi32(_ppp2_ + 4, (UInt32)(_vvv2_ >> 32)); } - #endif -#ifdef __has_builtin - #define MY__has_builtin(x) __has_builtin(x) -#else - #define MY__has_builtin(x) 0 -#endif - -#if defined(MY_CPU_LE_UNALIGN) && /* defined(_WIN64) && */ (_MSC_VER >= 1300) -/* Note: we use bswap instruction, that is unsupported in 386 cpu */ - -#include +#ifndef GetUi64 +#define GetUi64(p) (GetUi32(p) | ((UInt64)GetUi32(((const Byte *)(p)) + 4) << 32)) +#endif -#pragma intrinsic(_byteswap_ushort) -#pragma intrinsic(_byteswap_ulong) -#pragma 
intrinsic(_byteswap_uint64) +#ifndef SetUi64 +#define SetUi64(p, v) { Byte *_ppp2_ = (Byte *)(p); UInt64 _vvv2_ = (v); \ + SetUi32(_ppp2_ , (UInt32)_vvv2_) \ + SetUi32(_ppp2_ + 4, (UInt32)(_vvv2_ >> 32)) } +#endif -/* #define GetBe16(p) _byteswap_ushort(*(const UInt16 *)(const Byte *)(p)) */ -#define GetBe32(p) _byteswap_ulong(*(const UInt32 *)(const Byte *)(p)) -#define GetBe64(p) _byteswap_uint64(*(const UInt64 *)(const Byte *)(p)) -#define SetBe32(p, v) (*(UInt32 *)(void *)(p)) = _byteswap_ulong(v) +#if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) -#elif defined(MY_CPU_LE_UNALIGN) && ( \ - (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \ - || (defined(__clang__) && MY__has_builtin(__builtin_bswap16)) ) +#if 0 +// Z7_BSWAP16 can be slow for x86-msvc +#define GetBe16_to32(p) (Z7_BSWAP16 (*(const UInt16 *)(const void *)(p))) +#else +#define GetBe16_to32(p) (Z7_BSWAP32 (*(const UInt16 *)(const void *)(p)) >> 16) +#endif -/* #define GetBe16(p) __builtin_bswap16(*(const UInt16 *)(const Byte *)(p)) */ -#define GetBe32(p) __builtin_bswap32(*(const UInt32 *)(const Byte *)(p)) -#define GetBe64(p) __builtin_bswap64(*(const UInt64 *)(const Byte *)(p)) +#define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p)) +#define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); } -#define SetBe32(p, v) (*(UInt32 *)(void *)(p)) = __builtin_bswap32(v) +#if defined(MY_CPU_LE_UNALIGN_64) +#define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p)) +#define SetBe64(p, v) { (*(UInt64 *)(void *)(p)) = Z7_BSWAP64(v); } +#endif #else @@ -276,8 +538,6 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem ((UInt32)((const Byte *)(p))[2] << 8) | \ ((const Byte *)(p))[3] ) -#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4)) - #define SetBe32(p, v) { Byte *_ppp_ = (Byte *)(p); UInt32 _vvv_ = (v); \ _ppp_[0] = (Byte)(_vvv_ >> 24); \ _ppp_[1] = (Byte)(_vvv_ >> 16); \ @@ -286,49 +546,139 @@ MY_CPU_LE_UNALIGN means that CPU is LITTLE ENDIAN and CPU supports unaligned mem #endif +#ifndef GetBe64 +#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4)) +#endif -#ifndef GetBe16 +#ifndef SetBe64 +#define SetBe64(p, v) { Byte *_ppp_ = (Byte *)(p); UInt64 _vvv_ = (v); \ + _ppp_[0] = (Byte)(_vvv_ >> 56); \ + _ppp_[1] = (Byte)(_vvv_ >> 48); \ + _ppp_[2] = (Byte)(_vvv_ >> 40); \ + _ppp_[3] = (Byte)(_vvv_ >> 32); \ + _ppp_[4] = (Byte)(_vvv_ >> 24); \ + _ppp_[5] = (Byte)(_vvv_ >> 16); \ + _ppp_[6] = (Byte)(_vvv_ >> 8); \ + _ppp_[7] = (Byte)_vvv_; } +#endif +#ifndef GetBe16 +#ifdef GetBe16_to32 +#define GetBe16(p) ( (UInt16) GetBe16_to32(p)) +#else #define GetBe16(p) ( (UInt16) ( \ ((UInt16)((const Byte *)(p))[0] << 8) | \ ((const Byte *)(p))[1] )) +#endif +#endif + + +#if defined(MY_CPU_BE) +#define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v) +#define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) +#define Z7_CONV_NATIVE_TO_BE_32(v) (v) +// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b1) | ((b0) << 8)) +#elif defined(MY_CPU_LE) +#define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) +#define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v) +#define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v) +// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b0) | ((b1) << 8)) +#else +#error Stop_Compiling_Unknown_Endian_CONV +#endif + + +#if defined(MY_CPU_BE) + +#define GetBe64a(p) (*(const UInt64 *)(const void *)(p)) +#define GetBe32a(p) (*(const UInt32 *)(const void *)(p)) +#define GetBe16a(p) 
(*(const UInt16 *)(const void *)(p)) +#define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); } +#define SetBe16a(p, v) { *(UInt16 *)(void *)(p) = (v); } + +#define GetUi64a(p) GetUi64(p) +#define GetUi32a(p) GetUi32(p) +#define GetUi16a(p) GetUi16(p) +#define SetUi32a(p, v) SetUi32(p, v) +#define SetUi16a(p, v) SetUi16(p, v) + +#elif defined(MY_CPU_LE) + +#define GetUi64a(p) (*(const UInt64 *)(const void *)(p)) +#define GetUi32a(p) (*(const UInt32 *)(const void *)(p)) +#define GetUi16a(p) (*(const UInt16 *)(const void *)(p)) +#define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); } +#define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); } +#define GetBe64a(p) GetBe64(p) +#define GetBe32a(p) GetBe32(p) +#define GetBe16a(p) GetBe16(p) +#define SetBe32a(p, v) SetBe32(p, v) +#define SetBe16a(p, v) SetBe16(p, v) + +#else +#error Stop_Compiling_Unknown_Endian_CPU_a #endif +#ifndef GetBe16_to32 +#define GetBe16_to32(p) GetBe16(p) +#endif + + +#if defined(MY_CPU_X86_OR_AMD64) \ + || defined(MY_CPU_ARM_OR_ARM64) \ + || defined(MY_CPU_PPC_OR_PPC64) + #define Z7_CPU_FAST_ROTATE_SUPPORTED +#endif + #ifdef MY_CPU_X86_OR_AMD64 -typedef struct -{ - UInt32 maxFunc; - UInt32 vendor[3]; - UInt32 ver; - UInt32 b; - UInt32 c; - UInt32 d; -} Cx86cpuid; - -enum -{ - CPU_FIRM_INTEL, - CPU_FIRM_AMD, - CPU_FIRM_VIA -}; - -void MyCPUID(UInt32 function, UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d); - -BoolInt x86cpuid_CheckAndRead(Cx86cpuid *p); -int x86cpuid_GetFirm(const Cx86cpuid *p); - -#define x86cpuid_GetFamily(ver) (((ver >> 16) & 0xFF0) | ((ver >> 8) & 0xF)) -#define x86cpuid_GetModel(ver) (((ver >> 12) & 0xF0) | ((ver >> 4) & 0xF)) -#define x86cpuid_GetStepping(ver) (ver & 0xF) - -BoolInt CPU_Is_InOrder(void); -BoolInt CPU_Is_Aes_Supported(void); +void Z7_FASTCALL z7_x86_cpuid(UInt32 a[4], UInt32 function); +UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void); +#if defined(MY_CPU_AMD64) +#define Z7_IF_X86_CPUID_SUPPORTED +#else +#define Z7_IF_X86_CPUID_SUPPORTED if (z7_x86_cpuid_GetMaxFunc()) +#endif + +BoolInt CPU_IsSupported_AES(void); +BoolInt CPU_IsSupported_AVX(void); +BoolInt CPU_IsSupported_AVX2(void); +BoolInt CPU_IsSupported_AVX512F_AVX512VL(void); +BoolInt CPU_IsSupported_VAES_AVX2(void); +BoolInt CPU_IsSupported_CMOV(void); +BoolInt CPU_IsSupported_SSE(void); +BoolInt CPU_IsSupported_SSE2(void); +BoolInt CPU_IsSupported_SSSE3(void); +BoolInt CPU_IsSupported_SSE41(void); +BoolInt CPU_IsSupported_SHA(void); +BoolInt CPU_IsSupported_SHA512(void); BoolInt CPU_IsSupported_PageGB(void); +#elif defined(MY_CPU_ARM_OR_ARM64) + +BoolInt CPU_IsSupported_CRC32(void); +BoolInt CPU_IsSupported_NEON(void); + +#if defined(_WIN32) +BoolInt CPU_IsSupported_CRYPTO(void); +#define CPU_IsSupported_SHA1 CPU_IsSupported_CRYPTO +#define CPU_IsSupported_SHA2 CPU_IsSupported_CRYPTO +#define CPU_IsSupported_AES CPU_IsSupported_CRYPTO +#else +BoolInt CPU_IsSupported_SHA1(void); +BoolInt CPU_IsSupported_SHA2(void); +BoolInt CPU_IsSupported_AES(void); +#endif +BoolInt CPU_IsSupported_SHA512(void); + +#endif + +#if defined(__APPLE__) +int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize); +int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val); #endif EXTERN_C_END diff --git a/src/sdk/C/Delta.c b/src/sdk/C/Delta.c index e3edd21..c4a4499 100644 --- a/src/sdk/C/Delta.c +++ b/src/sdk/C/Delta.c @@ -1,5 +1,5 @@ /* Delta.c -- Delta converter -2009-05-26 : Igor Pavlov : Public domain */ +2021-02-09 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -12,53 +12,158 @@ void Delta_Init(Byte *state) state[i] = 0; 
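/* Editorial sketch (not part of the patch): typical in-place round-trip use of the
   Delta_Encode / Delta_Decode routines below; "dist" is the byte distance of the
   delta filter (e.g. 2 for 16-bit samples) and buf/bufSize are caller-supplied:

     Byte state[DELTA_STATE_SIZE];
     Delta_Init(state);
     Delta_Encode(state, dist, buf, bufSize);   // buf now holds byte deltas
     Delta_Init(state);
     Delta_Decode(state, dist, buf, bufSize);   // restores the original bytes
*/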
} -static void MyMemCpy(Byte *dest, const Byte *src, unsigned size) -{ - unsigned i; - for (i = 0; i < size; i++) - dest[i] = src[i]; -} void Delta_Encode(Byte *state, unsigned delta, Byte *data, SizeT size) { - Byte buf[DELTA_STATE_SIZE]; - unsigned j = 0; - MyMemCpy(buf, state, delta); + Byte temp[DELTA_STATE_SIZE]; + + if (size == 0) + return; + + { + unsigned i = 0; + do + temp[i] = state[i]; + while (++i != delta); + } + + if (size <= delta) + { + unsigned i = 0, k; + do + { + Byte b = *data; + *data++ = (Byte)(b - temp[i]); + temp[i] = b; + } + while (++i != size); + + k = 0; + + do + { + if (i == delta) + i = 0; + state[k] = temp[i++]; + } + while (++k != delta); + + return; + } + { - SizeT i; - for (i = 0; i < size;) + Byte *p = data + size - delta; + { + unsigned i = 0; + do + state[i] = *p++; + while (++i != delta); + } { - for (j = 0; j < delta && i < size; i++, j++) + const Byte *lim = data + delta; + ptrdiff_t dif = -(ptrdiff_t)delta; + + if (((ptrdiff_t)size + dif) & 1) { - Byte b = data[i]; - data[i] = (Byte)(b - buf[j]); - buf[j] = b; + --p; *p = (Byte)(*p - p[dif]); } + + while (p != lim) + { + --p; *p = (Byte)(*p - p[dif]); + --p; *p = (Byte)(*p - p[dif]); + } + + dif = -dif; + + do + { + --p; *p = (Byte)(*p - temp[--dif]); + } + while (dif != 0); } } - if (j == delta) - j = 0; - MyMemCpy(state, buf + j, delta - j); - MyMemCpy(state + delta - j, buf, j); } + void Delta_Decode(Byte *state, unsigned delta, Byte *data, SizeT size) { - Byte buf[DELTA_STATE_SIZE]; - unsigned j = 0; - MyMemCpy(buf, state, delta); + unsigned i; + const Byte *lim; + + if (size == 0) + return; + + i = 0; + lim = data + size; + + if (size <= delta) + { + do + *data = (Byte)(*data + state[i++]); + while (++data != lim); + + for (; delta != i; state++, delta--) + *state = state[i]; + data -= i; + } + else { - SizeT i; - for (i = 0; i < size;) + /* + #define B(n) b ## n + #define I(n) Byte B(n) = state[n]; + #define U(n) { B(n) = (Byte)((B(n)) + *data++); data[-1] = (B(n)); } + #define F(n) if (data != lim) { U(n) } + + if (delta == 1) + { + I(0) + if ((lim - data) & 1) { U(0) } + while (data != lim) { U(0) U(0) } + data -= 1; + } + else if (delta == 2) { - for (j = 0; j < delta && i < size; i++, j++) + I(0) I(1) + lim -= 1; while (data < lim) { U(0) U(1) } + lim += 1; F(0) + data -= 2; + } + else if (delta == 3) + { + I(0) I(1) I(2) + lim -= 2; while (data < lim) { U(0) U(1) U(2) } + lim += 2; F(0) F(1) + data -= 3; + } + else if (delta == 4) + { + I(0) I(1) I(2) I(3) + lim -= 3; while (data < lim) { U(0) U(1) U(2) U(3) } + lim += 3; F(0) F(1) F(2) + data -= 4; + } + else + */ + { + do + { + *data = (Byte)(*data + state[i++]); + data++; + } + while (i != delta); + { - buf[j] = data[i] = (Byte)(buf[j] + data[i]); + ptrdiff_t dif = -(ptrdiff_t)delta; + do + *data = (Byte)(*data + data[dif]); + while (++data != lim); + data += dif; } } } - if (j == delta) - j = 0; - MyMemCpy(state, buf + j, delta - j); - MyMemCpy(state + delta - j, buf, j); + + do + *state++ = *data; + while (++data != lim); } diff --git a/src/sdk/C/Delta.h b/src/sdk/C/Delta.h index 2fa54ad..7060954 100644 --- a/src/sdk/C/Delta.h +++ b/src/sdk/C/Delta.h @@ -1,8 +1,8 @@ /* Delta.h -- Delta converter -2013-01-18 : Igor Pavlov : Public domain */ +2023-03-03 : Igor Pavlov : Public domain */ -#ifndef __DELTA_H -#define __DELTA_H +#ifndef ZIP7_INC_DELTA_H +#define ZIP7_INC_DELTA_H #include "7zTypes.h" diff --git a/src/sdk/C/DllSecur.c b/src/sdk/C/DllSecur.c index 5ea108a..bbbfc0a 100644 --- a/src/sdk/C/DllSecur.c +++ b/src/sdk/C/DllSecur.c @@ 
-1,108 +1,99 @@ /* DllSecur.c -- DLL loading security -2018-02-21 : Igor Pavlov : Public domain */ +2023-12-03 : Igor Pavlov : Public domain */ #include "Precomp.h" #ifdef _WIN32 -#include +#include "7zWindows.h" #include "DllSecur.h" #ifndef UNDER_CE +Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION + typedef BOOL (WINAPI *Func_SetDefaultDllDirectories)(DWORD DirectoryFlags); #define MY_LOAD_LIBRARY_SEARCH_USER_DIRS 0x400 #define MY_LOAD_LIBRARY_SEARCH_SYSTEM32 0x800 +#define DELIM "\0" + static const char * const g_Dlls = + "userenv" + DELIM "setupapi" + DELIM "apphelp" + DELIM "propsys" + DELIM "dwmapi" + DELIM "cryptbase" + DELIM "oleacc" + DELIM "clbcatq" + DELIM "version" #ifndef _CONSOLE - "UXTHEME\0" + DELIM "uxtheme" #endif - "USERENV\0" - "SETUPAPI\0" - "APPHELP\0" - "PROPSYS\0" - "DWMAPI\0" - "CRYPTBASE\0" - "OLEACC\0" - "CLBCATQ\0" - "VERSION\0" - ; + DELIM; + +#endif +#ifdef __clang__ + #pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif +#if defined (_MSC_VER) && _MSC_VER >= 1900 +// sysinfoapi.h: kit10: GetVersion was declared deprecated +#pragma warning(disable : 4996) #endif -void My_SetDefaultDllDirectories() +#define IF_NON_VISTA_SET_DLL_DIRS_AND_RETURN \ + if ((UInt16)GetVersion() != 6) { \ + const \ + Func_SetDefaultDllDirectories setDllDirs = \ + (Func_SetDefaultDllDirectories) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), \ + "SetDefaultDllDirectories"); \ + if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; } + +void My_SetDefaultDllDirectories(void) { #ifndef UNDER_CE - - OSVERSIONINFO vi; - vi.dwOSVersionInfoSize = sizeof(vi); - GetVersionEx(&vi); - if (!GetVersionEx(&vi) || vi.dwMajorVersion != 6 || vi.dwMinorVersion != 0) - { - Func_SetDefaultDllDirectories setDllDirs = (Func_SetDefaultDllDirectories) - GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories"); - if (setDllDirs) - if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) - return; - } - + IF_NON_VISTA_SET_DLL_DIRS_AND_RETURN #endif } -void LoadSecurityDlls() +void LoadSecurityDlls(void) { #ifndef UNDER_CE - - wchar_t buf[MAX_PATH + 100]; - - { - // at Vista (ver 6.0) : CoCreateInstance(CLSID_ShellLink, ...) doesn't work after SetDefaultDllDirectories() : Check it ??? - OSVERSIONINFO vi; - vi.dwOSVersionInfoSize = sizeof(vi); - if (!GetVersionEx(&vi) || vi.dwMajorVersion != 6 || vi.dwMinorVersion != 0) - { - Func_SetDefaultDllDirectories setDllDirs = (Func_SetDefaultDllDirectories) - GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories"); - if (setDllDirs) - if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) - return; - } - } - - { - unsigned len = GetSystemDirectoryW(buf, MAX_PATH + 2); - if (len == 0 || len > MAX_PATH) - return; - } + // at Vista (ver 6.0) : CoCreateInstance(CLSID_ShellLink, ...) doesn't work after SetDefaultDllDirectories() : Check it ??? 
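/* Editorial sketch (not part of the patch): these helpers are meant to be called
   once, early in process startup, before anything can trigger a delayed DLL load.
   A hypothetical caller, using only the functions declared in DllSecur.h:

     #include "DllSecur.h"

     int wmain(void)
     {
     #ifdef _WIN32
       LoadSecurityDlls();   // pre-load known DLLs from system32 and restrict the search path
     #endif
       // ... application code ...
       return 0;
     }

   Where kernel32 exports SetDefaultDllDirectories(), the IF_NON_VISTA_SET_DLL_DIRS_AND_RETURN
   macro above requests that restriction directly with
   LOAD_LIBRARY_SEARCH_SYSTEM32 | LOAD_LIBRARY_SEARCH_USER_DIRS (0x800 | 0x400);
   the manual pre-loading loop below is the fallback. */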
+ IF_NON_VISTA_SET_DLL_DIRS_AND_RETURN { + wchar_t buf[MAX_PATH + 100]; const char *dll; - unsigned pos = (unsigned)lstrlenW(buf); - + unsigned pos = GetSystemDirectoryW(buf, MAX_PATH + 2); + if (pos == 0 || pos > MAX_PATH) + return; if (buf[pos - 1] != '\\') buf[pos++] = '\\'; - - for (dll = g_Dlls; dll[0] != 0;) + for (dll = g_Dlls; *dll != 0;) { - unsigned k = 0; + wchar_t *dest = &buf[pos]; for (;;) { - char c = *dll++; - buf[pos + k] = (Byte)c; - k++; + const char c = *dll++; if (c == 0) break; + *dest++ = (Byte)c; } - - lstrcatW(buf, L".dll"); + dest[0] = '.'; + dest[1] = 'd'; + dest[2] = 'l'; + dest[3] = 'l'; + dest[4] = 0; + // lstrcatW(buf, L".dll"); LoadLibraryExW(buf, NULL, LOAD_WITH_ALTERED_SEARCH_PATH); } } - #endif } -#endif +#endif // _WIN32 diff --git a/src/sdk/C/DllSecur.h b/src/sdk/C/DllSecur.h index e2a049a..9fa4153 100644 --- a/src/sdk/C/DllSecur.h +++ b/src/sdk/C/DllSecur.h @@ -1,8 +1,8 @@ /* DllSecur.h -- DLL loading for security -2018-02-19 : Igor Pavlov : Public domain */ +2023-03-03 : Igor Pavlov : Public domain */ -#ifndef __DLL_SECUR_H -#define __DLL_SECUR_H +#ifndef ZIP7_INC_DLL_SECUR_H +#define ZIP7_INC_DLL_SECUR_H #include "7zTypes.h" @@ -10,8 +10,8 @@ EXTERN_C_BEGIN #ifdef _WIN32 -void My_SetDefaultDllDirectories(); -void LoadSecurityDlls(); +void My_SetDefaultDllDirectories(void); +void LoadSecurityDlls(void); #endif diff --git a/src/sdk/C/LzFind.c b/src/sdk/C/LzFind.c index df55e86..330bc17 100644 --- a/src/sdk/C/LzFind.c +++ b/src/sdk/C/LzFind.c @@ -1,74 +1,140 @@ /* LzFind.c -- Match finder for LZ algorithms -2018-07-08 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include +// #include +#include "CpuArch.h" #include "LzFind.h" #include "LzHash.h" +#define kBlockMoveAlign (1 << 7) // alignment for memmove() +#define kBlockSizeAlign (1 << 16) // alignment for block allocation +#define kBlockSizeReserveMin (1 << 24) // it's 1/256 from 4 GB dictinary + #define kEmptyHashValue 0 -#define kMaxValForNormalize ((UInt32)0xFFFFFFFF) -#define kNormalizeStepMin (1 << 10) /* it must be power of 2 */ -#define kNormalizeMask (~(UInt32)(kNormalizeStepMin - 1)) -#define kMaxHistorySize ((UInt32)7 << 29) -#define kStartMaxLen 3 +#define kMaxValForNormalize ((UInt32)0) +// #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xfff) // for debug + +// #define kNormalizeAlign (1 << 7) // alignment for speculated accesses + +#define GET_AVAIL_BYTES(p) \ + Inline_MatchFinder_GetNumAvailableBytes(p) + + +// #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size) +#define kFix5HashSize kFix4HashSize + +/* + HASH2_CALC: + if (hv) match, then cur[0] and cur[1] also match +*/ +#define HASH2_CALC hv = GetUi16(cur); + +// (crc[0 ... 255] & 0xFF) provides one-to-one correspondence to [0 ... 
255] + +/* + HASH3_CALC: + if (cur[0]) and (h2) match, then cur[1] also match + if (cur[0]) and (hv) match, then cur[1] and cur[2] also match +*/ +#define HASH3_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; } + +#define HASH4_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + temp ^= ((UInt32)cur[2] << 8); \ + h3 = temp & (kHash3Size - 1); \ + hv = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hashMask; } + +#define HASH5_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + temp ^= ((UInt32)cur[2] << 8); \ + h3 = temp & (kHash3Size - 1); \ + temp ^= (p->crc[cur[3]] << kLzHash_CrcShift_1); \ + /* h4 = temp & p->hash4Mask; */ /* (kHash4Size - 1); */ \ + hv = (temp ^ (p->crc[cur[4]] << kLzHash_CrcShift_2)) & p->hashMask; } + +#define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF; + static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc) { - if (!p->directInput) + // if (!p->directInput) { - ISzAlloc_Free(alloc, p->bufferBase); - p->bufferBase = NULL; + ISzAlloc_Free(alloc, p->bufBase); + p->bufBase = NULL; } } -/* keepSizeBefore + keepSizeAfter + keepSizeReserv must be < 4G) */ -static int LzInWindow_Create(CMatchFinder *p, UInt32 keepSizeReserv, ISzAllocPtr alloc) +static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr alloc) { - UInt32 blockSize = p->keepSizeBefore + p->keepSizeAfter + keepSizeReserv; - if (p->directInput) - { - p->blockSize = blockSize; - return 1; - } - if (!p->bufferBase || p->blockSize != blockSize) + if (blockSize == 0) + return 0; + if (!p->bufBase || p->blockSize != blockSize) { + // size_t blockSizeT; LzInWindow_Free(p, alloc); p->blockSize = blockSize; - p->bufferBase = (Byte *)ISzAlloc_Alloc(alloc, (size_t)blockSize); + // blockSizeT = blockSize; + + // printf("\nblockSize = 0x%x\n", blockSize); + /* + #if defined _WIN64 + // we can allocate 4GiB, but still use UInt32 for (p->blockSize) + // we use UInt32 type for (p->blockSize), because + // we don't want to wrap over 4 GiB, + // when we use (p->streamPos - p->pos) that is UInt32. + if (blockSize >= (UInt32)0 - (UInt32)kBlockSizeAlign) + { + blockSizeT = ((size_t)1 << 32); + printf("\nchanged to blockSizeT = 4GiB\n"); + } + #endif + */ + + p->bufBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize); + // printf("\nbufferBase = %p\n", p->bufBase); + // return 0; // for debug } - return (p->bufferBase != NULL); + return (p->bufBase != NULL); } -Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } - -UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return p->streamPos - p->pos; } +static const Byte *MatchFinder_GetPointerToCurrentPos(void *p) +{ + return ((CMatchFinder *)p)->buffer; +} -void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue) +static UInt32 MatchFinder_GetNumAvailableBytes(void *p) { - p->posLimit -= subValue; - p->pos -= subValue; - p->streamPos -= subValue; + return GET_AVAIL_BYTES((CMatchFinder *)p); } + +Z7_NO_INLINE static void MatchFinder_ReadBlock(CMatchFinder *p) { if (p->streamEndWasReached || p->result != SZ_OK) return; - /* We use (p->streamPos - p->pos) value. (p->streamPos < p->pos) is allowed. */ + /* We use (p->streamPos - p->pos) value. + (p->streamPos < p->pos) is allowed. 
*/ if (p->directInput) { - UInt32 curSize = 0xFFFFFFFF - (p->streamPos - p->pos); + UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p); if (curSize > p->directInputRem) curSize = (UInt32)p->directInputRem; - p->directInputRem -= curSize; p->streamPos += curSize; + p->directInputRem -= curSize; if (p->directInputRem == 0) p->streamEndWasReached = 1; return; @@ -76,12 +142,31 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) for (;;) { - Byte *dest = p->buffer + (p->streamPos - p->pos); - size_t size = (p->bufferBase + p->blockSize - dest); + const Byte *dest = p->buffer + GET_AVAIL_BYTES(p); + size_t size = (size_t)(p->bufBase + p->blockSize - dest); if (size == 0) + { + /* we call ReadBlock() after NeedMove() and MoveBlock(). + NeedMove() and MoveBlock() provide more than (keepSizeAfter) + to the end of (blockSize). + So we don't execute this branch in normal code flow. + We can get here only if ReadBlock() is called before NeedMove() / MoveBlock(). + */ + // p->result = SZ_ERROR_FAIL; // we can show error here return; + } + + // #define kRead 3 + // if (size > kRead) size = kRead; // for debug - p->result = ISeqInStream_Read(p->stream, dest, &size); + /* + // we need cast (Byte *)dest. + #ifdef __clang__ + #pragma GCC diagnostic ignored "-Wcast-qual" + #endif + */ + p->result = ISeqInStream_Read(p->stream, + p->bufBase + (dest - p->bufBase), &size); if (p->result != SZ_OK) return; if (size == 0) @@ -90,47 +175,60 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) return; } p->streamPos += (UInt32)size; - if (p->streamPos - p->pos > p->keepSizeAfter) + if (GET_AVAIL_BYTES(p) > p->keepSizeAfter) return; + /* here and in the other (p->keepSizeAfter) checks we keep 1 byte more than was requested by the Create() function + (GET_AVAIL_BYTES(p) >= p->keepSizeAfter) - minimal required size */ } + + // on exit: (p->result != SZ_OK || p->streamEndWasReached || GET_AVAIL_BYTES(p) > p->keepSizeAfter) } + + +Z7_NO_INLINE void MatchFinder_MoveBlock(CMatchFinder *p) { - memmove(p->bufferBase, - p->buffer - p->keepSizeBefore, - (size_t)(p->streamPos - p->pos) + p->keepSizeBefore); - p->buffer = p->bufferBase + p->keepSizeBefore; + const size_t offset = (size_t)(p->buffer - p->bufBase) - p->keepSizeBefore; + const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore; + p->buffer = p->bufBase + keepBefore; + memmove(p->bufBase, + p->bufBase + (offset & ~((size_t)kBlockMoveAlign - 1)), + keepBefore + (size_t)GET_AVAIL_BYTES(p)); } +/* We call MoveBlock() before ReadBlock(). + So MoveBlock() can be a wasteful operation if the whole input data + can fit in the current block even without calling MoveBlock(). + In the important case where (dataSize <= historySize), + the condition (p->blockSize > dataSize + p->keepSizeAfter) is met, + so there is no MoveBlock() in that case.
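   Editorial illustration (not from the SDK): the calling pattern these two functions
   are designed for is the one used later in MatchFinder_CheckLimits():

     // refill the sliding window before consuming more input
     if (MatchFinder_NeedMove(p))      // not enough free room after p->buffer
       MatchFinder_MoveBlock(p);       // slide the kept history to the front of bufBase
     MatchFinder_ReadBlock(p);         // then append fresh input at the end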
+*/ + int MatchFinder_NeedMove(CMatchFinder *p) { if (p->directInput) return 0; - /* if (p->streamEndWasReached) return 0; */ - return ((size_t)(p->bufferBase + p->blockSize - p->buffer) <= p->keepSizeAfter); + if (p->streamEndWasReached || p->result != SZ_OK) + return 0; + return ((size_t)(p->bufBase + p->blockSize - p->buffer) <= p->keepSizeAfter); } void MatchFinder_ReadIfRequired(CMatchFinder *p) { - if (p->streamEndWasReached) - return; - if (p->keepSizeAfter >= p->streamPos - p->pos) + if (p->keepSizeAfter >= GET_AVAIL_BYTES(p)) MatchFinder_ReadBlock(p); } -static void MatchFinder_CheckAndMoveAndRead(CMatchFinder *p) -{ - if (MatchFinder_NeedMove(p)) - MatchFinder_MoveBlock(p); - MatchFinder_ReadBlock(p); -} + static void MatchFinder_SetDefaultSettings(CMatchFinder *p) { p->cutValue = 32; p->btMode = 1; p->numHashBytes = 4; + p->numHashBytes_Min = 2; + p->numHashOutBits = 0; p->bigHash = 0; } @@ -139,8 +237,10 @@ static void MatchFinder_SetDefaultSettings(CMatchFinder *p) void MatchFinder_Construct(CMatchFinder *p) { unsigned i; - p->bufferBase = NULL; + p->buffer = NULL; + p->bufBase = NULL; p->directInput = 0; + p->stream = NULL; p->hash = NULL; p->expectedDataSize = (UInt64)(Int64)-1; MatchFinder_SetDefaultSettings(p); @@ -155,6 +255,8 @@ void MatchFinder_Construct(CMatchFinder *p) } } +#undef kCrcPoly + static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->hash); @@ -169,87 +271,213 @@ void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc) static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc) { - size_t sizeInBytes = (size_t)num * sizeof(CLzRef); + const size_t sizeInBytes = (size_t)num * sizeof(CLzRef); if (sizeInBytes / sizeof(CLzRef) != num) return NULL; return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes); } -int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, - UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, - ISzAllocPtr alloc) +#if (kBlockSizeReserveMin < kBlockSizeAlign * 2) + #error Stop_Compiling_Bad_Reserve +#endif + + + +static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize) { - UInt32 sizeReserv; - + UInt32 blockSize = (p->keepSizeBefore + p->keepSizeAfter); + /* if (historySize > kMaxHistorySize) - { - MatchFinder_Free(p, alloc); return 0; - } + */ + // printf("\nhistorySize == 0x%x\n", historySize); - sizeReserv = historySize >> 1; - if (historySize >= ((UInt32)3 << 30)) sizeReserv = historySize >> 3; - else if (historySize >= ((UInt32)2 << 30)) sizeReserv = historySize >> 2; + if (p->keepSizeBefore < historySize || blockSize < p->keepSizeBefore) // if 32-bit overflow + return 0; - sizeReserv += (keepAddBufferBefore + matchMaxLen + keepAddBufferAfter) / 2 + (1 << 19); + { + const UInt32 kBlockSizeMax = (UInt32)0 - (UInt32)kBlockSizeAlign; + const UInt32 rem = kBlockSizeMax - blockSize; + const UInt32 reserve = (blockSize >> (blockSize < ((UInt32)1 << 30) ? 
1 : 2)) + + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign; // do not overflow 32-bit here + if (blockSize >= kBlockSizeMax + || rem < kBlockSizeReserveMin) // we reject settings that will be slow + return 0; + if (reserve >= rem) + blockSize = kBlockSizeMax; + else + { + blockSize += reserve; + blockSize &= ~(UInt32)(kBlockSizeAlign - 1); + } + } + // printf("\n LzFind_blockSize = %x\n", blockSize); + // printf("\n LzFind_blockSize = %d\n", blockSize >> 20); + return blockSize; +} + + +// input is historySize +static UInt32 MatchFinder_GetHashMask2(CMatchFinder *p, UInt32 hs) +{ + if (p->numHashBytes == 2) + return (1 << 16) - 1; + if (hs != 0) + hs--; + hs |= (hs >> 1); + hs |= (hs >> 2); + hs |= (hs >> 4); + hs |= (hs >> 8); + // we propagated 16 bits in (hs). Low 16 bits must be set later + if (hs >= (1 << 24)) + { + if (p->numHashBytes == 3) + hs = (1 << 24) - 1; + /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ + } + // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) + hs |= (1 << 16) - 1; /* don't change it! */ + // bt5: we adjust the size with recommended minimum size + if (p->numHashBytes >= 5) + hs |= (256 << kLzHash_CrcShift_2) - 1; + return hs; +} + +// input is historySize +static UInt32 MatchFinder_GetHashMask(CMatchFinder *p, UInt32 hs) +{ + if (p->numHashBytes == 2) + return (1 << 16) - 1; + if (hs != 0) + hs--; + hs |= (hs >> 1); + hs |= (hs >> 2); + hs |= (hs >> 4); + hs |= (hs >> 8); + // we propagated 16 bits in (hs). Low 16 bits must be set later + hs >>= 1; + if (hs >= (1 << 24)) + { + if (p->numHashBytes == 3) + hs = (1 << 24) - 1; + else + hs >>= 1; + /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ + } + // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) + hs |= (1 << 16) - 1; /* don't change it! */ + // bt5: we adjust the size with recommended minimum size + if (p->numHashBytes >= 5) + hs |= (256 << kLzHash_CrcShift_2) - 1; + return hs; +} + +int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, + UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, + ISzAllocPtr alloc) +{ + /* we need one additional byte in (p->keepSizeBefore), + since we use MoveBlock() after (p->pos++) and before dictionary using */ + // keepAddBufferBefore = (UInt32)0xFFFFFFFF - (1 << 22); // for debug p->keepSizeBefore = historySize + keepAddBufferBefore + 1; - p->keepSizeAfter = matchMaxLen + keepAddBufferAfter; - - /* we need one additional byte, since we use MoveBlock after pos++ and before dictionary using */ - - if (LzInWindow_Create(p, sizeReserv, alloc)) + + keepAddBufferAfter += matchMaxLen; + /* we need (p->keepSizeAfter >= p->numHashBytes) */ + if (keepAddBufferAfter < p->numHashBytes) + keepAddBufferAfter = p->numHashBytes; + // keepAddBufferAfter -= 2; // for debug + p->keepSizeAfter = keepAddBufferAfter; + + if (p->directInput) + p->blockSize = 0; + if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc)) { - UInt32 newCyclicBufferSize = historySize + 1; - UInt32 hs; - p->matchMaxLen = matchMaxLen; + size_t hashSizeSum; { - p->fixedHashSize = 0; - if (p->numHashBytes == 2) - hs = (1 << 16) - 1; + UInt32 hs; + UInt32 hsCur; + + if (p->numHashOutBits != 0) + { + unsigned numBits = p->numHashOutBits; + const unsigned nbMax = + (p->numHashBytes == 2 ? 16 : + (p->numHashBytes == 3 ? 
24 : 32)); + if (numBits >= nbMax) + numBits = nbMax; + if (numBits >= 32) + hs = (UInt32)0 - 1; + else + hs = ((UInt32)1 << numBits) - 1; + // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) + hs |= (1 << 16) - 1; /* don't change it! */ + if (p->numHashBytes >= 5) + hs |= (256 << kLzHash_CrcShift_2) - 1; + { + const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize); + if (hs >= hs2) + hs = hs2; + } + hsCur = hs; + if (p->expectedDataSize < historySize) + { + const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize); + if (hsCur >= hs2) + hsCur = hs2; + } + } else { - hs = historySize; - if (hs > p->expectedDataSize) - hs = (UInt32)p->expectedDataSize; - if (hs != 0) - hs--; - hs |= (hs >> 1); - hs |= (hs >> 2); - hs |= (hs >> 4); - hs |= (hs >> 8); - hs >>= 1; - hs |= 0xFFFF; /* don't change it! It's required for Deflate */ - if (hs > (1 << 24)) + hs = MatchFinder_GetHashMask(p, historySize); + hsCur = hs; + if (p->expectedDataSize < historySize) { - if (p->numHashBytes == 3) - hs = (1 << 24) - 1; - else - hs >>= 1; - /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ + hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize); + if (hsCur >= hs) // is it possible? + hsCur = hs; } } - p->hashMask = hs; - hs++; - if (p->numHashBytes > 2) p->fixedHashSize += kHash2Size; - if (p->numHashBytes > 3) p->fixedHashSize += kHash3Size; - if (p->numHashBytes > 4) p->fixedHashSize += kHash4Size; - hs += p->fixedHashSize; + + p->hashMask = hsCur; + + hashSizeSum = hs; + hashSizeSum++; + if (hashSizeSum < hs) + return 0; + { + UInt32 fixedHashSize = 0; + if (p->numHashBytes > 2 && p->numHashBytes_Min <= 2) fixedHashSize += kHash2Size; + if (p->numHashBytes > 3 && p->numHashBytes_Min <= 3) fixedHashSize += kHash3Size; + // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size; + hashSizeSum += fixedHashSize; + p->fixedHashSize = fixedHashSize; + } } + p->matchMaxLen = matchMaxLen; + { size_t newSize; size_t numSons; + const UInt32 newCyclicBufferSize = historySize + 1; // do not change it p->historySize = historySize; - p->hashSizeSum = hs; - p->cyclicBufferSize = newCyclicBufferSize; + p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1) numSons = newCyclicBufferSize; if (p->btMode) numSons <<= 1; - newSize = hs + numSons; + newSize = hashSizeSum + numSons; + + if (numSons < newCyclicBufferSize || newSize < numSons) + return 0; + + // aligned size is not required here, but it can be better for some loops + #define NUM_REFS_ALIGN_MASK 0xF + newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK; - if (p->hash && p->numRefs == newSize) + // 22.02: we don't reallocate buffer, if old size is enough + if (p->hash && p->numRefs >= newSize) return 1; MatchFinder_FreeThisClassMemory(p, alloc); @@ -258,7 +486,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, if (p->hash) { - p->son = p->hash + p->hashSizeSum; + p->son = p->hash + hashSizeSum; return 1; } } @@ -268,33 +496,43 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, return 0; } + static void MatchFinder_SetLimits(CMatchFinder *p) { - UInt32 limit = kMaxValForNormalize - p->pos; - UInt32 limit2 = p->cyclicBufferSize - p->cyclicBufferPos; - - if (limit2 < limit) - limit = limit2; - limit2 = p->streamPos - p->pos; + UInt32 k; + UInt32 n = kMaxValForNormalize - p->pos; + if (n == 0) + n = (UInt32)(Int32)-1; // we allow (pos == 0) at start even with (kMaxValForNormalize == 0) - if (limit2 <= p->keepSizeAfter) 
+ k = p->cyclicBufferSize - p->cyclicBufferPos; + if (k < n) + n = k; + + k = GET_AVAIL_BYTES(p); { - if (limit2 > 0) - limit2 = 1; + const UInt32 ksa = p->keepSizeAfter; + UInt32 mm = p->matchMaxLen; + if (k > ksa) + k -= ksa; // we must limit exactly to keepSizeAfter for ReadBlock + else if (k >= mm) + { + // the limitation for (p->lenLimit) update + k -= mm; // optimization : to reduce the number of checks + k++; + // k = 1; // non-optimized version : for debug + } + else + { + mm = k; + if (k != 0) + k = 1; + } + p->lenLimit = mm; } - else - limit2 -= p->keepSizeAfter; - - if (limit2 < limit) - limit = limit2; + if (k < n) + n = k; - { - UInt32 lenLimit = p->streamPos - p->pos; - if (lenLimit > p->matchMaxLen) - lenLimit = p->matchMaxLen; - p->lenLimit = lenLimit; - } - p->posLimit = p->pos + limit; + p->posLimit = p->pos + n; } @@ -302,7 +540,7 @@ void MatchFinder_Init_LowHash(CMatchFinder *p) { size_t i; CLzRef *items = p->hash; - size_t numItems = p->fixedHashSize; + const size_t numItems = p->fixedHashSize; for (i = 0; i < numItems; i++) items[i] = kEmptyHashValue; } @@ -312,72 +550,325 @@ void MatchFinder_Init_HighHash(CMatchFinder *p) { size_t i; CLzRef *items = p->hash + p->fixedHashSize; - size_t numItems = (size_t)p->hashMask + 1; + const size_t numItems = (size_t)p->hashMask + 1; for (i = 0; i < numItems; i++) items[i] = kEmptyHashValue; } -void MatchFinder_Init_3(CMatchFinder *p, int readData) +void MatchFinder_Init_4(CMatchFinder *p) { - p->cyclicBufferPos = 0; - p->buffer = p->bufferBase; - p->pos = - p->streamPos = p->cyclicBufferSize; + if (!p->directInput) + p->buffer = p->bufBase; + { + /* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker. + the code in CMatchFinderMt expects (pos = 1) */ + p->pos = + p->streamPos = + 1; // it's smallest optimal value. do not change it + // 0; // for debug + } p->result = SZ_OK; p->streamEndWasReached = 0; - - if (readData) - MatchFinder_ReadBlock(p); - - MatchFinder_SetLimits(p); } -void MatchFinder_Init(CMatchFinder *p) +// (CYC_TO_POS_OFFSET == 0) is expected by some optimized code +#define CYC_TO_POS_OFFSET 0 +// #define CYC_TO_POS_OFFSET 1 // for debug + +void MatchFinder_Init(void *_p) { + CMatchFinder *p = (CMatchFinder *)_p; MatchFinder_Init_HighHash(p); MatchFinder_Init_LowHash(p); - MatchFinder_Init_3(p, True); + MatchFinder_Init_4(p); + // if (readData) + MatchFinder_ReadBlock(p); + + /* if we init (cyclicBufferPos = pos), then we can use one variable + instead of both (cyclicBufferPos) and (pos) : only before (cyclicBufferPos) wrapping */ + p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); // init with relation to (pos) + // p->cyclicBufferPos = 0; // smallest value + // p->son[0] = p->son[1] = 0; // unused: we can init skipped record for speculated accesses. 
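/* Editorial sketch (not part of the patch): the usual lifecycle of a match finder,
   using only functions declared in this file; inStream, dictSize, keepBefore,
   matchMaxLen, keepAfter and alloc are caller-supplied placeholders:

     CMatchFinder mf;
     MatchFinder_Construct(&mf);
     mf.stream = inStream;                               // ISeqInStreamPtr
     if (MatchFinder_Create(&mf, dictSize, keepBefore, matchMaxLen, keepAfter, alloc))
     {
       MatchFinder_Init(&mf);
       // ... encoder loop: query matches / skip positions ...
     }
     MatchFinder_Free(&mf, alloc);
*/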
+ MatchFinder_SetLimits(p); } - -static UInt32 MatchFinder_GetSubValue(CMatchFinder *p) + + +#ifdef MY_CPU_X86_OR_AMD64 + #if defined(__clang__) && (__clang_major__ >= 4) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) + // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) + + #define USE_LZFIND_SATUR_SUB_128 + #define USE_LZFIND_SATUR_SUB_256 + #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("sse4.1"))) + #define LZFIND_ATTRIB_AVX2 __attribute__((__target__("avx2"))) + #elif defined(_MSC_VER) + #if (_MSC_VER >= 1600) + #define USE_LZFIND_SATUR_SUB_128 + #endif + #if (_MSC_VER >= 1900) + #define USE_LZFIND_SATUR_SUB_256 + #endif + #endif + +#elif defined(MY_CPU_ARM64) \ + /* || (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) */ + + #if defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) + #define USE_LZFIND_SATUR_SUB_128 + #ifdef MY_CPU_ARM64 + // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__(""))) + #else + #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=neon"))) + #endif + + #elif defined(_MSC_VER) + #if (_MSC_VER >= 1910) + #define USE_LZFIND_SATUR_SUB_128 + #endif + #endif + + #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) + #include + #else + #include + #endif + +#endif + + +#ifdef USE_LZFIND_SATUR_SUB_128 + +// #define Z7_SHOW_HW_STATUS + +#ifdef Z7_SHOW_HW_STATUS +#include +#define PRF(x) x +PRF(;) +#else +#define PRF(x) +#endif + + +#ifdef MY_CPU_ARM_OR_ARM64 + +#ifdef MY_CPU_ARM64 +// #define FORCE_LZFIND_SATUR_SUB_128 +#endif +typedef uint32x4_t LzFind_v128; +#define SASUB_128_V(v, s) \ + vsubq_u32(vmaxq_u32(v, s), s) + +#else // MY_CPU_ARM_OR_ARM64 + +#include // sse4.1 + +typedef __m128i LzFind_v128; +// SSE 4.1 +#define SASUB_128_V(v, s) \ + _mm_sub_epi32(_mm_max_epu32(v, s), s) + +#endif // MY_CPU_ARM_OR_ARM64 + + +#define SASUB_128(i) \ + *( LzFind_v128 *)( void *)(items + (i) * 4) = SASUB_128_V( \ + *(const LzFind_v128 *)(const void *)(items + (i) * 4), sub2); + + +Z7_NO_INLINE +static +#ifdef LZFIND_ATTRIB_SSE41 +LZFIND_ATTRIB_SSE41 +#endif +void +Z7_FASTCALL +LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim) { - return (p->pos - p->historySize - 1) & kNormalizeMask; + const LzFind_v128 sub2 = + #ifdef MY_CPU_ARM_OR_ARM64 + vdupq_n_u32(subValue); + #else + _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); + #endif + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SASUB_128(0) SASUB_128(1) items += 2 * 4; + SASUB_128(0) SASUB_128(1) items += 2 * 4; + } + while (items != lim); } -void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) + + +#ifdef USE_LZFIND_SATUR_SUB_256 + +#include // avx +/* +clang :immintrin.h uses +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX2__) +#include +#endif +so we need for clang-cl */ + +#if defined(__clang__) +#include +#include +#endif + +// AVX2: +#define SASUB_256(i) \ + *( __m256i *)( void *)(items + (i) * 8) = \ + _mm256_sub_epi32(_mm256_max_epu32( \ + *(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2); + +Z7_NO_INLINE +static +#ifdef LZFIND_ATTRIB_AVX2 +LZFIND_ATTRIB_AVX2 +#endif +void +Z7_FASTCALL +LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim) { - size_t i; - for (i = 0; i < numItems; i++) + const __m256i sub2 = _mm256_set_epi32( + (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue, + (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); + 
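  /* Editorial note (not part of the patch): each 32-bit lane of this loop performs the
     same saturating update as the scalar SASUB_32 macro defined below, roughly

       items[i] = (items[i] < subValue) ? 0 : (items[i] - subValue);

     i.e. max(v, subValue) - subValue, which clamps stale references to
     kEmptyHashValue (0) instead of letting them wrap during normalization. */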
Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do { - UInt32 value = items[i]; - if (value <= subValue) - value = kEmptyHashValue; - else - value -= subValue; - items[i] = value; + SASUB_256(0) SASUB_256(1) items += 2 * 8; + SASUB_256(0) SASUB_256(1) items += 2 * 8; } + while (items != lim); } +#endif // USE_LZFIND_SATUR_SUB_256 + +#ifndef FORCE_LZFIND_SATUR_SUB_128 +typedef void (Z7_FASTCALL *LZFIND_SATUR_SUB_CODE_FUNC)( + UInt32 subValue, CLzRef *items, const CLzRef *lim); +static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub; +#endif // FORCE_LZFIND_SATUR_SUB_128 + +#endif // USE_LZFIND_SATUR_SUB_128 + + +// kEmptyHashValue must be zero +// #define SASUB_32(i) { UInt32 v = items[i]; UInt32 m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; } +#define SASUB_32(i) { UInt32 v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; } + +#ifdef FORCE_LZFIND_SATUR_SUB_128 -static void MatchFinder_Normalize(CMatchFinder *p) +#define DEFAULT_SaturSub LzFind_SaturSub_128 + +#else + +#define DEFAULT_SaturSub LzFind_SaturSub_32 + +Z7_NO_INLINE +static +void +Z7_FASTCALL +LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim) { - UInt32 subValue = MatchFinder_GetSubValue(p); - MatchFinder_Normalize3(subValue, p->hash, p->numRefs); - MatchFinder_ReduceOffsets(p, subValue); + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SASUB_32(0) SASUB_32(1) items += 2; + SASUB_32(0) SASUB_32(1) items += 2; + SASUB_32(0) SASUB_32(1) items += 2; + SASUB_32(0) SASUB_32(1) items += 2; + } + while (items != lim); } +#endif + -MY_NO_INLINE +Z7_NO_INLINE +void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) +{ + #define LZFIND_NORM_ALIGN_BLOCK_SIZE (1 << 7) + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (LZFIND_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--) + { + SASUB_32(0) + items++; + } + { + const size_t k_Align_Mask = (LZFIND_NORM_ALIGN_BLOCK_SIZE / 4 - 1); + CLzRef *lim = items + (numItems & ~(size_t)k_Align_Mask); + numItems &= k_Align_Mask; + if (items != lim) + { + #if defined(USE_LZFIND_SATUR_SUB_128) && !defined(FORCE_LZFIND_SATUR_SUB_128) + if (g_LzFind_SaturSub) + g_LzFind_SaturSub(subValue, items, lim); + else + #endif + DEFAULT_SaturSub(subValue, items, lim); + } + items = lim; + } + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0; numItems--) + { + SASUB_32(0) + items++; + } +} + + + +// call MatchFinder_CheckLimits() only after (p->pos++) update + +Z7_NO_INLINE static void MatchFinder_CheckLimits(CMatchFinder *p) { + if (// !p->streamEndWasReached && p->result == SZ_OK && + p->keepSizeAfter == GET_AVAIL_BYTES(p)) + { + // we try to read only in exact state (p->keepSizeAfter == GET_AVAIL_BYTES(p)) + if (MatchFinder_NeedMove(p)) + MatchFinder_MoveBlock(p); + MatchFinder_ReadBlock(p); + } + if (p->pos == kMaxValForNormalize) - MatchFinder_Normalize(p); - if (!p->streamEndWasReached && p->keepSizeAfter == p->streamPos - p->pos) - MatchFinder_CheckAndMoveAndRead(p); + if (GET_AVAIL_BYTES(p) >= p->numHashBytes) // optional optimization for last bytes of data. + /* + if we disable normalization for last bytes of data, and + if (data_size == 4 GiB), we don't call wastfull normalization, + but (pos) will be wrapped over Zero (0) in that case. 
+ And we cannot resume later to normal operation + */ + { + // MatchFinder_Normalize(p); + /* after normalization we need (p->pos >= p->historySize + 1); */ + /* we can reduce subValue to aligned value, if want to keep alignment + of (p->pos) and (p->buffer) for speculated accesses. */ + const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */; + // const UInt32 subValue = (1 << 15); // for debug + // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue); + MatchFinder_REDUCE_OFFSETS(p, subValue) + MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashMask + 1 + p->fixedHashSize); + { + size_t numSonRefs = p->cyclicBufferSize; + if (p->btMode) + numSonRefs <<= 1; + MatchFinder_Normalize3(subValue, p->son, numSonRefs); + } + } + if (p->cyclicBufferPos == p->cyclicBufferSize) p->cyclicBufferPos = 0; + MatchFinder_SetLimits(p); } @@ -385,10 +876,10 @@ static void MatchFinder_CheckLimits(CMatchFinder *p) /* (lenLimit > maxLen) */ -MY_FORCE_INLINE -static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, - UInt32 *distances, unsigned maxLen) +Z7_FORCE_INLINE +static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, + UInt32 *d, unsigned maxLen) { /* son[_cyclicBufferPos] = curMatch; @@ -396,10 +887,10 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos { UInt32 delta = pos - curMatch; if (cutValue-- == 0 || delta >= _cyclicBufferSize) - return distances; + return d; { const Byte *pb = cur - delta; - curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; + curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)]; if (pb[maxLen] == cur[maxLen] && *pb == *cur) { UInt32 len = 0; @@ -409,10 +900,10 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos if (maxLen < len) { maxLen = len; - *distances++ = len; - *distances++ = delta - 1; + *d++ = len; + *d++ = delta - 1; if (len == lenLimit) - return distances; + return d; } } } @@ -421,35 +912,41 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos const Byte *lim = cur + lenLimit; son[_cyclicBufferPos] = curMatch; + do { - UInt32 delta = pos - curMatch; + UInt32 delta; + + if (curMatch == 0) + break; + // if (curMatch2 >= curMatch) return NULL; + delta = pos - curMatch; if (delta >= _cyclicBufferSize) break; { ptrdiff_t diff; - curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; - diff = (ptrdiff_t)0 - delta; - if (cur[maxLen] == cur[maxLen + diff]) + curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? 
_cyclicBufferSize : 0)]; + diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff]) { const Byte *c = cur; while (*c == c[diff]) { if (++c == lim) { - distances[0] = (UInt32)(lim - cur); - distances[1] = delta - 1; - return distances + 2; + d[0] = (UInt32)(lim - cur); + d[1] = delta - 1; + return d + 2; } } { - unsigned len = (unsigned)(c - cur); + const unsigned len = (unsigned)(c - cur); if (maxLen < len) { maxLen = len; - distances[0] = (UInt32)len; - distances[1] = delta - 1; - distances += 2; + d[0] = (UInt32)len; + d[1] = delta - 1; + d += 2; } } } @@ -457,31 +954,36 @@ static UInt32 * Hc_GetMatchesSpec(unsigned lenLimit, UInt32 curMatch, UInt32 pos } while (--cutValue); - return distances; + return d; } -MY_FORCE_INLINE +Z7_FORCE_INLINE UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, - UInt32 *distances, UInt32 maxLen) + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, + UInt32 *d, UInt32 maxLen) { CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); unsigned len0 = 0, len1 = 0; - for (;;) + + UInt32 cmCheck; + + // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } + + cmCheck = (UInt32)(pos - _cyclicBufferSize); + if ((UInt32)pos < _cyclicBufferSize) + cmCheck = 0; + + if (cmCheck < curMatch) + do { - UInt32 delta = pos - curMatch; - if (cutValue-- == 0 || delta >= _cyclicBufferSize) - { - *ptr0 = *ptr1 = kEmptyHashValue; - return distances; - } + const UInt32 delta = pos - curMatch; { - CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); + CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1); const Byte *pb = cur - delta; unsigned len = (len0 < len1 ? 
len0 : len1); - UInt32 pair0 = pair[0]; + const UInt32 pair0 = pair[0]; if (pb[len] == cur[len]) { if (++len != lenLimit && pb[len] == cur[len]) @@ -491,50 +993,62 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt if (maxLen < len) { maxLen = (UInt32)len; - *distances++ = (UInt32)len; - *distances++ = delta - 1; + *d++ = (UInt32)len; + *d++ = delta - 1; if (len == lenLimit) { *ptr1 = pair0; *ptr0 = pair[1]; - return distances; + return d; } } } if (pb[len] < cur[len]) { *ptr1 = curMatch; + // const UInt32 curMatch2 = pair[1]; + // if (curMatch2 >= curMatch) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } + // curMatch = curMatch2; + curMatch = pair[1]; ptr1 = pair + 1; - curMatch = *ptr1; len1 = len; } else { *ptr0 = curMatch; + curMatch = pair[0]; ptr0 = pair; - curMatch = *ptr0; len0 = len; } } } + while(--cutValue && cmCheck < curMatch); + + *ptr0 = *ptr1 = kEmptyHashValue; + return d; } + static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue) + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue) { CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); unsigned len0 = 0, len1 = 0; - for (;;) + + UInt32 cmCheck; + + cmCheck = (UInt32)(pos - _cyclicBufferSize); + if ((UInt32)pos < _cyclicBufferSize) + cmCheck = 0; + + if (// curMatch >= pos || // failure + cmCheck < curMatch) + do { - UInt32 delta = pos - curMatch; - if (cutValue-- == 0 || delta >= _cyclicBufferSize) + const UInt32 delta = pos - curMatch; { - *ptr0 = *ptr1 = kEmptyHashValue; - return; - } - { - CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); + CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1); const Byte *pb = cur - delta; unsigned len = (len0 < len1 ? len0 : len1); if (pb[len] == cur[len]) @@ -554,84 +1068,122 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const if (pb[len] < cur[len]) { *ptr1 = curMatch; + curMatch = pair[1]; ptr1 = pair + 1; - curMatch = *ptr1; len1 = len; } else { *ptr0 = curMatch; + curMatch = pair[0]; ptr0 = pair; - curMatch = *ptr0; len0 = len; } } } + while(--cutValue && cmCheck < curMatch); + + *ptr0 = *ptr1 = kEmptyHashValue; + return; } + #define MOVE_POS \ - ++p->cyclicBufferPos; \ + p->cyclicBufferPos++; \ p->buffer++; \ - if (++p->pos == p->posLimit) MatchFinder_CheckLimits(p); + { const UInt32 pos1 = p->pos + 1; \ + p->pos = pos1; \ + if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); } -#define MOVE_POS_RET MOVE_POS return (UInt32)offset; +#define MOVE_POS_RET MOVE_POS return distances; -static void MatchFinder_MovePos(CMatchFinder *p) { MOVE_POS; } +Z7_NO_INLINE +static void MatchFinder_MovePos(CMatchFinder *p) +{ + /* we go here at the end of stream data, when (avail < num_hash_bytes) + We don't update sons[cyclicBufferPos << btMode]. + So (sons) record will contain junk. And we cannot resume match searching + to normal operation, even if we will provide more input data in buffer. 
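// Hc_GetMatchesSpec, GetMatchesSpec1 and SkipMatchesSpec above all locate the
// previous node with the same wrap-around expression over the circular son[]
// array. The expression in isolation, with a toy ring size rather than a real
// dictionary; illustration only.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// Step delta slots back from cyclicBufferPos inside a ring of cyclicBufferSize
// entries. The intermediate underflow is harmless: the arithmetic is unsigned
// and the conditionally added cyclicBufferSize cancels it (callers guarantee
// delta < cyclicBufferSize).
static size_t cyclic_index(size_t cyclicBufferPos, uint32_t delta, uint32_t cyclicBufferSize)
{
    return cyclicBufferPos - delta + (cyclicBufferPos < delta ? cyclicBufferSize : 0);
}

int main(void)
{
    const uint32_t size = 8;                    // toy ring, not an SDK value
    printf("%zu\n", cyclic_index(5, 3, size));  // 5 - 3 = 2, no wrap needed
    printf("%zu\n", cyclic_index(2, 6, size));  // 2 - 6 + 8 = 4, wrapped
    return 0;
}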
+ p->sons[p->cyclicBufferPos << p->btMode] = 0; // kEmptyHashValue + if (p->btMode) + p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0; // kEmptyHashValue + */ + MOVE_POS +} #define GET_MATCHES_HEADER2(minLen, ret_op) \ - unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \ - lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ + UInt32 hv; const Byte *cur; UInt32 curMatch; \ + UInt32 lenLimit = p->lenLimit; \ + if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; } \ cur = p->buffer; -#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return 0) -#define SKIP_HEADER(minLen) GET_MATCHES_HEADER2(minLen, continue) +#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances) +#define SKIP_HEADER(minLen) \ + do { GET_MATCHES_HEADER2(minLen, continue) + +#define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, \ + p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue + +#define SKIP_FOOTER \ + SkipMatchesSpec(MF_PARAMS(p)); \ + MOVE_POS \ + } while (--num); + +#define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \ + distances = func(MF_PARAMS(p), distances, (UInt32)_maxLen_); \ + MOVE_POS_RET + +#define GET_MATCHES_FOOTER_BT(_maxLen_) \ + GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1) -#define MF_PARAMS(p) p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue +#define GET_MATCHES_FOOTER_HC(_maxLen_) \ + GET_MATCHES_FOOTER_BASE(_maxLen_, Hc_GetMatchesSpec) -#define GET_MATCHES_FOOTER(offset, maxLen) \ - offset = (unsigned)(GetMatchesSpec1((UInt32)lenLimit, curMatch, MF_PARAMS(p), \ - distances + offset, (UInt32)maxLen) - distances); MOVE_POS_RET; -#define SKIP_FOOTER \ - SkipMatchesSpec((UInt32)lenLimit, curMatch, MF_PARAMS(p)); MOVE_POS; #define UPDATE_maxLen { \ - ptrdiff_t diff = (ptrdiff_t)0 - d2; \ + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \ const Byte *c = cur + maxLen; \ const Byte *lim = cur + lenLimit; \ for (; c != lim; c++) if (*(c + diff) != *c) break; \ maxLen = (unsigned)(c - cur); } -static UInt32 Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +static UInt32* Bt2_MatchFinder_GetMatches(void *_p, UInt32 *distances) { - unsigned offset; + CMatchFinder *p = (CMatchFinder *)_p; GET_MATCHES_HEADER(2) - HASH2_CALC; + HASH2_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; - offset = 0; - GET_MATCHES_FOOTER(offset, 1) + GET_MATCHES_FOOTER_BT(1) } -UInt32 Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { - unsigned offset; GET_MATCHES_HEADER(3) - HASH_ZIP_CALC; + HASH_ZIP_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; - offset = 0; - GET_MATCHES_FOOTER(offset, 2) + GET_MATCHES_FOOTER_BT(2) } -static UInt32 Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +#define SET_mmm \ + mmm = p->cyclicBufferSize; \ + if (pos < mmm) \ + mmm = pos; + + +static UInt32* Bt3_MatchFinder_GetMatches(void *_p, UInt32 *distances) { + CMatchFinder *p = (CMatchFinder *)_p; + UInt32 mmm; UInt32 h2, d2, pos; - unsigned maxLen, offset; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(3) - HASH3_CALC; + HASH3_CALC hash = p->hash; pos = p->pos; @@ -643,167 +1195,180 @@ static UInt32 Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) hash[h2] = pos; (hash + kFix3HashSize)[hv] = pos; + SET_mmm + maxLen = 2; - offset = 0; - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + if (d2 < mmm && *(cur - d2) == *cur) { UPDATE_maxLen 
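// UPDATE_maxLen above extends a candidate match byte by byte against the data
// at distance d2, never past lenLimit. A freestanding version of that step on
// toy data, outside the match-finder state; illustration only.

#include <stddef.h>
#include <stdio.h>

static unsigned ExtendMatch(const unsigned char *cur, size_t d2,
                            unsigned maxLen, unsigned lenLimit)
{
    const unsigned char *c = cur + maxLen;
    const unsigned char *lim = cur + lenLimit;
    for (; c != lim; c++)
        if (*(c - d2) != *c)     // compare with the byte d2 positions earlier
            break;
    return (unsigned)(c - cur);
}

int main(void)
{
    // In "abcabcabx", the position of the second 'a' (offset 3) matches the
    // start of the buffer at distance 3 for 5 bytes before 'x' breaks the run.
    const unsigned char data[] = "abcabcabx";
    printf("match length = %u\n", ExtendMatch(data + 3, 3, 0, 6));  // 5
    return 0;
}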
distances[0] = (UInt32)maxLen; distances[1] = d2 - 1; - offset = 2; + distances += 2; if (maxLen == lenLimit) { - SkipMatchesSpec((UInt32)lenLimit, curMatch, MF_PARAMS(p)); - MOVE_POS_RET; + SkipMatchesSpec(MF_PARAMS(p)); + MOVE_POS_RET } } - GET_MATCHES_FOOTER(offset, maxLen) + GET_MATCHES_FOOTER_BT(maxLen) } -static UInt32 Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +static UInt32* Bt4_MatchFinder_GetMatches(void *_p, UInt32 *distances) { + CMatchFinder *p = (CMatchFinder *)_p; + UInt32 mmm; UInt32 h2, h3, d2, d3, pos; - unsigned maxLen, offset; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(4) - HASH4_CALC; + HASH4_CALC hash = p->hash; pos = p->pos; d2 = pos - hash [h2]; d3 = pos - (hash + kFix3HashSize)[h3]; - curMatch = (hash + kFix4HashSize)[hv]; hash [h2] = pos; (hash + kFix3HashSize)[h3] = pos; (hash + kFix4HashSize)[hv] = pos; - maxLen = 0; - offset = 0; - - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) - { - maxLen = 2; - distances[0] = 2; - distances[1] = d2 - 1; - offset = 2; - } + SET_mmm + + maxLen = 3; - if (d2 != d3 && d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + for (;;) { - maxLen = 3; - distances[(size_t)offset + 1] = d3 - 1; - offset += 2; - d2 = d3; - } + if (d2 < mmm && *(cur - d2) == *cur) + { + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + // distances[-2] = 3; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; - if (offset != 0) - { UPDATE_maxLen - distances[(size_t)offset - 2] = (UInt32)maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { - SkipMatchesSpec((UInt32)lenLimit, curMatch, MF_PARAMS(p)); - MOVE_POS_RET; + SkipMatchesSpec(MF_PARAMS(p)); + MOVE_POS_RET } + break; } - if (maxLen < 3) - maxLen = 3; - - GET_MATCHES_FOOTER(offset, maxLen) + GET_MATCHES_FOOTER_BT(maxLen) } -/* -static UInt32 Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +static UInt32* Bt5_MatchFinder_GetMatches(void *_p, UInt32 *distances) { - UInt32 h2, h3, h4, d2, d3, d4, maxLen, offset, pos; + CMatchFinder *p = (CMatchFinder *)_p; + UInt32 mmm; + UInt32 h2, h3, d2, d3, pos; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(5) - HASH5_CALC; + HASH5_CALC hash = p->hash; pos = p->pos; d2 = pos - hash [h2]; d3 = pos - (hash + kFix3HashSize)[h3]; - d4 = pos - (hash + kFix4HashSize)[h4]; + // d4 = pos - (hash + kFix4HashSize)[h4]; curMatch = (hash + kFix5HashSize)[hv]; hash [h2] = pos; (hash + kFix3HashSize)[h3] = pos; - (hash + kFix4HashSize)[h4] = pos; + // (hash + kFix4HashSize)[h4] = pos; (hash + kFix5HashSize)[hv] = pos; - maxLen = 0; - offset = 0; + SET_mmm + + maxLen = 4; - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + for (;;) { - distances[0] = maxLen = 2; - distances[1] = d2 - 1; - offset = 2; - if (*(cur - d2 + 2) == cur[2]) - distances[0] = maxLen = 3; - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + if (d2 < mmm && *(cur - d2) == *cur) { - distances[2] = maxLen = 3; - distances[3] = d3 - 1; - offset = 4; + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; + d2 = d3; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; d2 = d3; } - } - 
else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) - { - distances[0] = maxLen = 3; - distances[1] = d3 - 1; - offset = 2; - d2 = d3; - } - - if (d2 != d4 && d4 < p->cyclicBufferSize - && *(cur - d4) == *cur - && *(cur - d4 + 3) == *(cur + 3)) - { - maxLen = 4; - distances[(size_t)offset + 1] = d4 - 1; - offset += 2; - d2 = d4; - } - - if (offset != 0) - { + else + break; + + distances[-2] = 3; + if (*(cur - d2 + 3) != cur[3]) + break; UPDATE_maxLen - distances[(size_t)offset - 2] = maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { - SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p)); - MOVE_POS_RET; + SkipMatchesSpec(MF_PARAMS(p)); + MOVE_POS_RET } + break; } - - if (maxLen < 4) - maxLen = 4; - GET_MATCHES_FOOTER(offset, maxLen) + GET_MATCHES_FOOTER_BT(maxLen) } -*/ -static UInt32 Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +static UInt32* Hc4_MatchFinder_GetMatches(void *_p, UInt32 *distances) { + CMatchFinder *p = (CMatchFinder *)_p; + UInt32 mmm; UInt32 h2, h3, d2, d3, pos; - unsigned maxLen, offset; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(4) - HASH4_CALC; + HASH4_CALC hash = p->hash; pos = p->pos; @@ -816,312 +1381,366 @@ static UInt32 Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) (hash + kFix3HashSize)[h3] = pos; (hash + kFix4HashSize)[hv] = pos; - maxLen = 0; - offset = 0; + SET_mmm - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) - { - maxLen = 2; - distances[0] = 2; - distances[1] = d2 - 1; - offset = 2; - } - - if (d2 != d3 && d3 < p->cyclicBufferSize && *(cur - d3) == *cur) - { - maxLen = 3; - distances[(size_t)offset + 1] = d3 - 1; - offset += 2; - d2 = d3; - } - - if (offset != 0) + maxLen = 3; + + for (;;) { + if (d2 < mmm && *(cur - d2) == *cur) + { + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + // distances[-2] = 3; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + d2 = d3; + distances[1] = d3 - 1; + distances += 2; + } + else + break; + UPDATE_maxLen - distances[(size_t)offset - 2] = (UInt32)maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS_RET; + MOVE_POS_RET } + break; } - if (maxLen < 3) - maxLen = 3; - - offset = (unsigned)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p), - distances + offset, maxLen) - (distances)); - MOVE_POS_RET + GET_MATCHES_FOOTER_HC(maxLen) } -/* -static UInt32 Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +static UInt32 * Hc5_MatchFinder_GetMatches(void *_p, UInt32 *distances) { - UInt32 h2, h3, h4, d2, d3, d4, maxLen, offset, pos + CMatchFinder *p = (CMatchFinder *)_p; + UInt32 mmm; + UInt32 h2, h3, d2, d3, pos; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(5) - HASH5_CALC; + HASH5_CALC hash = p->hash; pos = p->pos; - + d2 = pos - hash [h2]; d3 = pos - (hash + kFix3HashSize)[h3]; - d4 = pos - (hash + kFix4HashSize)[h4]; + // d4 = pos - (hash + kFix4HashSize)[h4]; curMatch = (hash + kFix5HashSize)[hv]; hash [h2] = pos; (hash + kFix3HashSize)[h3] = pos; - (hash + kFix4HashSize)[h4] = pos; + // (hash + kFix4HashSize)[h4] = pos; (hash + kFix5HashSize)[hv] = pos; - maxLen = 0; - offset = 0; + SET_mmm + + maxLen = 4; - if (d2 < p->cyclicBufferSize && *(cur - d2) == *cur) + for (;;) { - distances[0] = maxLen = 2; - distances[1] = d2 - 1; - offset = 2; - if (*(cur - d2 + 2) == cur[2]) - 
distances[0] = maxLen = 3; - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) + if (d2 < mmm && *(cur - d2) == *cur) { - distances[2] = maxLen = 3; - distances[3] = d3 - 1; - offset = 4; + distances[0] = 2; + distances[1] = d2 - 1; + distances += 2; + if (*(cur - d2 + 2) == cur[2]) + { + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; + d2 = d3; + } + else + break; + } + else if (d3 < mmm && *(cur - d3) == *cur) + { + distances[1] = d3 - 1; + distances += 2; d2 = d3; } - } - else if (d3 < p->cyclicBufferSize && *(cur - d3) == *cur) - { - distances[0] = maxLen = 3; - distances[1] = d3 - 1; - offset = 2; - d2 = d3; - } - - if (d2 != d4 && d4 < p->cyclicBufferSize - && *(cur - d4) == *cur - && *(cur - d4 + 3) == *(cur + 3)) - { - maxLen = 4; - distances[(size_t)offset + 1] = d4 - 1; - offset += 2; - d2 = d4; - } - - if (offset != 0) - { + else + break; + + distances[-2] = 3; + if (*(cur - d2 + 3) != cur[3]) + break; UPDATE_maxLen - distances[(size_t)offset - 2] = maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS_RET; + MOVE_POS_RET } + break; } - if (maxLen < 4) - maxLen = 4; - - offset = (UInt32)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p), - distances + offset, maxLen) - (distances)); - MOVE_POS_RET + GET_MATCHES_FOOTER_HC(maxLen) } -*/ -UInt32 Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) + +UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) { - unsigned offset; GET_MATCHES_HEADER(3) - HASH_ZIP_CALC; + HASH_ZIP_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; - offset = (unsigned)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p), - distances, 2) - (distances)); - MOVE_POS_RET + GET_MATCHES_FOOTER_HC(2) } -static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num) + +static void Bt2_MatchFinder_Skip(void *_p, UInt32 num) { - do + CMatchFinder *p = (CMatchFinder *)_p; + SKIP_HEADER(2) { - SKIP_HEADER(2) - HASH2_CALC; + HASH2_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do + SKIP_HEADER(3) { - SKIP_HEADER(3) - HASH_ZIP_CALC; + HASH_ZIP_CALC curMatch = p->hash[hv]; p->hash[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } -static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +static void Bt3_MatchFinder_Skip(void *_p, UInt32 num) { - do + CMatchFinder *p = (CMatchFinder *)_p; + SKIP_HEADER(3) { UInt32 h2; UInt32 *hash; - SKIP_HEADER(3) - HASH3_CALC; + HASH3_CALC hash = p->hash; curMatch = (hash + kFix3HashSize)[hv]; hash[h2] = (hash + kFix3HashSize)[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } -static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +static void Bt4_MatchFinder_Skip(void *_p, UInt32 num) { - do + CMatchFinder *p = (CMatchFinder *)_p; + SKIP_HEADER(4) { UInt32 h2, h3; UInt32 *hash; - SKIP_HEADER(4) - HASH4_CALC; + HASH4_CALC hash = p->hash; curMatch = (hash + kFix4HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = (hash + kFix4HashSize)[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } -/* -static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +static void Bt5_MatchFinder_Skip(void *_p, UInt32 num) { - do + CMatchFinder *p = (CMatchFinder *)_p; + SKIP_HEADER(5) { - UInt32 h2, h3, h4; + UInt32 h2, h3; UInt32 *hash; - SKIP_HEADER(5) - HASH5_CALC; + HASH5_CALC hash = p->hash; curMatch = (hash + 
kFix5HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = - (hash + kFix4HashSize)[h4] = + // (hash + kFix4HashSize)[h4] = (hash + kFix5HashSize)[hv] = p->pos; - SKIP_FOOTER } - while (--num != 0); + SKIP_FOOTER } -*/ -static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) + +#define HC_SKIP_HEADER(minLen) \ + do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \ + const Byte *cur; \ + UInt32 *hash; \ + UInt32 *son; \ + UInt32 pos = p->pos; \ + UInt32 num2 = num; \ + /* (p->pos == p->posLimit) is not allowed here !!! */ \ + { const UInt32 rem = p->posLimit - pos; if (num2 >= rem) num2 = rem; } \ + num -= num2; \ + { const UInt32 cycPos = p->cyclicBufferPos; \ + son = p->son + cycPos; \ + p->cyclicBufferPos = cycPos + num2; } \ + cur = p->buffer; \ + hash = p->hash; \ + do { \ + UInt32 curMatch; \ + UInt32 hv; + + +#define HC_SKIP_FOOTER \ + cur++; pos++; *son++ = curMatch; \ + } while (--num2); \ + p->buffer = cur; \ + p->pos = pos; \ + if (pos == p->posLimit) MatchFinder_CheckLimits(p); \ + }} while(num); \ + + +static void Hc4_MatchFinder_Skip(void *_p, UInt32 num) { - do - { + CMatchFinder *p = (CMatchFinder *)_p; + HC_SKIP_HEADER(4) + UInt32 h2, h3; - UInt32 *hash; - SKIP_HEADER(4) - HASH4_CALC; - hash = p->hash; + HASH4_CALC curMatch = (hash + kFix4HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = - (hash + kFix4HashSize)[hv] = p->pos; - p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS - } - while (--num != 0); + (hash + kFix4HashSize)[hv] = pos; + + HC_SKIP_FOOTER } -/* -static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) + +static void Hc5_MatchFinder_Skip(void *_p, UInt32 num) { - do - { - UInt32 h2, h3, h4; - UInt32 *hash; - SKIP_HEADER(5) - HASH5_CALC; - hash = p->hash; - curMatch = hash + kFix5HashSize)[hv]; + CMatchFinder *p = (CMatchFinder *)_p; + HC_SKIP_HEADER(5) + + UInt32 h2, h3; + HASH5_CALC + curMatch = (hash + kFix5HashSize)[hv]; hash [h2] = (hash + kFix3HashSize)[h3] = - (hash + kFix4HashSize)[h4] = - (hash + kFix5HashSize)[hv] = p->pos; - p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS - } - while (--num != 0); + // (hash + kFix4HashSize)[h4] = + (hash + kFix5HashSize)[hv] = pos; + + HC_SKIP_FOOTER } -*/ + void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) { - do - { - SKIP_HEADER(3) - HASH_ZIP_CALC; - curMatch = p->hash[hv]; - p->hash[hv] = p->pos; - p->son[p->cyclicBufferPos] = curMatch; - MOVE_POS - } - while (--num != 0); + HC_SKIP_HEADER(3) + + HASH_ZIP_CALC + curMatch = hash[hv]; + hash[hv] = pos; + + HC_SKIP_FOOTER } -void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable) + +void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable) { - vTable->Init = (Mf_Init_Func)MatchFinder_Init; - vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes; - vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos; + vTable->Init = MatchFinder_Init; + vTable->GetNumAvailableBytes = MatchFinder_GetNumAvailableBytes; + vTable->GetPointerToCurrentPos = MatchFinder_GetPointerToCurrentPos; if (!p->btMode) { - /* if (p->numHashBytes <= 4) */ + if (p->numHashBytes <= 4) { - vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip; + vTable->GetMatches = Hc4_MatchFinder_GetMatches; + vTable->Skip = Hc4_MatchFinder_Skip; } - /* else { - vTable->GetMatches = (Mf_GetMatches_Func)Hc5_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Hc5_MatchFinder_Skip; + 
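// The Bt*/Hc* functions above now return the advanced output pointer rather
// than a pair count; each record is a (length, distance - 1) pair, appended in
// order of strictly increasing length. A toy caller walking such a result,
// with a hand-written stand-in instead of a real CMatchFinder; illustration only.

#include <stdint.h>
#include <stdio.h>

// Mirrors the GetMatches calling convention above: append pairs to d and
// return the new end pointer.
typedef uint32_t * (*GetMatchesFunc)(void *object, uint32_t *d);

// Fake finder: pretends the current position matches 3 bytes at distance 7
// and 6 bytes at distance 100.
static uint32_t *ToyGetMatches(void *object, uint32_t *d)
{
    (void)object;
    d[0] = 3;  d[1] = 7 - 1;
    d[2] = 6;  d[3] = 100 - 1;
    return d + 4;
}

int main(void)
{
    uint32_t buf[16];
    GetMatchesFunc getMatches = ToyGetMatches;
    const uint32_t *end = getMatches(NULL, buf);
    const uint32_t *p;
    for (p = buf; p + 2 <= end; p += 2)
        printf("len=%u dist=%u\n", (unsigned)p[0], (unsigned)(p[1] + 1));
    return 0;
}

// The real consumer of these pairs is the encoder in LzmaEnc.c, reached
// through the IMatchFinder2 table that MatchFinder_CreateVTable fills in here.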
vTable->GetMatches = Hc5_MatchFinder_GetMatches; + vTable->Skip = Hc5_MatchFinder_Skip; } - */ } else if (p->numHashBytes == 2) { - vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip; + vTable->GetMatches = Bt2_MatchFinder_GetMatches; + vTable->Skip = Bt2_MatchFinder_Skip; } else if (p->numHashBytes == 3) { - vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip; + vTable->GetMatches = Bt3_MatchFinder_GetMatches; + vTable->Skip = Bt3_MatchFinder_Skip; } - else /* if (p->numHashBytes == 4) */ + else if (p->numHashBytes == 4) { - vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip; + vTable->GetMatches = Bt4_MatchFinder_GetMatches; + vTable->Skip = Bt4_MatchFinder_Skip; } - /* else { - vTable->GetMatches = (Mf_GetMatches_Func)Bt5_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip; + vTable->GetMatches = Bt5_MatchFinder_GetMatches; + vTable->Skip = Bt5_MatchFinder_Skip; } - */ } + + + +void LzFindPrepare(void) +{ + #ifndef FORCE_LZFIND_SATUR_SUB_128 + #ifdef USE_LZFIND_SATUR_SUB_128 + LZFIND_SATUR_SUB_CODE_FUNC f = NULL; + #ifdef MY_CPU_ARM_OR_ARM64 + { + if (CPU_IsSupported_NEON()) + { + // #pragma message ("=== LzFind NEON") + PRF(printf("\n=== LzFind NEON\n")); + f = LzFind_SaturSub_128; + } + // f = 0; // for debug + } + #else // MY_CPU_ARM_OR_ARM64 + if (CPU_IsSupported_SSE41()) + { + // #pragma message ("=== LzFind SSE41") + PRF(printf("\n=== LzFind SSE41\n")); + f = LzFind_SaturSub_128; + + #ifdef USE_LZFIND_SATUR_SUB_256 + if (CPU_IsSupported_AVX2()) + { + // #pragma message ("=== LzFind AVX2") + PRF(printf("\n=== LzFind AVX2\n")); + f = LzFind_SaturSub_256; + } + #endif + } + #endif // MY_CPU_ARM_OR_ARM64 + g_LzFind_SaturSub = f; + #endif // USE_LZFIND_SATUR_SUB_128 + #endif // FORCE_LZFIND_SATUR_SUB_128 +} + + +#undef MOVE_POS +#undef MOVE_POS_RET +#undef PRF diff --git a/src/sdk/C/LzFind.h b/src/sdk/C/LzFind.h index 42c13be..67e8a6e 100644 --- a/src/sdk/C/LzFind.h +++ b/src/sdk/C/LzFind.h @@ -1,8 +1,8 @@ /* LzFind.h -- Match finder for LZ algorithms -2017-06-10 : Igor Pavlov : Public domain */ +2024-01-22 : Igor Pavlov : Public domain */ -#ifndef __LZ_FIND_H -#define __LZ_FIND_H +#ifndef ZIP7_INC_LZ_FIND_H +#define ZIP7_INC_LZ_FIND_H #include "7zTypes.h" @@ -10,12 +10,12 @@ EXTERN_C_BEGIN typedef UInt32 CLzRef; -typedef struct _CMatchFinder +typedef struct { - Byte *buffer; + const Byte *buffer; UInt32 pos; UInt32 posLimit; - UInt32 streamPos; + UInt32 streamPos; /* wrap over Zero is allowed (streamPos < pos). 
Use (UInt32)(streamPos - pos) */ UInt32 lenLimit; UInt32 cyclicBufferPos; @@ -32,8 +32,8 @@ typedef struct _CMatchFinder UInt32 hashMask; UInt32 cutValue; - Byte *bufferBase; - ISeqInStream *stream; + Byte *bufBase; + ISeqInStreamPtr stream; UInt32 blockSize; UInt32 keepSizeBefore; @@ -43,7 +43,9 @@ typedef struct _CMatchFinder size_t directInputRem; UInt32 historySize; UInt32 fixedHashSize; - UInt32 hashSizeSum; + Byte numHashBytes_Min; + Byte numHashOutBits; + Byte _pad2_[2]; SRes result; UInt32 crc[256]; size_t numRefs; @@ -51,35 +53,69 @@ typedef struct _CMatchFinder UInt64 expectedDataSize; } CMatchFinder; -#define Inline_MatchFinder_GetPointerToCurrentPos(p) ((p)->buffer) +#define Inline_MatchFinder_GetPointerToCurrentPos(p) ((const Byte *)(p)->buffer) -#define Inline_MatchFinder_GetNumAvailableBytes(p) ((p)->streamPos - (p)->pos) +#define Inline_MatchFinder_GetNumAvailableBytes(p) ((UInt32)((p)->streamPos - (p)->pos)) +/* #define Inline_MatchFinder_IsFinishedOK(p) \ ((p)->streamEndWasReached \ && (p)->streamPos == (p)->pos \ && (!(p)->directInput || (p)->directInputRem == 0)) +*/ int MatchFinder_NeedMove(CMatchFinder *p); -Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p); +/* Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p); */ void MatchFinder_MoveBlock(CMatchFinder *p); void MatchFinder_ReadIfRequired(CMatchFinder *p); void MatchFinder_Construct(CMatchFinder *p); -/* Conditions: - historySize <= 3 GB - keepAddBufferBefore + matchMaxLen + keepAddBufferAfter < 511MB +/* (directInput = 0) is default value. + It's required to provide correct (directInput) value + before calling MatchFinder_Create(). + You can set (directInput) by any of the following calls: + - MatchFinder_SET_DIRECT_INPUT_BUF() + - MatchFinder_SET_STREAM() + - MatchFinder_SET_STREAM_MODE() +*/ + +#define MatchFinder_SET_DIRECT_INPUT_BUF(p, _src_, _srcLen_) { \ + (p)->stream = NULL; \ + (p)->directInput = 1; \ + (p)->buffer = (_src_); \ + (p)->directInputRem = (_srcLen_); } + +/* +#define MatchFinder_SET_STREAM_MODE(p) { \ + (p)->directInput = 0; } */ + +#define MatchFinder_SET_STREAM(p, _stream_) { \ + (p)->stream = _stream_; \ + (p)->directInput = 0; } + + int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, ISzAllocPtr alloc); void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc); void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems); -void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue); + +/* +#define MatchFinder_INIT_POS(p, val) \ + (p)->pos = (val); \ + (p)->streamPos = (val); +*/ + +// void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue); +#define MatchFinder_REDUCE_OFFSETS(p, subValue) \ + (p)->pos -= (subValue); \ + (p)->streamPos -= (subValue); + UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *buffer, CLzRef *son, - UInt32 _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue, UInt32 *distances, UInt32 maxLen); /* @@ -91,31 +127,34 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt typedef void (*Mf_Init_Func)(void *object); typedef UInt32 (*Mf_GetNumAvailableBytes_Func)(void *object); typedef const Byte * (*Mf_GetPointerToCurrentPos_Func)(void *object); -typedef UInt32 (*Mf_GetMatches_Func)(void *object, UInt32 *distances); +typedef UInt32 * (*Mf_GetMatches_Func)(void *object, UInt32 *distances); typedef void 
(*Mf_Skip_Func)(void *object, UInt32); -typedef struct _IMatchFinder +typedef struct { Mf_Init_Func Init; Mf_GetNumAvailableBytes_Func GetNumAvailableBytes; Mf_GetPointerToCurrentPos_Func GetPointerToCurrentPos; Mf_GetMatches_Func GetMatches; Mf_Skip_Func Skip; -} IMatchFinder; +} IMatchFinder2; -void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable); +void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable); void MatchFinder_Init_LowHash(CMatchFinder *p); void MatchFinder_Init_HighHash(CMatchFinder *p); -void MatchFinder_Init_3(CMatchFinder *p, int readData); -void MatchFinder_Init(CMatchFinder *p); +void MatchFinder_Init_4(CMatchFinder *p); +// void MatchFinder_Init(CMatchFinder *p); +void MatchFinder_Init(void *p); -UInt32 Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); -UInt32 Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); +UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); +UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num); void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num); +void LzFindPrepare(void); + EXTERN_C_END #endif diff --git a/src/sdk/C/LzFindMt.c b/src/sdk/C/LzFindMt.c index bb0f42c..25fcc46 100644 --- a/src/sdk/C/LzFindMt.c +++ b/src/sdk/C/LzFindMt.c @@ -1,97 +1,217 @@ /* LzFindMt.c -- multithreaded Match finder for LZ algorithms -2018-12-29 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" -#include "LzHash.h" +// #include + +#include "CpuArch.h" +#include "LzHash.h" #include "LzFindMt.h" +// #define LOG_ITERS + +// #define LOG_THREAD + +#ifdef LOG_THREAD +#include +#define PRF(x) x +#else +#define PRF(x) +#endif + +#ifdef LOG_ITERS +#include +extern UInt64 g_NumIters_Tree; +extern UInt64 g_NumIters_Loop; +extern UInt64 g_NumIters_Bytes; +#define LOG_ITER(x) x +#else +#define LOG_ITER(x) +#endif + +#define kMtHashBlockSize ((UInt32)1 << 17) +#define kMtHashNumBlocks (1 << 1) + +#define GET_HASH_BLOCK_OFFSET(i) (((i) & (kMtHashNumBlocks - 1)) * kMtHashBlockSize) + +#define kMtBtBlockSize ((UInt32)1 << 16) +#define kMtBtNumBlocks (1 << 4) + +#define GET_BT_BLOCK_OFFSET(i) (((i) & (kMtBtNumBlocks - 1)) * (size_t)kMtBtBlockSize) + +/* + HASH functions: + We use raw 8/16 bits from a[1] and a[2], + xored with crc(a[0]) and crc(a[3]). + We check a[0], a[3] only. We don't need to compare a[1] and a[2] in matches. 
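// kMtHashNumBlocks and kMtBtNumBlocks above are powers of two so that a block
// counter that only ever grows can be folded into the ring simply by masking
// with (numBlocks - 1), which is all GET_HASH_BLOCK_OFFSET and
// GET_BT_BLOCK_OFFSET do. A sketch reusing those constants; illustration only.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define kMtHashBlockSize ((uint32_t)1 << 17)
#define kMtHashNumBlocks (1 << 1)
#define GET_HASH_BLOCK_OFFSET(i) (((i) & (kMtHashNumBlocks - 1)) * kMtHashBlockSize)

#define kMtBtBlockSize ((uint32_t)1 << 16)
#define kMtBtNumBlocks (1 << 4)
#define GET_BT_BLOCK_OFFSET(i) (((i) & (kMtBtNumBlocks - 1)) * (size_t)kMtBtBlockSize)

int main(void)
{
    uint32_t i;
    for (i = 0; i < 5; i++)
        printf("block %u -> hash offset %u, bt offset %zu\n",
               (unsigned)i,
               (unsigned)GET_HASH_BLOCK_OFFSET(i),
               GET_BT_BLOCK_OFFSET(i));
    // Hash offsets alternate 0 / 131072; bt offsets cycle through 16 slots
    // spaced 65536 apart.
    return 0;
}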
+ our crc() function provides one-to-one correspondence for low 8-bit values: + (crc[0...0xFF] & 0xFF) <-> [0...0xFF] +*/ + +#define MF(mt) ((mt)->MatchFinder) +#define MF_CRC (p->crc) + +// #define MF(mt) (&(mt)->MatchFinder) +// #define MF_CRC (p->MatchFinder.crc) + +#define MT_HASH2_CALC \ + h2 = (MF_CRC[cur[0]] ^ cur[1]) & (kHash2Size - 1); + +#define MT_HASH3_CALC { \ + UInt32 temp = MF_CRC[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); } + +/* +#define MT_HASH3_CALC__NO_2 { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); } + +#define MT_HASH4_CALC { \ + UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ + h2 = temp & (kHash2Size - 1); \ + temp ^= ((UInt32)cur[2] << 8); \ + h3 = temp & (kHash3Size - 1); \ + h4 = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hash4Mask; } + // (kHash4Size - 1); +*/ + + +Z7_NO_INLINE static void MtSync_Construct(CMtSync *p) { + p->affinityGroup = -1; + p->affinityInGroup = 0; + p->affinity = 0; p->wasCreated = False; p->csWasInitialized = False; p->csWasEntered = False; - Thread_Construct(&p->thread); + Thread_CONSTRUCT(&p->thread) Event_Construct(&p->canStart); - Event_Construct(&p->wasStarted); Event_Construct(&p->wasStopped); Semaphore_Construct(&p->freeSemaphore); Semaphore_Construct(&p->filledSemaphore); } -static void MtSync_GetNextBlock(CMtSync *p) + +// #define DEBUG_BUFFER_LOCK // define it to debug lock state + +#ifdef DEBUG_BUFFER_LOCK +#include +#define BUFFER_MUST_BE_LOCKED(p) if (!(p)->csWasEntered) exit(1); +#define BUFFER_MUST_BE_UNLOCKED(p) if ( (p)->csWasEntered) exit(1); +#else +#define BUFFER_MUST_BE_LOCKED(p) +#define BUFFER_MUST_BE_UNLOCKED(p) +#endif + +#define LOCK_BUFFER(p) { \ + BUFFER_MUST_BE_UNLOCKED(p); \ + CriticalSection_Enter(&(p)->cs); \ + (p)->csWasEntered = True; } + +#define UNLOCK_BUFFER(p) { \ + BUFFER_MUST_BE_LOCKED(p); \ + CriticalSection_Leave(&(p)->cs); \ + (p)->csWasEntered = False; } + + +Z7_NO_INLINE +static UInt32 MtSync_GetNextBlock(CMtSync *p) { + UInt32 numBlocks = 0; if (p->needStart) { + BUFFER_MUST_BE_UNLOCKED(p) p->numProcessedBlocks = 1; p->needStart = False; p->stopWriting = False; p->exit = False; - Event_Reset(&p->wasStarted); Event_Reset(&p->wasStopped); - Event_Set(&p->canStart); - Event_Wait(&p->wasStarted); - - // if (mt) MatchFinder_Init_LowHash(mt->MatchFinder); } else { - CriticalSection_Leave(&p->cs); - p->csWasEntered = False; - p->numProcessedBlocks++; + UNLOCK_BUFFER(p) + // we free current block + numBlocks = p->numProcessedBlocks++; Semaphore_Release1(&p->freeSemaphore); } + + // buffer is UNLOCKED here Semaphore_Wait(&p->filledSemaphore); - CriticalSection_Enter(&p->cs); - p->csWasEntered = True; + LOCK_BUFFER(p) + return numBlocks; } -/* MtSync_StopWriting must be called if Writing was started */ +/* if Writing (Processing) thread was started, we must call MtSync_StopWriting() */ + +Z7_NO_INLINE static void MtSync_StopWriting(CMtSync *p) { - UInt32 myNumBlocks = p->numProcessedBlocks; if (!Thread_WasCreated(&p->thread) || p->needStart) return; - p->stopWriting = True; + + PRF(printf("\nMtSync_StopWriting %p\n", p)); + if (p->csWasEntered) { - CriticalSection_Leave(&p->cs); - p->csWasEntered = False; + /* we don't use buffer in this thread after StopWriting(). + So we UNLOCK buffer. 
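// MtSync coordinates the hash and tree threads with two counting semaphores
// over a small ring of blocks: the producer waits on freeSemaphore before
// filling a block and releases filledSemaphore afterwards, while the consumer
// in MtSync_GetNextBlock does the reverse. A stripped-down sketch of that
// handshake using plain POSIX threads and semaphores instead of the SDK's
// Threads.c wrappers, with the stop/exit and critical-section details left
// out; illustration only (link with -pthread).

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

#define NUM_BLOCKS 2
#define NUM_ITEMS  8

static int blocks[NUM_BLOCKS];
static sem_t freeSem;    // counts empty slots, starts at NUM_BLOCKS
static sem_t filledSem;  // counts filled slots, starts at 0

static void *producer(void *arg)
{
    int i;
    (void)arg;
    for (i = 0; i < NUM_ITEMS; i++)
    {
        sem_wait(&freeSem);           // like Semaphore_Wait(&p->freeSemaphore)
        blocks[i % NUM_BLOCKS] = i;   // fill the block
        sem_post(&filledSem);         // like Semaphore_Release1(&p->filledSemaphore)
    }
    return NULL;
}

int main(void)
{
    pthread_t t;
    int i;
    sem_init(&freeSem, 0, NUM_BLOCKS);
    sem_init(&filledSem, 0, 0);
    pthread_create(&t, NULL, producer, NULL);
    for (i = 0; i < NUM_ITEMS; i++)
    {
        sem_wait(&filledSem);                     // consumer side of MtSync_GetNextBlock
        printf("consumed %d\n", blocks[i % NUM_BLOCKS]);
        sem_post(&freeSem);                       // hand the block back to the producer
    }
    pthread_join(t, NULL);
    sem_destroy(&freeSem);
    sem_destroy(&filledSem);
    return 0;
}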
+ And we restore default UNLOCKED state for stopped thread */ + UNLOCK_BUFFER(p) } - Semaphore_Release1(&p->freeSemaphore); - + + /* We send (p->stopWriting) message and release freeSemaphore + to free current block. + So the thread will see (p->stopWriting) at some + iteration after Wait(freeSemaphore). + The thread doesn't need to fill all avail free blocks, + so we can get fast thread stop. + */ + + p->stopWriting = True; + Semaphore_Release1(&p->freeSemaphore); // check semaphore count !!! + + PRF(printf("\nMtSync_StopWriting %p : Event_Wait(&p->wasStopped)\n", p)); Event_Wait(&p->wasStopped); + PRF(printf("\nMtSync_StopWriting %p : Event_Wait() finsihed\n", p)); + + /* 21.03 : we don't restore samaphore counters here. + We will recreate and reinit samaphores in next start */ - while (myNumBlocks++ != p->numProcessedBlocks) - { - Semaphore_Wait(&p->filledSemaphore); - Semaphore_Release1(&p->freeSemaphore); - } p->needStart = True; } + +Z7_NO_INLINE static void MtSync_Destruct(CMtSync *p) { + PRF(printf("\nMtSync_Destruct %p\n", p)); + if (Thread_WasCreated(&p->thread)) { + /* we want thread to be in Stopped state before sending EXIT command. + note: stop(btSync) will stop (htSync) also */ MtSync_StopWriting(p); + /* thread in Stopped state here : (p->needStart == true) */ p->exit = True; - if (p->needStart) - Event_Set(&p->canStart); - Thread_Wait(&p->thread); - Thread_Close(&p->thread); + // if (p->needStart) // it's (true) + Event_Set(&p->canStart); // we send EXIT command to thread + Thread_Wait_Close(&p->thread); // we wait thread finishing } + if (p->csWasInitialized) { CriticalSection_Delete(&p->cs); p->csWasInitialized = False; } + p->csWasEntered = False; Event_Close(&p->canStart); - Event_Close(&p->wasStarted); Event_Close(&p->wasStopped); Semaphore_Close(&p->freeSemaphore); Semaphore_Close(&p->filledSemaphore); @@ -99,80 +219,257 @@ static void MtSync_Destruct(CMtSync *p) p->wasCreated = False; } -#define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; } -static SRes MtSync_Create2(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj, UInt32 numBlocks) +// #define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; } +// we want to get real system error codes here instead of SZ_ERROR_THREAD +#define RINOK_THREAD(x) RINOK_WRes(x) + + +// call it before each new file (when new starting is required): +Z7_NO_INLINE +static SRes MtSync_Init(CMtSync *p, UInt32 numBlocks) +{ + WRes wres; + // BUFFER_MUST_BE_UNLOCKED(p) + if (!p->needStart || p->csWasEntered) + return SZ_ERROR_FAIL; + wres = Semaphore_OptCreateInit(&p->freeSemaphore, numBlocks, numBlocks); + if (wres == 0) + wres = Semaphore_OptCreateInit(&p->filledSemaphore, 0, numBlocks); + return MY_SRes_HRESULT_FROM_WRes(wres); +} + + +static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj) { + WRes wres; + if (p->wasCreated) return SZ_OK; - RINOK_THREAD(CriticalSection_Init(&p->cs)); + RINOK_THREAD(CriticalSection_Init(&p->cs)) p->csWasInitialized = True; + p->csWasEntered = False; - RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->canStart)); - RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStarted)); - RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStopped)); - - RINOK_THREAD(Semaphore_Create(&p->freeSemaphore, numBlocks, numBlocks)); - RINOK_THREAD(Semaphore_Create(&p->filledSemaphore, 0, numBlocks)); + RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->canStart)) + RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStopped)) p->needStart = True; - - 
RINOK_THREAD(Thread_Create(&p->thread, startAddress, obj)); + p->exit = True; /* p->exit is unused before (canStart) Event. + But in case of some unexpected code failure we will get fast exit from thread */ + + // return ERROR_TOO_MANY_POSTS; // for debug + // return EINVAL; // for debug + +#ifdef _WIN32 + if (p->affinityGroup >= 0) + wres = Thread_Create_With_Group(&p->thread, startAddress, obj, + (unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup); + else +#endif + if (p->affinity != 0) + wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity); + else + wres = Thread_Create(&p->thread, startAddress, obj); + + RINOK_THREAD(wres) p->wasCreated = True; return SZ_OK; } -static SRes MtSync_Create(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj, UInt32 numBlocks) + +Z7_NO_INLINE +static SRes MtSync_Create(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj) { - SRes res = MtSync_Create2(p, startAddress, obj, numBlocks); - if (res != SZ_OK) - MtSync_Destruct(p); - return res; + const WRes wres = MtSync_Create_WRes(p, startAddress, obj); + if (wres == 0) + return 0; + MtSync_Destruct(p); + return MY_SRes_HRESULT_FROM_WRes(wres); } -void MtSync_Init(CMtSync *p) { p->needStart = True; } + +// ---------- HASH THREAD ---------- #define kMtMaxValForNormalize 0xFFFFFFFF +// #define kMtMaxValForNormalize ((1 << 21)) // for debug +// #define kNormalizeAlign (1 << 7) // alignment for speculated accesses -#define DEF_GetHeads2(name, v, action) \ - static void GetHeads ## name(const Byte *p, UInt32 pos, \ - UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc) \ - { action; for (; numHeads != 0; numHeads--) { \ - const UInt32 value = (v); p++; *heads++ = pos - hash[value]; hash[value] = pos++; } } +#ifdef MY_CPU_LE_UNALIGN + #define GetUi24hi_from32(p) ((UInt32)GetUi32(p) >> 8) +#else + #define GetUi24hi_from32(p) ((p)[1] ^ ((UInt32)(p)[2] << 8) ^ ((UInt32)(p)[3] << 16)) +#endif + +#define GetHeads_DECL(name) \ + static void GetHeads ## name(const Byte *p, UInt32 pos, \ + UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc) +#define GetHeads_LOOP(v) \ + for (; numHeads != 0; numHeads--) { \ + const UInt32 value = (v); \ + p++; \ + *heads++ = pos - hash[value]; \ + hash[value] = pos++; } + +#define DEF_GetHeads2(name, v, action) \ + GetHeads_DECL(name) { action \ + GetHeads_LOOP(v) } + #define DEF_GetHeads(name, v) DEF_GetHeads2(name, v, ;) -DEF_GetHeads2(2, (p[0] | ((UInt32)p[1] << 8)), UNUSED_VAR(hashMask); UNUSED_VAR(crc); ) -DEF_GetHeads(3, (crc[p[0]] ^ p[1] ^ ((UInt32)p[2] << 8)) & hashMask) -DEF_GetHeads(4, (crc[p[0]] ^ p[1] ^ ((UInt32)p[2] << 8) ^ (crc[p[3]] << 5)) & hashMask) -DEF_GetHeads(4b, (crc[p[0]] ^ p[1] ^ ((UInt32)p[2] << 8) ^ ((UInt32)p[3] << 16)) & hashMask) -/* DEF_GetHeads(5, (crc[p[0]] ^ p[1] ^ ((UInt32)p[2] << 8) ^ (crc[p[3]] << 5) ^ (crc[p[4]] << 3)) & hashMask) */ +DEF_GetHeads2(2, GetUi16(p), UNUSED_VAR(hashMask); UNUSED_VAR(crc); ) +DEF_GetHeads(3, (crc[p[0]] ^ GetUi16(p + 1)) & hashMask) +DEF_GetHeads2(3b, GetUi16(p) ^ ((UInt32)(p)[2] << 16), UNUSED_VAR(hashMask); UNUSED_VAR(crc); ) +// BT3 is not good for crc collisions for big hashMask values. 
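// Every GetHeads variant above runs the same inner loop: hash a few leading
// bytes, report the distance to the previous position that produced the same
// hash value (the "head"), then overwrite the table entry with the current
// position. A toy version using a plain 2-byte value in place of the CRC-mixed
// hashes, so it roughly corresponds to GetHeads2; illustration only.

#include <stdint.h>
#include <stdio.h>

#define HASH_BITS 16
#define HASH_SIZE (1u << HASH_BITS)

// GetHeads_LOOP in miniature: heads[i] = pos - hash[value]; hash[value] = pos.
// A head equal to the current pos means the slot was still empty, because the
// table starts zero-filled.
static void GetHeadsToy(const unsigned char *p, uint32_t pos,
                        uint32_t *hash, uint32_t *heads, uint32_t numHeads)
{
    for (; numHeads != 0; numHeads--)
    {
        const uint32_t value = (uint32_t)p[0] | ((uint32_t)p[1] << 8); // GetUi16-style
        p++;
        *heads++ = pos - hash[value];
        hash[value] = pos++;
    }
}

int main(void)
{
    static uint32_t hash[HASH_SIZE];        // zero-initialized
    uint32_t heads[8];
    const unsigned char data[] = "abababXY";
    uint32_t i;
    GetHeadsToy(data, 100, hash, heads, 6); // positions 100..105
    for (i = 0; i < 6; i++)
        printf("pos %u: head = %u\n", (unsigned)(100 + i), (unsigned)heads[i]);
    // The repeated "ab" / "ba" pairs report distance 2; first-seen pairs
    // report the position itself because their slot was empty.
    return 0;
}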
+ +/* +GetHeads_DECL(3b) +{ + UNUSED_VAR(hashMask); + UNUSED_VAR(crc); + { + const Byte *pLim = p + numHeads; + if (numHeads == 0) + return; + pLim--; + while (p < pLim) + { + UInt32 v1 = GetUi32(p); + UInt32 v0 = v1 & 0xFFFFFF; + UInt32 h0, h1; + p += 2; + v1 >>= 8; + h0 = hash[v0]; hash[v0] = pos; heads[0] = pos - h0; pos++; + h1 = hash[v1]; hash[v1] = pos; heads[1] = pos - h1; pos++; + heads += 2; + } + if (p == pLim) + { + UInt32 v0 = GetUi16(p) ^ ((UInt32)(p)[2] << 16); + *heads = pos - hash[v0]; + hash[v0] = pos; + } + } +} +*/ + +/* +GetHeads_DECL(4) +{ + unsigned sh = 0; + UNUSED_VAR(crc) + while ((hashMask & 0x80000000) == 0) + { + hashMask <<= 1; + sh++; + } + GetHeads_LOOP((GetUi32(p) * 0xa54a1) >> sh) +} +#define GetHeads4b GetHeads4 +*/ + +#define USE_GetHeads_LOCAL_CRC + +#ifdef USE_GetHeads_LOCAL_CRC + +GetHeads_DECL(4) +{ + UInt32 crc0[256]; + UInt32 crc1[256]; + { + unsigned i; + for (i = 0; i < 256; i++) + { + UInt32 v = crc[i]; + crc0[i] = v & hashMask; + crc1[i] = (v << kLzHash_CrcShift_1) & hashMask; + // crc1[i] = rotlFixed(v, 8) & hashMask; + } + } + GetHeads_LOOP(crc0[p[0]] ^ crc1[p[3]] ^ (UInt32)GetUi16(p+1)) +} + +GetHeads_DECL(4b) +{ + UInt32 crc0[256]; + { + unsigned i; + for (i = 0; i < 256; i++) + crc0[i] = crc[i] & hashMask; + } + GetHeads_LOOP(crc0[p[0]] ^ GetUi24hi_from32(p)) +} + +GetHeads_DECL(5) +{ + UInt32 crc0[256]; + UInt32 crc1[256]; + UInt32 crc2[256]; + { + unsigned i; + for (i = 0; i < 256; i++) + { + UInt32 v = crc[i]; + crc0[i] = v & hashMask; + crc1[i] = (v << kLzHash_CrcShift_1) & hashMask; + crc2[i] = (v << kLzHash_CrcShift_2) & hashMask; + } + } + GetHeads_LOOP(crc0[p[0]] ^ crc1[p[3]] ^ crc2[p[4]] ^ (UInt32)GetUi16(p+1)) +} + +GetHeads_DECL(5b) +{ + UInt32 crc0[256]; + UInt32 crc1[256]; + { + unsigned i; + for (i = 0; i < 256; i++) + { + UInt32 v = crc[i]; + crc0[i] = v & hashMask; + crc1[i] = (v << kLzHash_CrcShift_1) & hashMask; + } + } + GetHeads_LOOP(crc0[p[0]] ^ crc1[p[4]] ^ GetUi24hi_from32(p)) +} + +#else + +DEF_GetHeads(4, (crc[p[0]] ^ (crc[p[3]] << kLzHash_CrcShift_1) ^ (UInt32)GetUi16(p+1)) & hashMask) +DEF_GetHeads(4b, (crc[p[0]] ^ GetUi24hi_from32(p)) & hashMask) +DEF_GetHeads(5, (crc[p[0]] ^ (crc[p[3]] << kLzHash_CrcShift_1) ^ (crc[p[4]] << kLzHash_CrcShift_2) ^ (UInt32)GetUi16(p + 1)) & hashMask) +DEF_GetHeads(5b, (crc[p[0]] ^ (crc[p[4]] << kLzHash_CrcShift_1) ^ GetUi24hi_from32(p)) & hashMask) + +#endif + static void HashThreadFunc(CMatchFinderMt *mt) { CMtSync *p = &mt->hashSync; + PRF(printf("\nHashThreadFunc\n")); + for (;;) { - UInt32 numProcessedBlocks = 0; + UInt32 blockIndex = 0; + PRF(printf("\nHashThreadFunc : Event_Wait(&p->canStart)\n")); Event_Wait(&p->canStart); - Event_Set(&p->wasStarted); + PRF(printf("\nHashThreadFunc : Event_Wait(&p->canStart) : after \n")); + if (p->exit) + { + PRF(printf("\nHashThreadFunc : exit \n")); + return; + } - MatchFinder_Init_HighHash(mt->MatchFinder); + MatchFinder_Init_HighHash(MF(mt)); for (;;) { - if (p->exit) - return; - if (p->stopWriting) - { - p->numProcessedBlocks = numProcessedBlocks; - Event_Set(&p->wasStopped); - break; - } + PRF(printf("Hash thread block = %d pos = %d\n", (unsigned)blockIndex, mt->MatchFinder->pos)); { - CMatchFinder *mf = mt->MatchFinder; + CMatchFinder *mf = MF(mt); if (MatchFinder_NeedMove(mf)) { CriticalSection_Enter(&mt->btSync.cs); @@ -185,194 +482,178 @@ static void HashThreadFunc(CMatchFinderMt *mt) mt->pointerToCurPos -= offset; mt->buffer -= offset; } - CriticalSection_Leave(&mt->btSync.cs); CriticalSection_Leave(&mt->hashSync.cs); + 
CriticalSection_Leave(&mt->btSync.cs); continue; } Semaphore_Wait(&p->freeSemaphore); + if (p->exit) // exit is unexpected here. But we check it here for some failure case + return; + + // for faster stop : we check (p->stopWriting) after Wait(freeSemaphore) + if (p->stopWriting) + break; + MatchFinder_ReadIfRequired(mf); - if (mf->pos > (kMtMaxValForNormalize - kMtHashBlockSize)) - { - UInt32 subValue = (mf->pos - mf->historySize - 1); - MatchFinder_ReduceOffsets(mf, subValue); - MatchFinder_Normalize3(subValue, mf->hash + mf->fixedHashSize, (size_t)mf->hashMask + 1); - } { - UInt32 *heads = mt->hashBuf + ((numProcessedBlocks++) & kMtHashNumBlocksMask) * kMtHashBlockSize; - UInt32 num = mf->streamPos - mf->pos; + UInt32 *heads = mt->hashBuf + GET_HASH_BLOCK_OFFSET(blockIndex++); + UInt32 num = Inline_MatchFinder_GetNumAvailableBytes(mf); heads[0] = 2; heads[1] = num; + + /* heads[1] contains the number of avail bytes: + if (avail < mf->numHashBytes) : + { + it means that stream was finished + HASH_THREAD and BT_TREAD must move position for heads[1] (avail) bytes. + HASH_THREAD doesn't stop, + HASH_THREAD fills only the header (2 numbers) for all next blocks: + {2, NumHashBytes - 1}, {2,0}, {2,0}, ... , {2,0} + } + else + { + HASH_THREAD and BT_TREAD must move position for (heads[0] - 2) bytes; + } + */ + if (num >= mf->numHashBytes) { num = num - mf->numHashBytes + 1; if (num > kMtHashBlockSize - 2) num = kMtHashBlockSize - 2; - mt->GetHeadsFunc(mf->buffer, mf->pos, mf->hash + mf->fixedHashSize, mf->hashMask, heads + 2, num, mf->crc); + + if (mf->pos > (UInt32)kMtMaxValForNormalize - num) + { + const UInt32 subValue = (mf->pos - mf->historySize - 1); // & ~(UInt32)(kNormalizeAlign - 1); + MatchFinder_REDUCE_OFFSETS(mf, subValue) + MatchFinder_Normalize3(subValue, mf->hash + mf->fixedHashSize, (size_t)mf->hashMask + 1); + } + heads[0] = 2 + num; + mt->GetHeadsFunc(mf->buffer, mf->pos, mf->hash + mf->fixedHashSize, mf->hashMask, heads + 2, num, mf->crc); } - mf->pos += num; + + mf->pos += num; // wrap over zero is allowed at the end of stream mf->buffer += num; } } Semaphore_Release1(&p->filledSemaphore); - } - } -} + } // for() processing end -static void MatchFinderMt_GetNextBlock_Hash(CMatchFinderMt *p) -{ - MtSync_GetNextBlock(&p->hashSync); - p->hashBufPosLimit = p->hashBufPos = ((p->hashSync.numProcessedBlocks - 1) & kMtHashNumBlocksMask) * kMtHashBlockSize; - p->hashBufPosLimit += p->hashBuf[p->hashBufPos++]; - p->hashNumAvail = p->hashBuf[p->hashBufPos++]; + // p->numBlocks_Sent = blockIndex; + Event_Set(&p->wasStopped); + } // for() thread end } -#define kEmptyHashValue 0 + + + +// ---------- BT THREAD ---------- + +/* we use one variable instead of two (cyclicBufferPos == pos) before CyclicBuf wrap. + here we define fixed offset of (p->pos) from (p->cyclicBufferPos) */ +#define CYC_TO_POS_OFFSET 0 +// #define CYC_TO_POS_OFFSET 1 // for debug #define MFMT_GM_INLINE #ifdef MFMT_GM_INLINE /* - we use size_t for _cyclicBufferPos instead of UInt32 + we use size_t for (pos) instead of UInt32 to eliminate "movsx" BUG in old MSVC x64 compiler. 
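// The hash blocks exchanged between the two threads carry a two-word header:
// heads[0] is 2 plus the number of head entries in the block, heads[1] is the
// number of bytes still available (it drops below numHashBytes only at the end
// of the stream, which is how the tree side detects it). A toy producer and
// consumer for that layout, with a tiny block size and dummy head values;
// illustration only.

#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE 16   // stand-in for kMtHashBlockSize (1 << 17 in this file)

// Producer side, as in HashThreadFunc above: header first, then the entries.
static void FillBlock(uint32_t *heads, uint32_t avail, uint32_t numHashBytes)
{
    heads[0] = 2;
    heads[1] = avail;
    if (avail >= numHashBytes)
    {
        uint32_t num = avail - numHashBytes + 1;  // positions that can be hashed
        uint32_t i;
        if (num > BLOCK_SIZE - 2)
            num = BLOCK_SIZE - 2;
        heads[0] = 2 + num;
        for (i = 0; i < num; i++)
            heads[2 + i] = 1 + i;                 // dummy head values
    }
}

// Consumer side, as BtGetMatches does further down: heads[0] bounds the block,
// heads[1] is the available-bytes count.
static void ReadBlock(const uint32_t *heads)
{
    printf("entries=%u avail=%u\n", (unsigned)(heads[0] - 2), (unsigned)heads[1]);
}

int main(void)
{
    uint32_t block[BLOCK_SIZE];
    FillBlock(block, 10, 4);  // normal block: 10 bytes available, 4-byte hash
    ReadBlock(block);         // entries=7 avail=10
    FillBlock(block, 3, 4);   // end of stream: fewer bytes than numHashBytes
    ReadBlock(block);         // entries=0 avail=3
    return 0;
}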
*/ -MY_NO_INLINE -static UInt32 *GetMatchesSpecN(UInt32 lenLimit, UInt32 pos, const Byte *cur, CLzRef *son, - size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue, - UInt32 *distances, UInt32 _maxLen, const UInt32 *hash, const UInt32 *limit, UInt32 size, UInt32 *posRes) -{ - do - { - UInt32 *_distances = ++distances; - UInt32 delta = *hash++; - - CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; - CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); - unsigned len0 = 0, len1 = 0; - UInt32 cutValue = _cutValue; - unsigned maxLen = (unsigned)_maxLen; - /* - if (size > 1) - { - UInt32 delta = *hash; - if (delta < _cyclicBufferSize) - { - UInt32 cyc1 = _cyclicBufferPos + 1; - CLzRef *pair = son + ((size_t)(cyc1 - delta + ((delta > cyc1) ? _cyclicBufferSize : 0)) << 1); - Byte b = *(cur + 1 - delta); - _distances[0] = pair[0]; - _distances[1] = b; - } - } - */ - if (cutValue == 0 || delta >= _cyclicBufferSize) - { - *ptr0 = *ptr1 = kEmptyHashValue; - } - else - for(;;) - { - { - CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((_cyclicBufferPos < delta) ? _cyclicBufferSize : 0)) << 1); - const Byte *pb = cur - delta; - unsigned len = (len0 < len1 ? len0 : len1); - UInt32 pair0 = *pair; - if (pb[len] == cur[len]) - { - if (++len != lenLimit && pb[len] == cur[len]) - while (++len != lenLimit) - if (pb[len] != cur[len]) - break; - if (maxLen < len) - { - maxLen = len; - *distances++ = (UInt32)len; - *distances++ = delta - 1; - if (len == lenLimit) - { - UInt32 pair1 = pair[1]; - *ptr1 = pair0; - *ptr0 = pair1; - break; - } - } - } - { - UInt32 curMatch = pos - delta; - // delta = pos - *pair; - // delta = pos - pair[((UInt32)pb[len] - (UInt32)cur[len]) >> 31]; - if (pb[len] < cur[len]) - { - delta = pos - pair[1]; - *ptr1 = curMatch; - ptr1 = pair + 1; - len1 = len; - } - else - { - delta = pos - *pair; - *ptr0 = curMatch; - ptr0 = pair; - len0 = len; - } - } - } - if (--cutValue == 0 || delta >= _cyclicBufferSize) - { - *ptr0 = *ptr1 = kEmptyHashValue; - break; - } - } - pos++; - _cyclicBufferPos++; - cur++; - { - UInt32 num = (UInt32)(distances - _distances); - _distances[-1] = num; - } - } - while (distances < limit && --size != 0); - *posRes = pos; - return distances; -} +UInt32 * Z7_FASTCALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes); #endif - -static void BtGetMatches(CMatchFinderMt *p, UInt32 *distances) +static void BtGetMatches(CMatchFinderMt *p, UInt32 *d) { UInt32 numProcessed = 0; UInt32 curPos = 2; - UInt32 limit = kMtBtBlockSize - (p->matchMaxLen * 2); // * 2 - distances[1] = p->hashNumAvail; + /* GetMatchesSpec() functions don't create (len = 1) + in [len, dist] match pairs, if (p->numHashBytes >= 2) + Also we suppose here that (matchMaxLen >= 2). 
+ So the following code for (reserve) is not required + UInt32 reserve = (p->matchMaxLen * 2); + const UInt32 kNumHashBytes_Max = 5; // BT_HASH_BYTES_MAX + if (reserve < kNumHashBytes_Max - 1) + reserve = kNumHashBytes_Max - 1; + const UInt32 limit = kMtBtBlockSize - (reserve); + */ + + const UInt32 limit = kMtBtBlockSize - (p->matchMaxLen * 2); + + d[1] = p->hashNumAvail; + + if (p->failure_BT) + { + // printf("\n == 1 BtGetMatches() p->failure_BT\n"); + d[0] = 0; + // d[1] = 0; + return; + } while (curPos < limit) { if (p->hashBufPos == p->hashBufPosLimit) { - MatchFinderMt_GetNextBlock_Hash(p); - distances[1] = numProcessed + p->hashNumAvail; - if (p->hashNumAvail >= p->numHashBytes) + // MatchFinderMt_GetNextBlock_Hash(p); + UInt32 avail; + { + const UInt32 bi = MtSync_GetNextBlock(&p->hashSync); + const UInt32 k = GET_HASH_BLOCK_OFFSET(bi); + const UInt32 *h = p->hashBuf + k; + avail = h[1]; + p->hashBufPosLimit = k + h[0]; + p->hashNumAvail = avail; + p->hashBufPos = k + 2; + } + + { + /* we must prevent UInt32 overflow for avail total value, + if avail was increased with new hash block */ + UInt32 availSum = numProcessed + avail; + if (availSum < numProcessed) + availSum = (UInt32)(Int32)-1; + d[1] = availSum; + } + + if (avail >= p->numHashBytes) continue; - distances[0] = curPos + p->hashNumAvail; - distances += curPos; - for (; p->hashNumAvail != 0; p->hashNumAvail--) - *distances++ = 0; + + // if (p->hashBufPos != p->hashBufPosLimit) exit(1); + + /* (avail < p->numHashBytes) + It means that stream was finished. + And (avail) - is a number of remaining bytes, + we fill (d) for (avail) bytes for LZ_THREAD (receiver). + but we don't update (p->pos) and (p->cyclicBufferPos) here in BT_THREAD */ + + /* here we suppose that we have space enough: + (kMtBtBlockSize - curPos >= p->hashNumAvail) */ + p->hashNumAvail = 0; + d[0] = curPos + avail; + d += curPos; + for (; avail != 0; avail--) + *d++ = 0; return; } { UInt32 size = p->hashBufPosLimit - p->hashBufPos; - UInt32 lenLimit = p->matchMaxLen; UInt32 pos = p->pos; UInt32 cyclicBufferPos = p->cyclicBufferPos; + UInt32 lenLimit = p->matchMaxLen; if (lenLimit >= p->hashNumAvail) lenLimit = p->hashNumAvail; { @@ -384,10 +665,18 @@ static void BtGetMatches(CMatchFinderMt *p, UInt32 *distances) size = size2; } + if (pos > (UInt32)kMtMaxValForNormalize - size) + { + const UInt32 subValue = (pos - p->cyclicBufferSize); // & ~(UInt32)(kNormalizeAlign - 1); + pos -= subValue; + p->pos = pos; + MatchFinder_Normalize3(subValue, p->son, (size_t)p->cyclicBufferSize * 2); + } + #ifndef MFMT_GM_INLINE while (curPos < limit && size-- != 0) { - UInt32 *startDistances = distances + curPos; + UInt32 *startDistances = d + curPos; UInt32 num = (UInt32)(GetMatchesSpec1(lenLimit, pos - p->hashBuf[p->hashBufPos++], pos, p->buffer, p->son, cyclicBufferPos, p->cyclicBufferSize, p->cutValue, startDistances + 1, p->numHashBytes - 1) - startDistances); @@ -399,81 +688,112 @@ static void BtGetMatches(CMatchFinderMt *p, UInt32 *distances) } #else { - UInt32 posRes; - curPos = (UInt32)(GetMatchesSpecN(lenLimit, pos, p->buffer, p->son, cyclicBufferPos, p->cyclicBufferSize, p->cutValue, - distances + curPos, p->numHashBytes - 1, p->hashBuf + p->hashBufPos, - distances + limit, - size, &posRes) - distances); - p->hashBufPos += posRes - pos; - cyclicBufferPos += posRes - pos; - p->buffer += posRes - pos; - pos = posRes; + UInt32 posRes = pos; + const UInt32 *d_end; + { + d_end = GetMatchesSpecN_2( + p->buffer + lenLimit - 1, + pos, p->buffer, p->son, p->cutValue, d + 
curPos, + p->numHashBytes - 1, p->hashBuf + p->hashBufPos, + d + limit, p->hashBuf + p->hashBufPos + size, + cyclicBufferPos, p->cyclicBufferSize, + &posRes); + } + { + if (!d_end) + { + // printf("\n == 2 BtGetMatches() p->failure_BT\n"); + // internal data failure + p->failure_BT = True; + d[0] = 0; + // d[1] = 0; + return; + } + } + curPos = (UInt32)(d_end - d); + { + const UInt32 processed = posRes - pos; + pos = posRes; + p->hashBufPos += processed; + cyclicBufferPos += processed; + p->buffer += processed; + } } #endif - numProcessed += pos - p->pos; - p->hashNumAvail -= pos - p->pos; - p->pos = pos; + { + const UInt32 processed = pos - p->pos; + numProcessed += processed; + p->hashNumAvail -= processed; + p->pos = pos; + } if (cyclicBufferPos == p->cyclicBufferSize) cyclicBufferPos = 0; p->cyclicBufferPos = cyclicBufferPos; } } - distances[0] = curPos; + d[0] = curPos; } + static void BtFillBlock(CMatchFinderMt *p, UInt32 globalBlockIndex) { CMtSync *sync = &p->hashSync; + + BUFFER_MUST_BE_UNLOCKED(sync) + if (!sync->needStart) { - CriticalSection_Enter(&sync->cs); - sync->csWasEntered = True; + LOCK_BUFFER(sync) } - BtGetMatches(p, p->btBuf + (globalBlockIndex & kMtBtNumBlocksMask) * kMtBtBlockSize); - - if (p->pos > kMtMaxValForNormalize - kMtBtBlockSize) - { - UInt32 subValue = p->pos - p->cyclicBufferSize; - MatchFinder_Normalize3(subValue, p->son, (size_t)p->cyclicBufferSize * 2); - p->pos -= subValue; - } + BtGetMatches(p, p->btBuf + GET_BT_BLOCK_OFFSET(globalBlockIndex)); + + /* We suppose that we have called GetNextBlock() from start. + So buffer is LOCKED */ - if (!sync->needStart) - { - CriticalSection_Leave(&sync->cs); - sync->csWasEntered = False; - } + UNLOCK_BUFFER(sync) } -void BtThreadFunc(CMatchFinderMt *mt) + +Z7_NO_INLINE +static void BtThreadFunc(CMatchFinderMt *mt) { CMtSync *p = &mt->btSync; for (;;) { UInt32 blockIndex = 0; Event_Wait(&p->canStart); - Event_Set(&p->wasStarted); + for (;;) { + PRF(printf(" BT thread block = %d pos = %d\n", (unsigned)blockIndex, mt->pos)); + /* (p->exit == true) is possible after (p->canStart) at first loop iteration + and is unexpected after more Wait(freeSemaphore) iterations */ if (p->exit) return; + + Semaphore_Wait(&p->freeSemaphore); + + // for faster stop : we check (p->stopWriting) after Wait(freeSemaphore) if (p->stopWriting) - { - p->numProcessedBlocks = blockIndex; - MtSync_StopWriting(&mt->hashSync); - Event_Set(&p->wasStopped); break; - } - Semaphore_Wait(&p->freeSemaphore); + BtFillBlock(mt, blockIndex++); + Semaphore_Release1(&p->filledSemaphore); } + + // we stop HASH_THREAD here + MtSync_StopWriting(&mt->hashSync); + + // p->numBlocks_Sent = blockIndex; + Event_Set(&p->wasStopped); } } + void MatchFinderMt_Construct(CMatchFinderMt *p) { p->hashBuf = NULL; @@ -489,16 +809,39 @@ static void MatchFinderMt_FreeMem(CMatchFinderMt *p, ISzAllocPtr alloc) void MatchFinderMt_Destruct(CMatchFinderMt *p, ISzAllocPtr alloc) { - MtSync_Destruct(&p->hashSync); + /* + HASH_THREAD can use CriticalSection(s) btSync.cs and hashSync.cs. + So we must be sure that HASH_THREAD will not use CriticalSection(s) + after deleting CriticalSection here. + + we call ReleaseStream(p) + that calls StopWriting(btSync) + that calls StopWriting(hashSync), if it's required to stop HASH_THREAD. 
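/* Editorial sketch, not part of the patch: the call order for the MT match
   finder that this file and LzFindMt.h imply. Assumes LzFind.h/LzFindMt.h are
   included; input-stream setup on the inner CMatchFinder (normally done by
   LzmaEnc) is omitted and the parameter values are illustrative only. */
static void UseMatchFinderMt_Sketch(ISzAllocPtr alloc)
{
  CMatchFinder base;
  CMatchFinderMt mf;
  IMatchFinder2 vt;
  MatchFinder_Construct(&base);
  MatchFinderMt_Construct(&mf);
  mf.MatchFinder = &base;                    /* the MT finder wraps a regular CMatchFinder */
  if (MatchFinderMt_Create(&mf, (UInt32)1 << 22, 0, 273, 0, alloc) == SZ_OK)
  {
    MatchFinderMt_CreateVTable(&mf, &vt);
    if (MatchFinderMt_InitMt(&mf) == SZ_OK)  /* must come before vt.Init(), per LzFindMt.h */
    {
      vt.Init(&mf);
      /* ... encoder loop driving vt.GetMatches() / vt.Skip() ... */
    }
    MatchFinderMt_ReleaseStream(&mf);        /* stops the BT and hash worker threads */
  }
  MatchFinderMt_Destruct(&mf, alloc);        /* safe even though it calls ReleaseStream() again */
}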
+ after StopWriting() it's safe to destruct MtSync(s) in any order */ + + MatchFinderMt_ReleaseStream(p); + MtSync_Destruct(&p->btSync); + MtSync_Destruct(&p->hashSync); + + LOG_ITER( + printf("\nTree %9d * %7d iter = %9d = sum : bytes = %9d\n", + (UInt32)(g_NumIters_Tree / 1000), + (UInt32)(((UInt64)g_NumIters_Loop * 1000) / (g_NumIters_Tree + 1)), + (UInt32)(g_NumIters_Loop / 1000), + (UInt32)(g_NumIters_Bytes / 1000) + )); + MatchFinderMt_FreeMem(p, alloc); } + #define kHashBufferSize (kMtHashBlockSize * kMtHashNumBlocks) #define kBtBufferSize (kMtBtBlockSize * kMtBtNumBlocks) -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE HashThreadFunc2(void *p) { HashThreadFunc((CMatchFinderMt *)p); return 0; } -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE BtThreadFunc2(void *p) + +static THREAD_FUNC_DECL HashThreadFunc2(void *p) { HashThreadFunc((CMatchFinderMt *)p); return 0; } +static THREAD_FUNC_DECL BtThreadFunc2(void *p) { Byte allocaDummy[0x180]; unsigned i = 0; @@ -509,16 +852,17 @@ static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE BtThreadFunc2(void *p) return 0; } + SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, ISzAllocPtr alloc) { - CMatchFinder *mf = p->MatchFinder; + CMatchFinder *mf = MF(p); p->historySize = historySize; if (kMtBtBlockSize <= matchMaxLen * 4) return SZ_ERROR_PARAM; if (!p->hashBuf) { - p->hashBuf = (UInt32 *)ISzAlloc_Alloc(alloc, (kHashBufferSize + kBtBufferSize) * sizeof(UInt32)); + p->hashBuf = (UInt32 *)ISzAlloc_Alloc(alloc, ((size_t)kHashBufferSize + (size_t)kBtBufferSize) * sizeof(UInt32)); if (!p->hashBuf) return SZ_ERROR_MEM; p->btBuf = p->hashBuf + kHashBufferSize; @@ -528,262 +872,472 @@ SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddB if (!MatchFinder_Create(mf, historySize, keepAddBufferBefore, matchMaxLen, keepAddBufferAfter, alloc)) return SZ_ERROR_MEM; - RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p, kMtHashNumBlocks)); - RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p, kMtBtNumBlocks)); + RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p)) + RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p)) return SZ_OK; } -/* Call it after ReleaseStream / SetStream */ -static void MatchFinderMt_Init(CMatchFinderMt *p) + +SRes MatchFinderMt_InitMt(CMatchFinderMt *p) +{ + RINOK(MtSync_Init(&p->hashSync, kMtHashNumBlocks)) + return MtSync_Init(&p->btSync, kMtBtNumBlocks); +} + + +static void MatchFinderMt_Init(void *_p) { - CMatchFinder *mf = p->MatchFinder; + CMatchFinderMt *p = (CMatchFinderMt *)_p; + CMatchFinder *mf = MF(p); p->btBufPos = - p->btBufPosLimit = 0; + p->btBufPosLimit = NULL; p->hashBufPos = p->hashBufPosLimit = 0; + p->hashNumAvail = 0; // 21.03 + + p->failure_BT = False; /* Init without data reading. 
We don't want to read data in this thread */ - MatchFinder_Init_3(mf, False); + MatchFinder_Init_4(mf); + MatchFinder_Init_LowHash(mf); p->pointerToCurPos = Inline_MatchFinder_GetPointerToCurrentPos(mf); p->btNumAvailBytes = 0; - p->lzPos = p->historySize + 1; + p->failure_LZ_BT = False; + // p->failure_LZ_LZ = False; + + p->lzPos = + 1; // optimal smallest value + // 0; // for debug: ignores match to start + // kNormalizeAlign; // for debug p->hash = mf->hash; p->fixedHashSize = mf->fixedHashSize; + // p->hash4Mask = mf->hash4Mask; p->crc = mf->crc; + // memcpy(p->crc, mf->crc, sizeof(mf->crc)); p->son = mf->son; p->matchMaxLen = mf->matchMaxLen; p->numHashBytes = mf->numHashBytes; - p->pos = mf->pos; - p->buffer = mf->buffer; - p->cyclicBufferPos = mf->cyclicBufferPos; + + /* (mf->pos) and (mf->streamPos) were already initialized to 1 in MatchFinder_Init_4() */ + // mf->streamPos = mf->pos = 1; // optimal smallest value + // 0; // for debug: ignores match to start + // kNormalizeAlign; // for debug + + /* we must init (p->pos = mf->pos) for BT, because + BT code needs (p->pos == delta_value_for_empty_hash_record == mf->pos) */ + p->pos = mf->pos; // do not change it + + p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); p->cyclicBufferSize = mf->cyclicBufferSize; + p->buffer = mf->buffer; p->cutValue = mf->cutValue; + // p->son[0] = p->son[1] = 0; // unused: to init skipped record for speculated accesses. } + /* ReleaseStream is required to finish multithreading */ void MatchFinderMt_ReleaseStream(CMatchFinderMt *p) { + // Sleep(1); // for debug MtSync_StopWriting(&p->btSync); + // Sleep(200); // for debug /* p->MatchFinder->ReleaseStream(); */ } -static void MatchFinderMt_Normalize(CMatchFinderMt *p) -{ - MatchFinder_Normalize3(p->lzPos - p->historySize - 1, p->hash, p->fixedHashSize); - p->lzPos = p->historySize + 1; -} -static void MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p) +Z7_NO_INLINE +static UInt32 MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p) { - UInt32 blockIndex; - MtSync_GetNextBlock(&p->btSync); - blockIndex = ((p->btSync.numProcessedBlocks - 1) & kMtBtNumBlocksMask); - p->btBufPosLimit = p->btBufPos = blockIndex * kMtBtBlockSize; - p->btBufPosLimit += p->btBuf[p->btBufPos++]; - p->btNumAvailBytes = p->btBuf[p->btBufPos++]; - if (p->lzPos >= kMtMaxValForNormalize - kMtBtBlockSize) - MatchFinderMt_Normalize(p); + if (p->failure_LZ_BT) + p->btBufPos = p->failureBuf; + else + { + const UInt32 bi = MtSync_GetNextBlock(&p->btSync); + const UInt32 *bt = p->btBuf + GET_BT_BLOCK_OFFSET(bi); + { + const UInt32 numItems = bt[0]; + p->btBufPosLimit = bt + numItems; + p->btNumAvailBytes = bt[1]; + p->btBufPos = bt + 2; + if (numItems < 2 || numItems > kMtBtBlockSize) + { + p->failureBuf[0] = 0; + p->btBufPos = p->failureBuf; + p->btBufPosLimit = p->failureBuf + 1; + p->failure_LZ_BT = True; + // p->btNumAvailBytes = 0; + /* we don't want to decrease AvailBytes, that was load before. + that can be unxepected for the code that have loaded anopther value before */ + } + } + + if (p->lzPos >= (UInt32)kMtMaxValForNormalize - (UInt32)kMtBtBlockSize) + { + /* we don't check (lzPos) over exact avail bytes in (btBuf). 
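/* Editorial sketch, not part of the patch: MatchFinder_Normalize3() itself is
   not shown in this diff, but its effect matters for the two normalization
   sites in this file. When 32-bit positions approach kMtMaxValForNormalize,
   a subValue is subtracted from every stored position and entries that would
   fall out of the window collapse to kEmptyHashValue (0). A minimal
   equivalent, assuming the SDK's UInt32 typedef: */
static void Normalize_Sketch(UInt32 subValue, UInt32 *items, size_t numItems)
{
  size_t i;
  for (i = 0; i < numItems; i++)
  {
    const UInt32 v = items[i];
    items[i] = (v <= subValue) ? 0 /* kEmptyHashValue */ : (v - subValue);
  }
}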
+ (fixedHashSize) is small, so normalization is fast */ + const UInt32 subValue = (p->lzPos - p->historySize - 1); // & ~(UInt32)(kNormalizeAlign - 1); + p->lzPos -= subValue; + MatchFinder_Normalize3(subValue, p->hash, p->fixedHashSize); + } + } + return p->btNumAvailBytes; } -static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p) + + +static const Byte * MatchFinderMt_GetPointerToCurrentPos(void *_p) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; return p->pointerToCurPos; } + #define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p); -static UInt32 MatchFinderMt_GetNumAvailableBytes(CMatchFinderMt *p) + +static UInt32 MatchFinderMt_GetNumAvailableBytes(void *_p) { - GET_NEXT_BLOCK_IF_REQUIRED; - return p->btNumAvailBytes; + CMatchFinderMt *p = (CMatchFinderMt *)_p; + if (p->btBufPos != p->btBufPosLimit) + return p->btNumAvailBytes; + return MatchFinderMt_GetNextBlock_Bt(p); } -static UInt32 * MixMatches2(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *distances) + +// #define CHECK_FAILURE_LZ(_match_, _pos_) if (_match_ >= _pos_) { p->failure_LZ_LZ = True; return d; } +#define CHECK_FAILURE_LZ(_match_, _pos_) + +static UInt32 * MixMatches2(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) { - UInt32 h2, curMatch2; + UInt32 h2, c2; UInt32 *hash = p->hash; const Byte *cur = p->pointerToCurPos; - UInt32 lzPos = p->lzPos; + const UInt32 m = p->lzPos; MT_HASH2_CALC - curMatch2 = hash[h2]; - hash[h2] = lzPos; + c2 = hash[h2]; + hash[h2] = m; - if (curMatch2 >= matchMinPos) - if (cur[(ptrdiff_t)curMatch2 - lzPos] == cur[0]) + if (c2 >= matchMinPos) + { + CHECK_FAILURE_LZ(c2, m) + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) { - *distances++ = 2; - *distances++ = lzPos - curMatch2 - 1; + *d++ = 2; + *d++ = m - c2 - 1; } + } - return distances; + return d; } -static UInt32 * MixMatches3(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *distances) +static UInt32 * MixMatches3(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) { - UInt32 h2, h3, curMatch2, curMatch3; + UInt32 h2, h3, c2, c3; UInt32 *hash = p->hash; const Byte *cur = p->pointerToCurPos; - UInt32 lzPos = p->lzPos; + const UInt32 m = p->lzPos; MT_HASH3_CALC - curMatch2 = hash[ h2]; - curMatch3 = (hash + kFix3HashSize)[h3]; + c2 = hash[h2]; + c3 = (hash + kFix3HashSize)[h3]; - hash[ h2] = lzPos; - (hash + kFix3HashSize)[h3] = lzPos; + hash[h2] = m; + (hash + kFix3HashSize)[h3] = m; - if (curMatch2 >= matchMinPos && cur[(ptrdiff_t)curMatch2 - lzPos] == cur[0]) + if (c2 >= matchMinPos) { - distances[1] = lzPos - curMatch2 - 1; - if (cur[(ptrdiff_t)curMatch2 - lzPos + 2] == cur[2]) + CHECK_FAILURE_LZ(c2, m) + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) { - distances[0] = 3; - return distances + 2; + d[1] = m - c2 - 1; + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2]) + { + d[0] = 3; + return d + 2; + } + d[0] = 2; + d += 2; } - distances[0] = 2; - distances += 2; } - if (curMatch3 >= matchMinPos && cur[(ptrdiff_t)curMatch3 - lzPos] == cur[0]) + if (c3 >= matchMinPos) { - *distances++ = 3; - *distances++ = lzPos - curMatch3 - 1; + CHECK_FAILURE_LZ(c3, m) + if (cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0]) + { + *d++ = 3; + *d++ = m - c3 - 1; + } } - return distances; + return d; } + +#define INCREASE_LZ_POS p->lzPos++; p->pointerToCurPos++; + /* -static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *distances) +static +UInt32* MatchFinderMt_GetMatches_Bt4(CMatchFinderMt *p, UInt32 *d) { - UInt32 h2, h3, h4, curMatch2, curMatch3, curMatch4; + 
const UInt32 *bt = p->btBufPos; + const UInt32 len = *bt++; + const UInt32 *btLim = bt + len; + UInt32 matchMinPos; + UInt32 avail = p->btNumAvailBytes - 1; + p->btBufPos = btLim; + + { + p->btNumAvailBytes = avail; + + #define BT_HASH_BYTES_MAX 5 + + matchMinPos = p->lzPos; + + if (len != 0) + matchMinPos -= bt[1]; + else if (avail < (BT_HASH_BYTES_MAX - 1) - 1) + { + INCREASE_LZ_POS + return d; + } + else + { + const UInt32 hs = p->historySize; + if (matchMinPos > hs) + matchMinPos -= hs; + else + matchMinPos = 1; + } + } + + for (;;) + { + + UInt32 h2, h3, c2, c3; UInt32 *hash = p->hash; const Byte *cur = p->pointerToCurPos; - UInt32 lzPos = p->lzPos; - MT_HASH4_CALC - - curMatch2 = hash[ h2]; - curMatch3 = (hash + kFix3HashSize)[h3]; - curMatch4 = (hash + kFix4HashSize)[h4]; + UInt32 m = p->lzPos; + MT_HASH3_CALC + + c2 = hash[h2]; + c3 = (hash + kFix3HashSize)[h3]; + + hash[h2] = m; + (hash + kFix3HashSize)[h3] = m; + + if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) + { + d[1] = m - c2 - 1; + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2]) + { + d[0] = 3; + d += 2; + break; + } + // else + { + d[0] = 2; + d += 2; + } + } + if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0]) + { + *d++ = 3; + *d++ = m - c3 - 1; + } + break; + } + + if (len != 0) + { + do + { + const UInt32 v0 = bt[0]; + const UInt32 v1 = bt[1]; + bt += 2; + d[0] = v0; + d[1] = v1; + d += 2; + } + while (bt != btLim); + } + INCREASE_LZ_POS + return d; +} +*/ + + +static UInt32 * MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) +{ + UInt32 h2, h3, /* h4, */ c2, c3 /* , c4 */; + UInt32 *hash = p->hash; + const Byte *cur = p->pointerToCurPos; + const UInt32 m = p->lzPos; + MT_HASH3_CALC + // MT_HASH4_CALC + c2 = hash[h2]; + c3 = (hash + kFix3HashSize)[h3]; + // c4 = (hash + kFix4HashSize)[h4]; - hash[ h2] = lzPos; - (hash + kFix3HashSize)[h3] = lzPos; - (hash + kFix4HashSize)[h4] = lzPos; + hash[h2] = m; + (hash + kFix3HashSize)[h3] = m; + // (hash + kFix4HashSize)[h4] = m; - if (curMatch2 >= matchMinPos && cur[(ptrdiff_t)curMatch2 - lzPos] == cur[0]) + // #define BT5_USE_H2 + // #ifdef BT5_USE_H2 + if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) { - distances[1] = lzPos - curMatch2 - 1; - if (cur[(ptrdiff_t)curMatch2 - lzPos + 2] == cur[2]) + d[1] = m - c2 - 1; + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2]) { - distances[0] = (cur[(ptrdiff_t)curMatch2 - lzPos + 3] == cur[3]) ? 4 : 3; - return distances + 2; + // d[0] = (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 3] == cur[3]) ? 
4 : 3; + // return d + 2; + + if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 3] == cur[3]) + { + d[0] = 4; + return d + 2; + } + d[0] = 3; + d += 2; + + #ifdef BT5_USE_H4 + if (c4 >= matchMinPos) + if ( + cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] && + cur[(ptrdiff_t)c4 - (ptrdiff_t)m + 3] == cur[3] + ) + { + *d++ = 4; + *d++ = m - c4 - 1; + } + #endif + return d; } - distances[0] = 2; - distances += 2; + d[0] = 2; + d += 2; } + // #endif - if (curMatch3 >= matchMinPos && cur[(ptrdiff_t)curMatch3 - lzPos] == cur[0]) + if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0]) { - distances[1] = lzPos - curMatch3 - 1; - if (cur[(ptrdiff_t)curMatch3 - lzPos + 3] == cur[3]) + d[1] = m - c3 - 1; + if (cur[(ptrdiff_t)c3 - (ptrdiff_t)m + 3] == cur[3]) { - distances[0] = 4; - return distances + 2; + d[0] = 4; + return d + 2; } - distances[0] = 3; - distances += 2; + d[0] = 3; + d += 2; } - if (curMatch4 >= matchMinPos) + #ifdef BT5_USE_H4 + if (c4 >= matchMinPos) if ( - cur[(ptrdiff_t)curMatch4 - lzPos] == cur[0] && - cur[(ptrdiff_t)curMatch4 - lzPos + 3] == cur[3] + cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] && + cur[(ptrdiff_t)c4 - (ptrdiff_t)m + 3] == cur[3] ) { - *distances++ = 4; - *distances++ = lzPos - curMatch4 - 1; + *d++ = 4; + *d++ = m - c4 - 1; } + #endif - return distances; + return d; } -*/ -#define INCREASE_LZ_POS p->lzPos++; p->pointerToCurPos++; -static UInt32 MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *distances) +static UInt32 * MatchFinderMt2_GetMatches(void *_p, UInt32 *d) { - const UInt32 *btBuf = p->btBuf + p->btBufPos; - UInt32 len = *btBuf++; - p->btBufPos += 1 + len; + CMatchFinderMt *p = (CMatchFinderMt *)_p; + const UInt32 *bt = p->btBufPos; + const UInt32 len = *bt++; + const UInt32 *btLim = bt + len; + p->btBufPos = btLim; p->btNumAvailBytes--; + INCREASE_LZ_POS { - UInt32 i; - for (i = 0; i < len; i += 2) + while (bt != btLim) { - UInt32 v0 = btBuf[0]; - UInt32 v1 = btBuf[1]; - btBuf += 2; - distances[0] = v0; - distances[1] = v1; - distances += 2; + const UInt32 v0 = bt[0]; + const UInt32 v1 = bt[1]; + bt += 2; + d[0] = v0; + d[1] = v1; + d += 2; } } - INCREASE_LZ_POS - return len; + return d; } -static UInt32 MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *distances) -{ - const UInt32 *btBuf = p->btBuf + p->btBufPos; - UInt32 len = *btBuf++; - p->btBufPos += 1 + len; + +static UInt32 * MatchFinderMt_GetMatches(void *_p, UInt32 *d) +{ + CMatchFinderMt *p = (CMatchFinderMt *)_p; + const UInt32 *bt = p->btBufPos; + UInt32 len = *bt++; + const UInt32 avail = p->btNumAvailBytes - 1; + p->btNumAvailBytes = avail; + p->btBufPos = bt + len; if (len == 0) { - /* change for bt5 ! */ - if (p->btNumAvailBytes-- >= 4) - len = (UInt32)(p->MixMatchesFunc(p, p->lzPos - p->historySize, distances) - (distances)); + #define BT_HASH_BYTES_MAX 5 + if (avail >= (BT_HASH_BYTES_MAX - 1) - 1) + { + UInt32 m = p->lzPos; + if (m > p->historySize) + m -= p->historySize; + else + m = 1; + d = p->MixMatchesFunc(p, m, d); + } } else { - /* Condition: there are matches in btBuf with length < p->numHashBytes */ - UInt32 *distances2; - p->btNumAvailBytes--; - distances2 = p->MixMatchesFunc(p, p->lzPos - btBuf[1], distances); + /* + first match pair from BinTree: (match_len, match_dist), + (match_len >= numHashBytes). 
+ MixMatchesFunc() inserts only hash matches that are nearer than (match_dist) + */ + d = p->MixMatchesFunc(p, p->lzPos - bt[1], d); + // if (d) // check for failure do { - UInt32 v0 = btBuf[0]; - UInt32 v1 = btBuf[1]; - btBuf += 2; - distances2[0] = v0; - distances2[1] = v1; - distances2 += 2; + const UInt32 v0 = bt[0]; + const UInt32 v1 = bt[1]; + bt += 2; + d[0] = v0; + d[1] = v1; + d += 2; } - while ((len -= 2) != 0); - len = (UInt32)(distances2 - (distances)); + while (len -= 2); } INCREASE_LZ_POS - return len; + return d; } #define SKIP_HEADER2_MT do { GET_NEXT_BLOCK_IF_REQUIRED #define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash; -#define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += p->btBuf[p->btBufPos] + 1; } while (--num != 0); +#define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0); -static void MatchFinderMt0_Skip(CMatchFinderMt *p, UInt32 num) +static void MatchFinderMt0_Skip(void *_p, UInt32 num) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; SKIP_HEADER2_MT { p->btNumAvailBytes--; SKIP_FOOTER_MT } -static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num) +static void MatchFinderMt2_Skip(void *_p, UInt32 num) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; SKIP_HEADER_MT(2) UInt32 h2; MT_HASH2_CALC @@ -791,8 +1345,9 @@ static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num) SKIP_FOOTER_MT } -static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num) +static void MatchFinderMt3_Skip(void *_p, UInt32 num) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; SKIP_HEADER_MT(3) UInt32 h2, h3; MT_HASH3_CALC @@ -803,12 +1358,16 @@ static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num) } /* +// MatchFinderMt4_Skip() is similar to MatchFinderMt3_Skip(). +// The difference is that MatchFinderMt3_Skip() updates hash for last 3 bytes of stream. 
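/* Editorial sketch, not part of the patch: how a caller typically consumes the
   output of MatchFinderMt_GetMatches() above. For one position the function
   appends (len, dist) pairs to d[] in order of increasing length, with dist
   stored as (delta - 1), and returns the end pointer. Names below are
   illustrative only. */
static UInt32 PickLongestMatch_Sketch(const UInt32 *d, const UInt32 *d_end, UInt32 *distRes)
{
  const size_t numValues = (size_t)(d_end - d);  /* always even: 2 values per match */
  if (numValues == 0)
    return 0;                                    /* no match at this position */
  *distRes = d_end[-1];                          /* distance of the longest match */
  return d_end[-2];                              /* its length */
}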
+ static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num) { SKIP_HEADER_MT(4) - UInt32 h2, h3, h4; - MT_HASH4_CALC - (hash + kFix4HashSize)[h4] = + UInt32 h2, h3; // h4 + MT_HASH3_CALC + // MT_HASH4_CALC + // (hash + kFix4HashSize)[h4] = (hash + kFix3HashSize)[h3] = hash[ h2] = p->lzPos; @@ -816,38 +1375,48 @@ static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num) } */ -void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder *vTable) +void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable) { - vTable->Init = (Mf_Init_Func)MatchFinderMt_Init; - vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinderMt_GetNumAvailableBytes; - vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinderMt_GetPointerToCurrentPos; - vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches; + vTable->Init = MatchFinderMt_Init; + vTable->GetNumAvailableBytes = MatchFinderMt_GetNumAvailableBytes; + vTable->GetPointerToCurrentPos = MatchFinderMt_GetPointerToCurrentPos; + vTable->GetMatches = MatchFinderMt_GetMatches; - switch (p->MatchFinder->numHashBytes) + switch (MF(p)->numHashBytes) { case 2: p->GetHeadsFunc = GetHeads2; - p->MixMatchesFunc = (Mf_Mix_Matches)NULL; - vTable->Skip = (Mf_Skip_Func)MatchFinderMt0_Skip; - vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt2_GetMatches; + p->MixMatchesFunc = NULL; + vTable->Skip = MatchFinderMt0_Skip; + vTable->GetMatches = MatchFinderMt2_GetMatches; break; case 3: - p->GetHeadsFunc = GetHeads3; - p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches2; - vTable->Skip = (Mf_Skip_Func)MatchFinderMt2_Skip; + p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3; + p->MixMatchesFunc = MixMatches2; + vTable->Skip = MatchFinderMt2_Skip; break; - default: - /* case 4: */ - p->GetHeadsFunc = p->MatchFinder->bigHash ? GetHeads4b : GetHeads4; - p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches3; - vTable->Skip = (Mf_Skip_Func)MatchFinderMt3_Skip; + case 4: + p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4; + + // it's fast inline version of GetMatches() + // vTable->GetMatches = MatchFinderMt_GetMatches_Bt4; + + p->MixMatchesFunc = MixMatches3; + vTable->Skip = MatchFinderMt3_Skip; break; - /* default: - p->GetHeadsFunc = GetHeads5; - p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches4; - vTable->Skip = (Mf_Skip_Func)MatchFinderMt4_Skip; + p->GetHeadsFunc = MF(p)->bigHash ? 
GetHeads5b : GetHeads5; + p->MixMatchesFunc = MixMatches4; + vTable->Skip = + MatchFinderMt3_Skip; + // MatchFinderMt4_Skip; break; - */ } } + +#undef RINOK_THREAD +#undef PRF +#undef MF +#undef GetUi24hi_from32 +#undef LOCK_BUFFER +#undef UNLOCK_BUFFER diff --git a/src/sdk/C/LzFindMt.h b/src/sdk/C/LzFindMt.h index ef431e3..89984f5 100644 --- a/src/sdk/C/LzFindMt.h +++ b/src/sdk/C/LzFindMt.h @@ -1,42 +1,42 @@ /* LzFindMt.h -- multithreaded Match finder for LZ algorithms -2018-07-04 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ -#ifndef __LZ_FIND_MT_H -#define __LZ_FIND_MT_H +#ifndef ZIP7_INC_LZ_FIND_MT_H +#define ZIP7_INC_LZ_FIND_MT_H #include "LzFind.h" #include "Threads.h" EXTERN_C_BEGIN -#define kMtHashBlockSize (1 << 13) -#define kMtHashNumBlocks (1 << 3) -#define kMtHashNumBlocksMask (kMtHashNumBlocks - 1) - -#define kMtBtBlockSize (1 << 14) -#define kMtBtNumBlocks (1 << 6) -#define kMtBtNumBlocksMask (kMtBtNumBlocks - 1) - -typedef struct _CMtSync +typedef struct { + UInt32 numProcessedBlocks; + Int32 affinityGroup; + UInt64 affinityInGroup; + UInt64 affinity; + CThread thread; + BoolInt wasCreated; BoolInt needStart; + BoolInt csWasInitialized; + BoolInt csWasEntered; + BoolInt exit; BoolInt stopWriting; - CThread thread; CAutoResetEvent canStart; - CAutoResetEvent wasStarted; CAutoResetEvent wasStopped; CSemaphore freeSemaphore; CSemaphore filledSemaphore; - BoolInt csWasInitialized; - BoolInt csWasEntered; CCriticalSection cs; - UInt32 numProcessedBlocks; + // UInt32 numBlocks_Sent; } CMtSync; -typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distances); + +struct CMatchFinderMt_; + +typedef UInt32 * (*Mf_Mix_Matches)(struct CMatchFinderMt_ *p, UInt32 matchMinPos, UInt32 *distances); /* kMtCacheLineDummy must be >= size_of_CPU_cache_line */ #define kMtCacheLineDummy 128 @@ -44,23 +44,28 @@ typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distance typedef void (*Mf_GetHeads)(const Byte *buffer, UInt32 pos, UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc); -typedef struct _CMatchFinderMt +typedef struct CMatchFinderMt_ { /* LZ */ const Byte *pointerToCurPos; UInt32 *btBuf; - UInt32 btBufPos; - UInt32 btBufPosLimit; + const UInt32 *btBufPos; + const UInt32 *btBufPosLimit; UInt32 lzPos; UInt32 btNumAvailBytes; UInt32 *hash; UInt32 fixedHashSize; + // UInt32 hash4Mask; UInt32 historySize; const UInt32 *crc; Mf_Mix_Matches MixMatchesFunc; - + UInt32 failure_LZ_BT; // failure in BT transfered to LZ + // UInt32 failure_LZ_LZ; // failure in LZ tables + UInt32 failureBuf[1]; + // UInt32 crc[256]; + /* LZ + BT */ CMtSync btSync; Byte btDummy[kMtCacheLineDummy]; @@ -70,6 +75,8 @@ typedef struct _CMatchFinderMt UInt32 hashBufPos; UInt32 hashBufPosLimit; UInt32 hashNumAvail; + UInt32 failure_BT; + CLzRef *son; UInt32 matchMaxLen; @@ -77,7 +84,7 @@ typedef struct _CMatchFinderMt UInt32 pos; const Byte *buffer; UInt32 cyclicBufferPos; - UInt32 cyclicBufferSize; /* it must be historySize + 1 */ + UInt32 cyclicBufferSize; /* it must be = (historySize + 1) */ UInt32 cutValue; /* BT + Hash */ @@ -87,13 +94,19 @@ typedef struct _CMatchFinderMt /* Hash */ Mf_GetHeads GetHeadsFunc; CMatchFinder *MatchFinder; + // CMatchFinder MatchFinder; } CMatchFinderMt; +// only for Mt part void MatchFinderMt_Construct(CMatchFinderMt *p); void MatchFinderMt_Destruct(CMatchFinderMt *p, ISzAllocPtr alloc); + SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddBufferBefore, UInt32 matchMaxLen, 
UInt32 keepAddBufferAfter, ISzAllocPtr alloc); -void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder *vTable); +void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable); + +/* call MatchFinderMt_InitMt() before IMatchFinder::Init() */ +SRes MatchFinderMt_InitMt(CMatchFinderMt *p); void MatchFinderMt_ReleaseStream(CMatchFinderMt *p); EXTERN_C_END diff --git a/src/sdk/C/LzFindOpt.c b/src/sdk/C/LzFindOpt.c new file mode 100644 index 0000000..85bdc13 --- /dev/null +++ b/src/sdk/C/LzFindOpt.c @@ -0,0 +1,578 @@ +/* LzFindOpt.c -- multithreaded Match finder for LZ algorithms +2023-04-02 : Igor Pavlov : Public domain */ + +#include "Precomp.h" + +#include "CpuArch.h" +#include "LzFind.h" + +// #include "LzFindMt.h" + +// #define LOG_ITERS + +// #define LOG_THREAD + +#ifdef LOG_THREAD +#include +#define PRF(x) x +#else +// #define PRF(x) +#endif + +#ifdef LOG_ITERS +#include +UInt64 g_NumIters_Tree; +UInt64 g_NumIters_Loop; +UInt64 g_NumIters_Bytes; +#define LOG_ITER(x) x +#else +#define LOG_ITER(x) +#endif + +// ---------- BT THREAD ---------- + +#define USE_SON_PREFETCH +#define USE_LONG_MATCH_OPT + +#define kEmptyHashValue 0 + +// #define CYC_TO_POS_OFFSET 0 + +// #define CYC_TO_POS_OFFSET 1 // for debug + +/* +Z7_NO_INLINE +UInt32 * Z7_FASTCALL GetMatchesSpecN_1(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, UInt32 *posRes) +{ + do + { + UInt32 delta; + if (hash == size) + break; + delta = *hash++; + + if (delta == 0 || delta > (UInt32)pos) + return NULL; + + lenLimit++; + + if (delta == (UInt32)pos) + { + CLzRef *ptr1 = son + ((size_t)pos << 1) - CYC_TO_POS_OFFSET * 2; + *d++ = 0; + ptr1[0] = kEmptyHashValue; + ptr1[1] = kEmptyHashValue; + } +else +{ + UInt32 *_distances = ++d; + + CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1; + CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; + + const Byte *len0 = cur, *len1 = cur; + UInt32 cutValue = _cutValue; + const Byte *maxLen = cur + _maxLen; + + for (LOG_ITER(g_NumIters_Tree++);;) + { + LOG_ITER(g_NumIters_Loop++); + { + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + CLzRef *pair = son + ((size_t)(((ptrdiff_t)pos - CYC_TO_POS_OFFSET) + diff) << 1); + const Byte *len = (len0 < len1 ? 
len0 : len1); + + #ifdef USE_SON_PREFETCH + const UInt32 pair0 = *pair; + #endif + + if (len[diff] == len[0]) + { + if (++len != lenLimit && len[diff] == len[0]) + while (++len != lenLimit) + { + LOG_ITER(g_NumIters_Bytes++); + if (len[diff] != len[0]) + break; + } + if (maxLen < len) + { + maxLen = len; + *d++ = (UInt32)(len - cur); + *d++ = delta - 1; + + if (len == lenLimit) + { + const UInt32 pair1 = pair[1]; + *ptr1 = + #ifdef USE_SON_PREFETCH + pair0; + #else + pair[0]; + #endif + *ptr0 = pair1; + + _distances[-1] = (UInt32)(d - _distances); + + #ifdef USE_LONG_MATCH_OPT + + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + + { + for (;;) + { + hash++; + pos++; + cur++; + lenLimit++; + { + CLzRef *ptr = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; + #if 0 + *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff]; + #else + const UInt32 p0 = ptr[0 + (diff * 2)]; + const UInt32 p1 = ptr[1 + (diff * 2)]; + ptr[0] = p0; + ptr[1] = p1; + // ptr[0] = ptr[0 + (diff * 2)]; + // ptr[1] = ptr[1 + (diff * 2)]; + #endif + } + // PrintSon(son + 2, pos - 1); + // printf("\npos = %x delta = %x\n", pos, delta); + len++; + *d++ = 2; + *d++ = (UInt32)(len - cur); + *d++ = delta - 1; + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + } + } + #endif + + break; + } + } + } + + { + const UInt32 curMatch = (UInt32)pos - delta; // (UInt32)(pos + diff); + if (len[diff] < len[0]) + { + delta = pair[1]; + if (delta >= curMatch) + return NULL; + *ptr1 = curMatch; + ptr1 = pair + 1; + len1 = len; + } + else + { + delta = *pair; + if (delta >= curMatch) + return NULL; + *ptr0 = curMatch; + ptr0 = pair; + len0 = len; + } + + delta = (UInt32)pos - delta; + + if (--cutValue == 0 || delta >= pos) + { + *ptr0 = *ptr1 = kEmptyHashValue; + _distances[-1] = (UInt32)(d - _distances); + break; + } + } + } + } // for (tree iterations) +} + pos++; + cur++; + } + while (d < limit); + *posRes = (UInt32)pos; + return d; +} +*/ + +/* define cbs if you use 2 functions. + GetMatchesSpecN_1() : (pos < _cyclicBufferSize) + GetMatchesSpecN_2() : (pos >= _cyclicBufferSize) + + do not define cbs if you use 1 function: + GetMatchesSpecN_2() +*/ + +// #define cbs _cyclicBufferSize + +/* + we use size_t for (pos) and (_cyclicBufferPos_ instead of UInt32 + to eliminate "movsx" BUG in old MSVC x64 compiler. 
+*/ + +UInt32 * Z7_FASTCALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes); + +Z7_NO_INLINE +UInt32 * Z7_FASTCALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes) +{ + do // while (hash != size) + { + UInt32 delta; + + #ifndef cbs + UInt32 cbs; + #endif + + if (hash == size) + break; + + delta = *hash++; + + if (delta == 0) + return NULL; + + lenLimit++; + + #ifndef cbs + cbs = _cyclicBufferSize; + if ((UInt32)pos < cbs) + { + if (delta > (UInt32)pos) + return NULL; + cbs = (UInt32)pos; + } + #endif + + if (delta >= cbs) + { + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + *d++ = 0; + ptr1[0] = kEmptyHashValue; + ptr1[1] = kEmptyHashValue; + } +else +{ + UInt32 *_distances = ++d; + + CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + + UInt32 cutValue = _cutValue; + const Byte *len0 = cur, *len1 = cur; + const Byte *maxLen = cur + _maxLen; + + // if (cutValue == 0) { *ptr0 = *ptr1 = kEmptyHashValue; } else + for (LOG_ITER(g_NumIters_Tree++);;) + { + LOG_ITER(g_NumIters_Loop++); + { + // SPEC code + CLzRef *pair = son + ((size_t)((ptrdiff_t)_cyclicBufferPos - (ptrdiff_t)delta + + (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0) + ) << 1); + + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + const Byte *len = (len0 < len1 ? len0 : len1); + + #ifdef USE_SON_PREFETCH + const UInt32 pair0 = *pair; + #endif + + if (len[diff] == len[0]) + { + if (++len != lenLimit && len[diff] == len[0]) + while (++len != lenLimit) + { + LOG_ITER(g_NumIters_Bytes++); + if (len[diff] != len[0]) + break; + } + if (maxLen < len) + { + maxLen = len; + *d++ = (UInt32)(len - cur); + *d++ = delta - 1; + + if (len == lenLimit) + { + const UInt32 pair1 = pair[1]; + *ptr1 = + #ifdef USE_SON_PREFETCH + pair0; + #else + pair[0]; + #endif + *ptr0 = pair1; + + _distances[-1] = (UInt32)(d - _distances); + + #ifdef USE_LONG_MATCH_OPT + + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + + { + for (;;) + { + *d++ = 2; + *d++ = (UInt32)(lenLimit - cur); + *d++ = delta - 1; + cur++; + lenLimit++; + // SPEC + _cyclicBufferPos++; + { + // SPEC code + CLzRef *dest = son + ((size_t)(_cyclicBufferPos) << 1); + const CLzRef *src = dest + ((diff + + (ptrdiff_t)(UInt32)((_cyclicBufferPos < delta) ? 
cbs : 0)) << 1); + // CLzRef *ptr = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; + #if 0 + *(UInt64 *)(void *)dest = *((const UInt64 *)(const void *)src); + #else + const UInt32 p0 = src[0]; + const UInt32 p1 = src[1]; + dest[0] = p0; + dest[1] = p1; + #endif + } + pos++; + hash++; + if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + break; + } // for() end for long matches + } + #endif + + break; // break from TREE iterations + } + } + } + { + const UInt32 curMatch = (UInt32)pos - delta; // (UInt32)(pos + diff); + if (len[diff] < len[0]) + { + delta = pair[1]; + *ptr1 = curMatch; + ptr1 = pair + 1; + len1 = len; + if (delta >= curMatch) + return NULL; + } + else + { + delta = *pair; + *ptr0 = curMatch; + ptr0 = pair; + len0 = len; + if (delta >= curMatch) + return NULL; + } + delta = (UInt32)pos - delta; + + if (--cutValue == 0 || delta >= cbs) + { + *ptr0 = *ptr1 = kEmptyHashValue; + _distances[-1] = (UInt32)(d - _distances); + break; + } + } + } + } // for (tree iterations) +} + pos++; + _cyclicBufferPos++; + cur++; + } + while (d < limit); + *posRes = (UInt32)pos; + return d; +} + + + +/* +typedef UInt32 uint32plus; // size_t + +UInt32 * Z7_FASTCALL GetMatchesSpecN_3(uint32plus lenLimit, size_t pos, const Byte *cur, CLzRef *son, + UInt32 _cutValue, UInt32 *d, uint32plus _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, + size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, + UInt32 *posRes) +{ + do // while (hash != size) + { + UInt32 delta; + + #ifndef cbs + UInt32 cbs; + #endif + + if (hash == size) + break; + + delta = *hash++; + + if (delta == 0) + return NULL; + + #ifndef cbs + cbs = _cyclicBufferSize; + if ((UInt32)pos < cbs) + { + if (delta > (UInt32)pos) + return NULL; + cbs = (UInt32)pos; + } + #endif + + if (delta >= cbs) + { + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + *d++ = 0; + ptr1[0] = kEmptyHashValue; + ptr1[1] = kEmptyHashValue; + } +else +{ + CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; + CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1); + UInt32 *_distances = ++d; + uint32plus len0 = 0, len1 = 0; + UInt32 cutValue = _cutValue; + uint32plus maxLen = _maxLen; + // lenLimit++; // const Byte *lenLimit = cur + _lenLimit; + + for (LOG_ITER(g_NumIters_Tree++);;) + { + LOG_ITER(g_NumIters_Loop++); + { + // const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + CLzRef *pair = son + ((size_t)((ptrdiff_t)_cyclicBufferPos - delta + + (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0) + ) << 1); + const Byte *pb = cur - delta; + uint32plus len = (len0 < len1 ? 
len0 : len1); + + #ifdef USE_SON_PREFETCH + const UInt32 pair0 = *pair; + #endif + + if (pb[len] == cur[len]) + { + if (++len != lenLimit && pb[len] == cur[len]) + while (++len != lenLimit) + if (pb[len] != cur[len]) + break; + if (maxLen < len) + { + maxLen = len; + *d++ = (UInt32)len; + *d++ = delta - 1; + if (len == lenLimit) + { + { + const UInt32 pair1 = pair[1]; + *ptr0 = pair1; + *ptr1 = + #ifdef USE_SON_PREFETCH + pair0; + #else + pair[0]; + #endif + } + + _distances[-1] = (UInt32)(d - _distances); + + #ifdef USE_LONG_MATCH_OPT + + if (hash == size || *hash != delta || pb[lenLimit] != cur[lenLimit] || d >= limit) + break; + + { + const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)delta; + for (;;) + { + *d++ = 2; + *d++ = (UInt32)lenLimit; + *d++ = delta - 1; + _cyclicBufferPos++; + { + CLzRef *dest = son + ((size_t)_cyclicBufferPos << 1); + const CLzRef *src = dest + ((diff + + (ptrdiff_t)(UInt32)(_cyclicBufferPos < delta ? cbs : 0)) << 1); + #if 0 + *(UInt64 *)(void *)dest = *((const UInt64 *)(const void *)src); + #else + const UInt32 p0 = src[0]; + const UInt32 p1 = src[1]; + dest[0] = p0; + dest[1] = p1; + #endif + } + hash++; + pos++; + cur++; + pb++; + if (hash == size || *hash != delta || pb[lenLimit] != cur[lenLimit] || d >= limit) + break; + } + } + #endif + + break; + } + } + } + { + const UInt32 curMatch = (UInt32)pos - delta; + if (pb[len] < cur[len]) + { + delta = pair[1]; + *ptr1 = curMatch; + ptr1 = pair + 1; + len1 = len; + } + else + { + delta = *pair; + *ptr0 = curMatch; + ptr0 = pair; + len0 = len; + } + + { + if (delta >= curMatch) + return NULL; + delta = (UInt32)pos - delta; + if (delta >= cbs + // delta >= _cyclicBufferSize || delta >= pos + || --cutValue == 0) + { + *ptr0 = *ptr1 = kEmptyHashValue; + _distances[-1] = (UInt32)(d - _distances); + break; + } + } + } + } + } // for (tree iterations) +} + pos++; + _cyclicBufferPos++; + cur++; + } + while (d < limit); + *posRes = (UInt32)pos; + return d; +} +*/ diff --git a/src/sdk/C/LzHash.h b/src/sdk/C/LzHash.h index e7c9423..2b6290b 100644 --- a/src/sdk/C/LzHash.h +++ b/src/sdk/C/LzHash.h @@ -1,57 +1,34 @@ -/* LzHash.h -- HASH functions for LZ algorithms -2015-04-12 : Igor Pavlov : Public domain */ +/* LzHash.h -- HASH constants for LZ algorithms +2023-03-05 : Igor Pavlov : Public domain */ -#ifndef __LZ_HASH_H -#define __LZ_HASH_H +#ifndef ZIP7_INC_LZ_HASH_H +#define ZIP7_INC_LZ_HASH_H + +/* + (kHash2Size >= (1 << 8)) : Required + (kHash3Size >= (1 << 16)) : Required +*/ #define kHash2Size (1 << 10) #define kHash3Size (1 << 16) -#define kHash4Size (1 << 20) +// #define kHash4Size (1 << 20) #define kFix3HashSize (kHash2Size) #define kFix4HashSize (kHash2Size + kHash3Size) -#define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size) - -#define HASH2_CALC hv = cur[0] | ((UInt32)cur[1] << 8); - -#define HASH3_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; } - -#define HASH4_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - temp ^= ((UInt32)cur[2] << 8); \ - h3 = temp & (kHash3Size - 1); \ - hv = (temp ^ (p->crc[cur[3]] << 5)) & p->hashMask; } - -#define HASH5_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - temp ^= ((UInt32)cur[2] << 8); \ - h3 = temp & (kHash3Size - 1); \ - temp ^= (p->crc[cur[3]] << 5); \ - h4 = temp & (kHash4Size - 1); \ - hv = (temp ^ (p->crc[cur[4]] << 3)) & p->hashMask; } - -/* #define HASH_ZIP_CALC hv = ((cur[0] | 
((UInt32)cur[1] << 8)) ^ p->crc[cur[2]]) & 0xFFFF; */ -#define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF; - - -#define MT_HASH2_CALC \ - h2 = (p->crc[cur[0]] ^ cur[1]) & (kHash2Size - 1); - -#define MT_HASH3_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); } - -#define MT_HASH4_CALC { \ - UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ - h2 = temp & (kHash2Size - 1); \ - temp ^= ((UInt32)cur[2] << 8); \ - h3 = temp & (kHash3Size - 1); \ - h4 = (temp ^ (p->crc[cur[3]] << 5)) & (kHash4Size - 1); } +// #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size) + +/* + We use up to 3 crc values for hash: + crc0 + crc1 << Shift_1 + crc2 << Shift_2 + (Shift_1 = 5) and (Shift_2 = 10) is good tradeoff. + Small values for Shift are not good for collision rate. + Big value for Shift_2 increases the minimum size + of hash table, that will be slow for small files. +*/ + +#define kLzHash_CrcShift_1 5 +#define kLzHash_CrcShift_2 10 #endif diff --git a/src/sdk/C/Lzma2Dec.c b/src/sdk/C/Lzma2Dec.c index 4e138a4..8bf54e4 100644 --- a/src/sdk/C/Lzma2Dec.c +++ b/src/sdk/C/Lzma2Dec.c @@ -1,5 +1,5 @@ /* Lzma2Dec.c -- LZMA2 Decoder -2019-02-02 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ /* #define SHOW_DEBUG_INFO */ @@ -71,14 +71,14 @@ static SRes Lzma2Dec_GetOldProps(Byte prop, Byte *props) SRes Lzma2Dec_AllocateProbs(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc) { Byte props[LZMA_PROPS_SIZE]; - RINOK(Lzma2Dec_GetOldProps(prop, props)); + RINOK(Lzma2Dec_GetOldProps(prop, props)) return LzmaDec_AllocateProbs(&p->decoder, props, LZMA_PROPS_SIZE, alloc); } SRes Lzma2Dec_Allocate(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc) { Byte props[LZMA_PROPS_SIZE]; - RINOK(Lzma2Dec_GetOldProps(prop, props)); + RINOK(Lzma2Dec_GetOldProps(prop, props)) return LzmaDec_Allocate(&p->decoder, props, LZMA_PROPS_SIZE, alloc); } @@ -93,7 +93,8 @@ void Lzma2Dec_Init(CLzma2Dec *p) LzmaDec_Init(&p->decoder); } -static ELzma2State Lzma2Dec_UpdateState(CLzma2Dec *p, Byte b) +// ELzma2State +static unsigned Lzma2Dec_UpdateState(CLzma2Dec *p, Byte b) { switch (p->state) { @@ -156,8 +157,10 @@ static ELzma2State Lzma2Dec_UpdateState(CLzma2Dec *p, Byte b) p->decoder.prop.lp = (Byte)lp; return LZMA2_STATE_DATA; } + + default: + return LZMA2_STATE_ERROR; } - return LZMA2_STATE_ERROR; } static void LzmaDec_UpdateWithUncompressed(CLzmaDec *p, const Byte *src, SizeT size) @@ -473,8 +476,8 @@ SRes Lzma2Decode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, SizeT outSize = *destLen, inSize = *srcLen; *destLen = *srcLen = 0; *status = LZMA_STATUS_NOT_SPECIFIED; - Lzma2Dec_Construct(&p); - RINOK(Lzma2Dec_AllocateProbs(&p, prop, alloc)); + Lzma2Dec_CONSTRUCT(&p) + RINOK(Lzma2Dec_AllocateProbs(&p, prop, alloc)) p.decoder.dic = dest; p.decoder.dicBufSize = outSize; Lzma2Dec_Init(&p); @@ -486,3 +489,5 @@ SRes Lzma2Decode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, Lzma2Dec_FreeProbs(&p, alloc); return res; } + +#undef PRF diff --git a/src/sdk/C/Lzma2Dec.h b/src/sdk/C/Lzma2Dec.h index b8ddeac..1f5233a 100644 --- a/src/sdk/C/Lzma2Dec.h +++ b/src/sdk/C/Lzma2Dec.h @@ -1,8 +1,8 @@ /* Lzma2Dec.h -- LZMA2 Decoder -2018-02-19 : Igor Pavlov : Public domain */ +2023-03-03 : Igor Pavlov : Public domain */ -#ifndef __LZMA2_DEC_H -#define __LZMA2_DEC_H +#ifndef ZIP7_INC_LZMA2_DEC_H +#define ZIP7_INC_LZMA2_DEC_H #include "LzmaDec.h" @@ -22,9 +22,10 @@ typedef struct CLzmaDec decoder; } 
CLzma2Dec; -#define Lzma2Dec_Construct(p) LzmaDec_Construct(&(p)->decoder) -#define Lzma2Dec_FreeProbs(p, alloc) LzmaDec_FreeProbs(&(p)->decoder, alloc) -#define Lzma2Dec_Free(p, alloc) LzmaDec_Free(&(p)->decoder, alloc) +#define Lzma2Dec_CONSTRUCT(p) LzmaDec_CONSTRUCT(&(p)->decoder) +#define Lzma2Dec_Construct(p) Lzma2Dec_CONSTRUCT(p) +#define Lzma2Dec_FreeProbs(p, alloc) LzmaDec_FreeProbs(&(p)->decoder, alloc) +#define Lzma2Dec_Free(p, alloc) LzmaDec_Free(&(p)->decoder, alloc) SRes Lzma2Dec_AllocateProbs(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc); SRes Lzma2Dec_Allocate(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc); @@ -90,7 +91,7 @@ Lzma2Dec_GetUnpackExtra() returns the value that shows at current input positon. */ -#define Lzma2Dec_GetUnpackExtra(p) ((p)->isExtraMode ? (p)->unpackSize : 0); +#define Lzma2Dec_GetUnpackExtra(p) ((p)->isExtraMode ? (p)->unpackSize : 0) /* ---------- One Call Interface ---------- */ diff --git a/src/sdk/C/Lzma2DecMt.c b/src/sdk/C/Lzma2DecMt.c index 988643d..4bc4dde 100644 --- a/src/sdk/C/Lzma2DecMt.c +++ b/src/sdk/C/Lzma2DecMt.c @@ -1,44 +1,44 @@ /* Lzma2DecMt.c -- LZMA2 Decoder Multi-thread -2019-02-02 : Igor Pavlov : Public domain */ +2023-04-13 : Igor Pavlov : Public domain */ #include "Precomp.h" // #define SHOW_DEBUG_INFO +// #define Z7_ST #ifdef SHOW_DEBUG_INFO #include #endif -#ifdef SHOW_DEBUG_INFO -#define PRF(x) x -#else -#define PRF(x) -#endif - -#define PRF_STR(s) PRF(printf("\n" s "\n")) -#define PRF_STR_INT(s, d) PRF(printf("\n" s " %d\n", (unsigned)d)) -#define PRF_STR_INT_2(s, d1, d2) PRF(printf("\n" s " %d %d\n", (unsigned)d1, (unsigned)d2)) - -// #define _7ZIP_ST - #include "Alloc.h" #include "Lzma2Dec.h" #include "Lzma2DecMt.h" -#ifndef _7ZIP_ST +#ifndef Z7_ST #include "MtDec.h" + +#define LZMA2DECMT_OUT_BLOCK_MAX_DEFAULT (1 << 28) #endif -#define LZMA2DECMT_OUT_BLOCK_MAX_DEFAULT (1 << 28) +#ifndef Z7_ST +#ifdef SHOW_DEBUG_INFO +#define PRF(x) x +#else +#define PRF(x) +#endif +#define PRF_STR(s) PRF(printf("\n" s "\n");) +#define PRF_STR_INT_2(s, d1, d2) PRF(printf("\n" s " %d %d\n", (unsigned)d1, (unsigned)d2);) +#endif + void Lzma2DecMtProps_Init(CLzma2DecMtProps *p) { p->inBufSize_ST = 1 << 20; p->outStep_ST = 1 << 20; - #ifndef _7ZIP_ST + #ifndef Z7_ST p->numThreads = 1; p->inBufSize_MT = 1 << 18; p->outBlockMax = LZMA2DECMT_OUT_BLOCK_MAX_DEFAULT; @@ -48,7 +48,7 @@ void Lzma2DecMtProps_Init(CLzma2DecMtProps *p) -#ifndef _7ZIP_ST +#ifndef Z7_ST /* ---------- CLzma2DecMtThread ---------- */ @@ -81,7 +81,7 @@ typedef struct /* ---------- CLzma2DecMt ---------- */ -typedef struct +struct CLzma2DecMt { // ISzAllocPtr alloc; ISzAllocPtr allocMid; @@ -90,9 +90,9 @@ typedef struct CLzma2DecMtProps props; Byte prop; - ISeqInStream *inStream; - ISeqOutStream *outStream; - ICompressProgress *progress; + ISeqInStreamPtr inStream; + ISeqOutStreamPtr outStream; + ICompressProgressPtr progress; BoolInt finishMode; BoolInt outSize_Defined; @@ -111,14 +111,13 @@ typedef struct size_t inPos; size_t inLim; - #ifndef _7ZIP_ST + #ifndef Z7_ST UInt64 outProcessed_Parse; BoolInt mtc_WasConstructed; CMtDec mtc; - CLzma2DecMtThread coders[MTDEC__THREADS_MAX]; + CLzma2DecMtThread coders[MTDEC_THREADS_MAX]; #endif - -} CLzma2DecMt; +}; @@ -142,11 +141,11 @@ CLzma2DecMtHandle Lzma2DecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid) // Lzma2DecMtProps_Init(&p->props); - #ifndef _7ZIP_ST + #ifndef Z7_ST p->mtc_WasConstructed = False; { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CLzma2DecMtThread *t = 
&p->coders[i]; t->dec_created = False; @@ -156,16 +155,16 @@ CLzma2DecMtHandle Lzma2DecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid) } #endif - return p; + return (CLzma2DecMtHandle)(void *)p; } -#ifndef _7ZIP_ST +#ifndef Z7_ST static void Lzma2DecMt_FreeOutBufs(CLzma2DecMt *p) { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CLzma2DecMtThread *t = &p->coders[i]; if (t->outBuf) @@ -196,13 +195,15 @@ static void Lzma2DecMt_FreeSt(CLzma2DecMt *p) } -void Lzma2DecMt_Destroy(CLzma2DecMtHandle pp) +// #define GET_CLzma2DecMt_p CLzma2DecMt *p = (CLzma2DecMt *)(void *)pp; + +void Lzma2DecMt_Destroy(CLzma2DecMtHandle p) { - CLzma2DecMt *p = (CLzma2DecMt *)pp; + // GET_CLzma2DecMt_p Lzma2DecMt_FreeSt(p); - #ifndef _7ZIP_ST + #ifndef Z7_ST if (p->mtc_WasConstructed) { @@ -211,7 +212,7 @@ void Lzma2DecMt_Destroy(CLzma2DecMtHandle pp) } { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CLzma2DecMtThread *t = &p->coders[i]; if (t->dec_created) @@ -226,19 +227,19 @@ void Lzma2DecMt_Destroy(CLzma2DecMtHandle pp) #endif - ISzAlloc_Free(p->alignOffsetAlloc.baseAlloc, pp); + ISzAlloc_Free(p->alignOffsetAlloc.baseAlloc, p); } -#ifndef _7ZIP_ST +#ifndef Z7_ST static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCallbackInfo *cc) { CLzma2DecMt *me = (CLzma2DecMt *)obj; CLzma2DecMtThread *t = &me->coders[coderIndex]; - PRF_STR_INT_2("Parse", coderIndex, cc->srcSize); + PRF_STR_INT_2("Parse", coderIndex, cc->srcSize) cc->state = MTDEC_PARSE_CONTINUE; @@ -246,7 +247,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa { if (!t->dec_created) { - Lzma2Dec_Construct(&t->dec); + Lzma2Dec_CONSTRUCT(&t->dec) t->dec_created = True; AlignOffsetAlloc_CreateVTable(&t->alloc); { @@ -255,7 +256,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa const unsigned kNumAlignBits = 12; const unsigned kNumCacheLineBits = 7; /* <= kNumAlignBits */ t->alloc.numAlignBits = kNumAlignBits; - t->alloc.offset = ((UInt32)coderIndex * ((1 << 11) + (1 << 8) + (1 << 6))) & ((1 << kNumAlignBits) - (1 << kNumCacheLineBits)); + t->alloc.offset = ((UInt32)coderIndex * (((unsigned)1 << 11) + (1 << 8) + (1 << 6))) & (((unsigned)1 << kNumAlignBits) - ((unsigned)1 << kNumCacheLineBits)); t->alloc.baseAlloc = me->alignOffsetAlloc.baseAlloc; } } @@ -297,7 +298,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa // that must be finished at position <= outBlockMax. 
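/* Editorial sketch, not part of the patch: the LZMA2 chunk header layout that
   Lzma2Dec_Parse() walks while this Parse callback searches for a block
   boundary (a chunk that resets the dictionary). Layout follows the public
   LZMA2 format; struct and function names are illustrative and assume the
   SDK's Byte/UInt32 typedefs. */
typedef struct { unsigned ctrl; UInt32 unpackSize; UInt32 packSize; int dicReset; } Lzma2ChunkHdrSketch;

static int ParseLzma2ChunkHeader_Sketch(const Byte *p, size_t avail, Lzma2ChunkHdrSketch *h)
{
  if (avail < 1) return 0;
  h->ctrl = p[0];
  if (h->ctrl == 0)          /* end-of-stream marker */
  { h->unpackSize = h->packSize = 0; h->dicReset = 0; return 1; }
  if (h->ctrl < 0x80)        /* 1 or 2: uncompressed chunk (1 = with dictionary reset) */
  {
    if (h->ctrl > 2 || avail < 3) return 0;
    h->unpackSize = (((UInt32)p[1] << 8) | p[2]) + 1;
    h->packSize = h->unpackSize;               /* data is stored verbatim */
    h->dicReset = (h->ctrl == 1);
    return 1;
  }
  if (avail < 5) return 0;   /* LZMA chunk: 5 header bytes, plus 1 props byte if reset mode >= 2 */
  h->unpackSize = ((((UInt32)h->ctrl & 0x1F) << 16) | ((UInt32)p[1] << 8) | p[2]) + 1;
  h->packSize = (((UInt32)p[3] << 8) | p[4]) + 1;
  h->dicReset = (((h->ctrl >> 5) & 3) == 3);   /* reset mode 3 resets props and dictionary */
  return 1;
}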
{ - const SizeT srcOrig = cc->srcSize; + const size_t srcOrig = cc->srcSize; SizeT srcSize_Point = 0; SizeT dicPos_Point = 0; @@ -306,10 +307,10 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa for (;;) { - SizeT srcCur = srcOrig - cc->srcSize; + SizeT srcCur = (SizeT)(srcOrig - cc->srcSize); status = Lzma2Dec_Parse(&t->dec, - limit - t->dec.decoder.dicPos, + (SizeT)limit - t->dec.decoder.dicPos, cc->src + cc->srcSize, &srcCur, checkFinishBlock); @@ -333,7 +334,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa if (t->dec.decoder.dicPos >= (1 << 14)) break; dicPos_Point = t->dec.decoder.dicPos; - srcSize_Point = cc->srcSize; + srcSize_Point = (SizeT)cc->srcSize; continue; } @@ -391,7 +392,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa if (unpackRem != 0) { /* we also reserve space for max possible number of output bytes of current LZMA chunk */ - SizeT rem = limit - dicPos; + size_t rem = limit - dicPos; if (rem > unpackRem) rem = unpackRem; dicPos += rem; @@ -444,7 +445,7 @@ static SRes Lzma2DecMt_MtCallback_PreCode(void *pp, unsigned coderIndex) } t->dec.decoder.dic = dest; - t->dec.decoder.dicBufSize = t->outPreSize; + t->dec.decoder.dicBufSize = (SizeT)t->outPreSize; t->needInit = True; @@ -462,7 +463,7 @@ static SRes Lzma2DecMt_MtCallback_Code(void *pp, unsigned coderIndex, UNUSED_VAR(srcFinished) - PRF_STR_INT_2("Code", coderIndex, srcSize); + PRF_STR_INT_2("Code", coderIndex, srcSize) *inCodePos = t->inCodeSize; *outCodePos = 0; @@ -476,13 +477,13 @@ static SRes Lzma2DecMt_MtCallback_Code(void *pp, unsigned coderIndex, { ELzmaStatus status; - size_t srcProcessed = srcSize; + SizeT srcProcessed = (SizeT)srcSize; BoolInt blockWasFinished = ((int)t->parseStatus == LZMA_STATUS_FINISHED_WITH_MARK || t->parseStatus == LZMA2_PARSE_STATUS_NEW_BLOCK); SRes res = Lzma2Dec_DecodeToDic(&t->dec, - t->outPreSize, + (SizeT)t->outPreSize, src, &srcProcessed, blockWasFinished ? 
LZMA_FINISH_END : LZMA_FINISH_ANY, &status); @@ -527,7 +528,7 @@ static SRes Lzma2DecMt_MtCallback_Code(void *pp, unsigned coderIndex, static SRes Lzma2DecMt_MtCallback_Write(void *pp, unsigned coderIndex, BoolInt needWriteToStream, - const Byte *src, size_t srcSize, + const Byte *src, size_t srcSize, BoolInt isCross, BoolInt *needContinue, BoolInt *canRecode) { CLzma2DecMt *me = (CLzma2DecMt *)pp; @@ -536,12 +537,14 @@ static SRes Lzma2DecMt_MtCallback_Write(void *pp, unsigned coderIndex, const Byte *data = t->outBuf; BoolInt needContinue2 = True; - PRF_STR_INT_2("Write", coderIndex, srcSize); + UNUSED_VAR(src) + UNUSED_VAR(srcSize) + UNUSED_VAR(isCross) + + PRF_STR_INT_2("Write", coderIndex, srcSize) *needContinue = False; *canRecode = True; - UNUSED_VAR(src) - UNUSED_VAR(srcSize) if ( // t->parseStatus == LZMA_STATUS_FINISHED_WITH_MARK @@ -586,7 +589,7 @@ static SRes Lzma2DecMt_MtCallback_Write(void *pp, unsigned coderIndex, *needContinue = needContinue2; return SZ_OK; } - RINOK(MtProgress_ProgressAdd(&me->mtc.mtProgress, 0, 0)); + RINOK(MtProgress_ProgressAdd(&me->mtc.mtProgress, 0, 0)) } } @@ -609,11 +612,11 @@ static SRes Lzma2Dec_Prepare_ST(CLzma2DecMt *p) { if (!p->dec_created) { - Lzma2Dec_Construct(&p->dec); + Lzma2Dec_CONSTRUCT(&p->dec) p->dec_created = True; } - RINOK(Lzma2Dec_Allocate(&p->dec, p->prop, &p->alignOffsetAlloc.vt)); + RINOK(Lzma2Dec_Allocate(&p->dec, p->prop, &p->alignOffsetAlloc.vt)) if (!p->inBuf || p->inBufSize != p->props.inBufSize_ST) { @@ -632,7 +635,7 @@ static SRes Lzma2Dec_Prepare_ST(CLzma2DecMt *p) static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p - #ifndef _7ZIP_ST + #ifndef Z7_ST , BoolInt tMode #endif ) @@ -644,7 +647,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p CLzma2Dec *dec; - #ifndef _7ZIP_ST + #ifndef Z7_ST if (tMode) { Lzma2DecMt_FreeOutBufs(p); @@ -652,7 +655,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p } #endif - RINOK(Lzma2Dec_Prepare_ST(p)); + RINOK(Lzma2Dec_Prepare_ST(p)) dec = &p->dec; @@ -679,7 +682,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p if (inPos == inLim) { - #ifndef _7ZIP_ST + #ifndef Z7_ST if (tMode) { inData = MtDec_Read(&p->mtc, &inLim); @@ -696,7 +699,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p inPos = 0; inLim = p->inBufSize; inData = p->inBuf; - p->readRes = ISeqInStream_Read(p->inStream, (void *)inData, &inLim); + p->readRes = ISeqInStream_Read(p->inStream, (void *)(p->inBuf), &inLim); // p->readProcessed += inLim; // inLim -= 5; p->readWasFinished = True; // for test if (inLim == 0 || p->readRes != SZ_OK) @@ -708,7 +711,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p { SizeT next = dec->decoder.dicBufSize; if (next - wrPos > p->props.outStep_ST) - next = wrPos + p->props.outStep_ST; + next = wrPos + (SizeT)p->props.outStep_ST; size = next - dicPos; } @@ -724,7 +727,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p } } - inProcessed = inLim - inPos; + inProcessed = (SizeT)(inLim - inPos); res = Lzma2Dec_DecodeToDic(dec, dicPos + size, inData + inPos, &inProcessed, finishMode, &status); @@ -753,7 +756,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p dec->decoder.dicPos = 0; wrPos = dec->decoder.dicPos; - RINOK(res2); + RINOK(res2) if (needStop) { @@ -786,7 +789,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p UInt64 outDelta = p->outProcessed - outPrev; if (inDelta >= (1 << 22) || outDelta >= (1 << 22)) { - RINOK(ICompressProgress_Progress(p->progress, p->inProcessed, p->outProcessed)); + RINOK(ICompressProgress_Progress(p->progress, p->inProcessed, p->outProcessed)) inPrev = p->inProcessed; 
outPrev = p->outProcessed; } @@ -796,20 +799,20 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p -SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, +SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p, Byte prop, const CLzma2DecMtProps *props, - ISeqOutStream *outStream, const UInt64 *outDataSize, int finishMode, + ISeqOutStreamPtr outStream, const UInt64 *outDataSize, int finishMode, // Byte *outBuf, size_t *outBufSize, - ISeqInStream *inStream, + ISeqInStreamPtr inStream, // const Byte *inData, size_t inDataSize, UInt64 *inProcessed, // UInt64 *outProcessed, int *isMT, - ICompressProgress *progress) + ICompressProgressPtr progress) { - CLzma2DecMt *p = (CLzma2DecMt *)pp; - #ifndef _7ZIP_ST + // GET_CLzma2DecMt_p + #ifndef Z7_ST BoolInt tMode; #endif @@ -838,11 +841,12 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, p->inProcessed = 0; p->readWasFinished = False; + p->readRes = SZ_OK; *isMT = False; - #ifndef _7ZIP_ST + #ifndef Z7_ST tMode = False; @@ -856,7 +860,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, if (p->props.numThreads > 1) { - IMtDecCallback vt; + IMtDecCallback2 vt; Lzma2DecMt_FreeSt(p); @@ -936,7 +940,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, p->readWasFinished = p->mtc.readWasFinished; p->inProcessed = p->mtc.inProcessed; - PRF_STR("----- decoding ST -----"); + PRF_STR("----- decoding ST -----") } } @@ -947,7 +951,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, { SRes res = Lzma2Dec_Decode_ST(p - #ifndef _7ZIP_ST + #ifndef Z7_ST , tMode #endif ); @@ -955,11 +959,16 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, *inProcessed = p->inProcessed; // res = SZ_OK; // for test - if (res == SZ_OK && p->readRes != SZ_OK) + if (res == SZ_ERROR_INPUT_EOF) + { + if (p->readRes != SZ_OK) + res = p->readRes; + } + else if (res == SZ_OK && p->readRes != SZ_OK) res = p->readRes; /* - #ifndef _7ZIP_ST + #ifndef Z7_ST if (res == SZ_OK && tMode && p->mtc.parseRes != SZ_OK) res = p->mtc.parseRes; #endif @@ -972,13 +981,13 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, /* ---------- Read from CLzma2DecMtHandle Interface ---------- */ -SRes Lzma2DecMt_Init(CLzma2DecMtHandle pp, +SRes Lzma2DecMt_Init(CLzma2DecMtHandle p, Byte prop, const CLzma2DecMtProps *props, const UInt64 *outDataSize, int finishMode, - ISeqInStream *inStream) + ISeqInStreamPtr inStream) { - CLzma2DecMt *p = (CLzma2DecMt *)pp; + // GET_CLzma2DecMt_p if (prop > 40) return SZ_ERROR_UNSUPPORTED; @@ -1007,11 +1016,11 @@ SRes Lzma2DecMt_Init(CLzma2DecMtHandle pp, } -SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp, +SRes Lzma2DecMt_Read(CLzma2DecMtHandle p, Byte *data, size_t *outSize, UInt64 *inStreamProcessed) { - CLzma2DecMt *p = (CLzma2DecMt *)pp; + // GET_CLzma2DecMt_p ELzmaFinishMode finishMode; SRes readRes; size_t size = *outSize; @@ -1047,8 +1056,8 @@ SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp, readRes = ISeqInStream_Read(p->inStream, p->inBuf, &p->inLim); } - inCur = p->inLim - p->inPos; - outCur = size; + inCur = (SizeT)(p->inLim - p->inPos); + outCur = (SizeT)size; res = Lzma2Dec_DecodeToBuf(&p->dec, data, &outCur, p->inBuf + p->inPos, &inCur, finishMode, &status); @@ -1080,3 +1089,7 @@ SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp, return readRes; } } + +#undef PRF +#undef PRF_STR +#undef PRF_STR_INT_2 diff --git a/src/sdk/C/Lzma2DecMt.h b/src/sdk/C/Lzma2DecMt.h index 7791c31..93a5cd5 100644 --- a/src/sdk/C/Lzma2DecMt.h +++ b/src/sdk/C/Lzma2DecMt.h @@ -1,8 +1,8 @@ /* Lzma2DecMt.h -- LZMA2 Decoder Multi-thread -2018-02-17 : Igor Pavlov : Public domain */ +2023-04-13 : Igor Pavlov : Public domain */ -#ifndef __LZMA2_DEC_MT_H 
-#define __LZMA2_DEC_MT_H +#ifndef ZIP7_INC_LZMA2_DEC_MT_H +#define ZIP7_INC_LZMA2_DEC_MT_H #include "7zTypes.h" @@ -13,7 +13,7 @@ typedef struct size_t inBufSize_ST; size_t outStep_ST; - #ifndef _7ZIP_ST + #ifndef Z7_ST unsigned numThreads; size_t inBufSize_MT; size_t outBlockMax; @@ -38,7 +38,9 @@ void Lzma2DecMtProps_Init(CLzma2DecMtProps *p); SZ_ERROR_THREAD - error in multithreading functions (only for Mt version) */ -typedef void * CLzma2DecMtHandle; +typedef struct CLzma2DecMt CLzma2DecMt; +typedef CLzma2DecMt * CLzma2DecMtHandle; +// Z7_DECLARE_HANDLE(CLzma2DecMtHandle) CLzma2DecMtHandle Lzma2DecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid); void Lzma2DecMt_Destroy(CLzma2DecMtHandle p); @@ -46,11 +48,11 @@ void Lzma2DecMt_Destroy(CLzma2DecMtHandle p); SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p, Byte prop, const CLzma2DecMtProps *props, - ISeqOutStream *outStream, + ISeqOutStreamPtr outStream, const UInt64 *outDataSize, // NULL means undefined int finishMode, // 0 - partial unpacking is allowed, 1 - if lzma2 stream must be finished // Byte *outBuf, size_t *outBufSize, - ISeqInStream *inStream, + ISeqInStreamPtr inStream, // const Byte *inData, size_t inDataSize, // out variables: @@ -58,7 +60,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p, int *isMT, /* out: (*isMT == 0), if single thread decoding was used */ // UInt64 *outProcessed, - ICompressProgress *progress); + ICompressProgressPtr progress); /* ---------- Read from CLzma2DecMtHandle Interface ---------- */ @@ -67,7 +69,7 @@ SRes Lzma2DecMt_Init(CLzma2DecMtHandle pp, Byte prop, const CLzma2DecMtProps *props, const UInt64 *outDataSize, int finishMode, - ISeqInStream *inStream); + ISeqInStreamPtr inStream); SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp, Byte *data, size_t *outSize, diff --git a/src/sdk/C/Lzma2Enc.c b/src/sdk/C/Lzma2Enc.c index 5c1ad49..72aec69 100644 --- a/src/sdk/C/Lzma2Enc.c +++ b/src/sdk/C/Lzma2Enc.c @@ -1,18 +1,18 @@ /* Lzma2Enc.c -- LZMA2 Encoder -2018-07-04 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include <string.h> -/* #define _7ZIP_ST */ +/* #define Z7_ST */ #include "Lzma2Enc.h" -#ifndef _7ZIP_ST +#ifndef Z7_ST #include "MtCoder.h" #else -#define MTCODER__THREADS_MAX 1 +#define MTCODER_THREADS_MAX 1 #endif #define LZMA2_CONTROL_LZMA (1 << 7) @@ -40,7 +40,7 @@ typedef struct { ISeqInStream vt; - ISeqInStream *realStream; + ISeqInStreamPtr realStream; UInt64 limit; UInt64 processed; int finished; @@ -53,15 +53,15 @@ static void LimitedSeqInStream_Init(CLimitedSeqInStream *p) { p->finished = 0; } -static SRes LimitedSeqInStream_Read(const ISeqInStream *pp, void *data, size_t *size) +static SRes LimitedSeqInStream_Read(ISeqInStreamPtr pp, void *data, size_t *size) { - CLimitedSeqInStream *p = CONTAINER_FROM_VTBL(pp, CLimitedSeqInStream, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CLimitedSeqInStream) size_t size2 = *size; SRes res = SZ_OK; if (p->limit != (UInt64)(Int64)-1) { - UInt64 rem = p->limit - p->processed; + const UInt64 rem = p->limit - p->processed; if (size2 > rem) size2 = (size_t)rem; } @@ -95,8 +95,8 @@ static SRes Lzma2EncInt_InitStream(CLzma2EncInt *p, const CLzma2EncProps *props) { SizeT propsSize = LZMA_PROPS_SIZE; Byte propsEncoded[LZMA_PROPS_SIZE]; - RINOK(LzmaEnc_SetProps(p->enc, &props->lzmaProps)); - RINOK(LzmaEnc_WriteProperties(p->enc, propsEncoded, &propsSize)); + RINOK(LzmaEnc_SetProps(p->enc, &props->lzmaProps)) + RINOK(LzmaEnc_WriteProperties(p->enc, propsEncoded, &propsSize)) p->propsByte = propsEncoded[0]; p->propsAreSet = True; } @@
-111,23 +111,23 @@ static void Lzma2EncInt_InitBlock(CLzma2EncInt *p) } -SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle pp, ISeqInStream *inStream, UInt32 keepWindowSize, +SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle p, ISeqInStreamPtr inStream, UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig); -SRes LzmaEnc_MemPrepare(CLzmaEncHandle pp, const Byte *src, SizeT srcLen, +SRes LzmaEnc_MemPrepare(CLzmaEncHandle p, const Byte *src, SizeT srcLen, UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig); -SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, BoolInt reInit, +SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle p, BoolInt reInit, Byte *dest, size_t *destLen, UInt32 desiredPackSize, UInt32 *unpackSize); -const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle pp); -void LzmaEnc_Finish(CLzmaEncHandle pp); -void LzmaEnc_SaveState(CLzmaEncHandle pp); -void LzmaEnc_RestoreState(CLzmaEncHandle pp); +const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle p); +void LzmaEnc_Finish(CLzmaEncHandle p); +void LzmaEnc_SaveState(CLzmaEncHandle p); +void LzmaEnc_RestoreState(CLzmaEncHandle p); /* -UInt32 LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle pp); +UInt32 LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle p); */ static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf, - size_t *packSizeRes, ISeqOutStream *outStream) + size_t *packSizeRes, ISeqOutStreamPtr outStream) { size_t packSizeLimit = *packSizeRes; size_t packSize = packSizeLimit; @@ -167,7 +167,7 @@ static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf, while (unpackSize > 0) { - UInt32 u = (unpackSize < LZMA2_COPY_CHUNK_SIZE) ? unpackSize : LZMA2_COPY_CHUNK_SIZE; + const UInt32 u = (unpackSize < LZMA2_COPY_CHUNK_SIZE) ? unpackSize : LZMA2_COPY_CHUNK_SIZE; if (packSizeLimit - destPos < u + 3) return SZ_ERROR_OUTPUT_EOF; outBuf[destPos++] = (Byte)(p->srcPos == 0 ? LZMA2_CONTROL_COPY_RESET_DIC : LZMA2_CONTROL_COPY_NO_RESET); @@ -196,9 +196,9 @@ static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf, { size_t destPos = 0; - UInt32 u = unpackSize - 1; - UInt32 pm = (UInt32)(packSize - 1); - unsigned mode = (p->srcPos == 0) ? 3 : (p->needInitState ? (p->needInitProp ? 2 : 1) : 0); + const UInt32 u = unpackSize - 1; + const UInt32 pm = (UInt32)(packSize - 1); + const unsigned mode = (p->srcPos == 0) ? 3 : (p->needInitState ? (p->needInitProp ? 
2 : 1) : 0); PRF(printf(" ")); @@ -231,10 +231,11 @@ static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf, void Lzma2EncProps_Init(CLzma2EncProps *p) { LzmaEncProps_Init(&p->lzmaProps); - p->blockSize = LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO; + p->blockSize = LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO; p->numBlockThreads_Reduced = -1; p->numBlockThreads_Max = -1; p->numTotalThreads = -1; + p->numThreadGroups = 0; } void Lzma2EncProps_Normalize(CLzma2EncProps *p) @@ -251,8 +252,8 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p) t2 = p->numBlockThreads_Max; t3 = p->numTotalThreads; - if (t2 > MTCODER__THREADS_MAX) - t2 = MTCODER__THREADS_MAX; + if (t2 > MTCODER_THREADS_MAX) + t2 = MTCODER_THREADS_MAX; if (t3 <= 0) { @@ -268,8 +269,8 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p) t1 = 1; t2 = t3; } - if (t2 > MTCODER__THREADS_MAX) - t2 = MTCODER__THREADS_MAX; + if (t2 > MTCODER_THREADS_MAX) + t2 = MTCODER_THREADS_MAX; } else if (t1 <= 0) { @@ -286,8 +287,8 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p) fileSize = p->lzmaProps.reduceSize; - if ( p->blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID - && p->blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO + if ( p->blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID + && p->blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO && (p->blockSize < fileSize || fileSize == (UInt64)(Int64)-1)) p->lzmaProps.reduceSize = p->blockSize; @@ -297,19 +298,19 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p) t1 = p->lzmaProps.numThreads; - if (p->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID) + if (p->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID) { t2r = t2 = 1; t3 = t1; } - else if (p->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO && t2 <= 1) + else if (p->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO && t2 <= 1) { /* if there is no block multi-threading, we use SOLID block */ - p->blockSize = LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID; + p->blockSize = LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID; } else { - if (p->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO) + if (p->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO) { const UInt32 kMinSize = (UInt32)1 << 20; const UInt32 kMaxSize = (UInt32)1 << 28; @@ -330,7 +331,7 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p) numBlocks++; if (numBlocks < (unsigned)t2) { - t2r = (unsigned)numBlocks; + t2r = (int)numBlocks; if (t2r == 0) t2r = 1; t3 = t1 * t2r; @@ -344,7 +345,7 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p) } -static SRes Progress(ICompressProgress *p, UInt64 inSize, UInt64 outSize) +static SRes Progress(ICompressProgressPtr p, UInt64 inSize, UInt64 outSize) { return (p && ICompressProgress_Progress(p, inSize, outSize) != SZ_OK) ? 
SZ_ERROR_PROGRESS : SZ_OK; } @@ -352,7 +353,7 @@ static SRes Progress(ICompressProgress *p, UInt64 inSize, UInt64 outSize) /* ---------- Lzma2 ---------- */ -typedef struct +struct CLzma2Enc { Byte propEncoded; CLzma2EncProps props; @@ -363,23 +364,22 @@ typedef struct ISzAllocPtr alloc; ISzAllocPtr allocBig; - CLzma2EncInt coders[MTCODER__THREADS_MAX]; + CLzma2EncInt coders[MTCODER_THREADS_MAX]; - #ifndef _7ZIP_ST + #ifndef Z7_ST - ISeqOutStream *outStream; + ISeqOutStreamPtr outStream; Byte *outBuf; size_t outBuf_Rem; /* remainder in outBuf */ size_t outBufSize; /* size of allocated outBufs[i] */ - size_t outBufsDataSizes[MTCODER__BLOCKS_MAX]; + size_t outBufsDataSizes[MTCODER_BLOCKS_MAX]; BoolInt mtCoder_WasConstructed; CMtCoder mtCoder; - Byte *outBufs[MTCODER__BLOCKS_MAX]; + Byte *outBufs[MTCODER_BLOCKS_MAX]; #endif - -} CLzma2Enc; +}; @@ -396,30 +396,30 @@ CLzma2EncHandle Lzma2Enc_Create(ISzAllocPtr alloc, ISzAllocPtr allocBig) p->allocBig = allocBig; { unsigned i; - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) p->coders[i].enc = NULL; } - #ifndef _7ZIP_ST + #ifndef Z7_ST p->mtCoder_WasConstructed = False; { unsigned i; - for (i = 0; i < MTCODER__BLOCKS_MAX; i++) + for (i = 0; i < MTCODER_BLOCKS_MAX; i++) p->outBufs[i] = NULL; p->outBufSize = 0; } #endif - return p; + return (CLzma2EncHandle)p; } -#ifndef _7ZIP_ST +#ifndef Z7_ST static void Lzma2Enc_FreeOutBufs(CLzma2Enc *p) { unsigned i; - for (i = 0; i < MTCODER__BLOCKS_MAX; i++) + for (i = 0; i < MTCODER_BLOCKS_MAX; i++) if (p->outBufs[i]) { ISzAlloc_Free(p->alloc, p->outBufs[i]); @@ -430,12 +430,13 @@ static void Lzma2Enc_FreeOutBufs(CLzma2Enc *p) #endif +// #define GET_CLzma2Enc_p CLzma2Enc *p = (CLzma2Enc *)(void *)p; -void Lzma2Enc_Destroy(CLzma2EncHandle pp) +void Lzma2Enc_Destroy(CLzma2EncHandle p) { - CLzma2Enc *p = (CLzma2Enc *)pp; + // GET_CLzma2Enc_p unsigned i; - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) { CLzma2EncInt *t = &p->coders[i]; if (t->enc) @@ -446,7 +447,7 @@ void Lzma2Enc_Destroy(CLzma2EncHandle pp) } - #ifndef _7ZIP_ST + #ifndef Z7_ST if (p->mtCoder_WasConstructed) { MtCoder_Destruct(&p->mtCoder); @@ -458,13 +459,13 @@ void Lzma2Enc_Destroy(CLzma2EncHandle pp) ISzAlloc_Free(p->alloc, p->tempBufLzma); p->tempBufLzma = NULL; - ISzAlloc_Free(p->alloc, pp); + ISzAlloc_Free(p->alloc, p); } -SRes Lzma2Enc_SetProps(CLzma2EncHandle pp, const CLzma2EncProps *props) +SRes Lzma2Enc_SetProps(CLzma2EncHandle p, const CLzma2EncProps *props) { - CLzma2Enc *p = (CLzma2Enc *)pp; + // GET_CLzma2Enc_p CLzmaEncProps lzmaProps = props->lzmaProps; LzmaEncProps_Normalize(&lzmaProps); if (lzmaProps.lc + lzmaProps.lp > LZMA2_LCLP_MAX) @@ -475,16 +476,16 @@ SRes Lzma2Enc_SetProps(CLzma2EncHandle pp, const CLzma2EncProps *props) } -void Lzma2Enc_SetDataSize(CLzmaEncHandle pp, UInt64 expectedDataSiize) +void Lzma2Enc_SetDataSize(CLzma2EncHandle p, UInt64 expectedDataSiize) { - CLzma2Enc *p = (CLzma2Enc *)pp; + // GET_CLzma2Enc_p p->expectedDataSize = expectedDataSiize; } -Byte Lzma2Enc_WriteProperties(CLzma2EncHandle pp) +Byte Lzma2Enc_WriteProperties(CLzma2EncHandle p) { - CLzma2Enc *p = (CLzma2Enc *)pp; + // GET_CLzma2Enc_p unsigned i; UInt32 dicSize = LzmaEncProps_GetDictSize(&p->props.lzmaProps); for (i = 0; i < 40; i++) @@ -497,12 +498,12 @@ Byte Lzma2Enc_WriteProperties(CLzma2EncHandle pp) static SRes Lzma2Enc_EncodeMt1( CLzma2Enc *me, CLzma2EncInt *p, - ISeqOutStream *outStream, + ISeqOutStreamPtr outStream, Byte *outBuf, size_t *outBufSize, 
- ISeqInStream *inStream, + ISeqInStreamPtr inStream, const Byte *inData, size_t inDataSize, int finished, - ICompressProgress *progress) + ICompressProgressPtr progress) { UInt64 unpackTotal = 0; UInt64 packTotal = 0; @@ -540,12 +541,12 @@ static SRes Lzma2Enc_EncodeMt1( } } - RINOK(Lzma2EncInt_InitStream(p, &me->props)); + RINOK(Lzma2EncInt_InitStream(p, &me->props)) for (;;) { SRes res = SZ_OK; - size_t inSizeCur = 0; + SizeT inSizeCur = 0; Lzma2EncInt_InitBlock(p); @@ -559,7 +560,7 @@ static SRes Lzma2Enc_EncodeMt1( if (me->expectedDataSize != (UInt64)(Int64)-1 && me->expectedDataSize >= unpackTotal) expected = me->expectedDataSize - unpackTotal; - if (me->props.blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID + if (me->props.blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID && expected > me->props.blockSize) expected = (size_t)me->props.blockSize; @@ -569,14 +570,14 @@ static SRes Lzma2Enc_EncodeMt1( &limitedInStream.vt, LZMA2_KEEP_WINDOW_SIZE, me->alloc, - me->allocBig)); + me->allocBig)) } else { - inSizeCur = inDataSize - (size_t)unpackTotal; - if (me->props.blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID + inSizeCur = (SizeT)(inDataSize - (size_t)unpackTotal); + if (me->props.blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID && inSizeCur > me->props.blockSize) - inSizeCur = (size_t)me->props.blockSize; + inSizeCur = (SizeT)(size_t)me->props.blockSize; // LzmaEnc_SetDataSize(p->enc, inSizeCur); @@ -584,7 +585,7 @@ static SRes Lzma2Enc_EncodeMt1( inData + (size_t)unpackTotal, inSizeCur, LZMA2_KEEP_WINDOW_SIZE, me->alloc, - me->allocBig)); + me->allocBig)) } for (;;) @@ -621,7 +622,7 @@ static SRes Lzma2Enc_EncodeMt1( unpackTotal += p->srcPos; - RINOK(res); + RINOK(res) if (p->srcPos != (inStream ? limitedInStream.processed : inSizeCur)) return SZ_ERROR_FAIL; @@ -632,15 +633,15 @@ static SRes Lzma2Enc_EncodeMt1( { if (outBuf) { - size_t destPos = *outBufSize; + const size_t destPos = *outBufSize; if (destPos >= outLim) return SZ_ERROR_OUTPUT_EOF; - outBuf[destPos] = 0; + outBuf[destPos] = LZMA2_CONTROL_EOF; // 0 *outBufSize = destPos + 1; } else { - Byte b = 0; + const Byte b = LZMA2_CONTROL_EOF; // 0; if (ISeqOutStream_Write(outStream, &b, 1) != 1) return SZ_ERROR_WRITE; } @@ -652,12 +653,12 @@ static SRes Lzma2Enc_EncodeMt1( -#ifndef _7ZIP_ST +#ifndef Z7_ST -static SRes Lzma2Enc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBufIndex, +static SRes Lzma2Enc_MtCallback_Code(void *p, unsigned coderIndex, unsigned outBufIndex, const Byte *src, size_t srcSize, int finished) { - CLzma2Enc *me = (CLzma2Enc *)pp; + CLzma2Enc *me = (CLzma2Enc *)p; size_t destSize = me->outBufSize; SRes res; CMtProgressThunk progressThunk; @@ -692,9 +693,9 @@ static SRes Lzma2Enc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned out } -static SRes Lzma2Enc_MtCallback_Write(void *pp, unsigned outBufIndex) +static SRes Lzma2Enc_MtCallback_Write(void *p, unsigned outBufIndex) { - CLzma2Enc *me = (CLzma2Enc *)pp; + CLzma2Enc *me = (CLzma2Enc *)p; size_t size = me->outBufsDataSizes[outBufIndex]; const Byte *data = me->outBufs[outBufIndex]; @@ -713,14 +714,14 @@ static SRes Lzma2Enc_MtCallback_Write(void *pp, unsigned outBufIndex) -SRes Lzma2Enc_Encode2(CLzma2EncHandle pp, - ISeqOutStream *outStream, +SRes Lzma2Enc_Encode2(CLzma2EncHandle p, + ISeqOutStreamPtr outStream, Byte *outBuf, size_t *outBufSize, - ISeqInStream *inStream, + ISeqInStreamPtr inStream, const Byte *inData, size_t inDataSize, - ICompressProgress *progress) + ICompressProgressPtr progress) { - CLzma2Enc *p = (CLzma2Enc *)pp; + // 
GET_CLzma2Enc_p if (inStream && inData) return SZ_ERROR_PARAM; @@ -730,11 +731,11 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle pp, { unsigned i; - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) p->coders[i].propsAreSet = False; } - #ifndef _7ZIP_ST + #ifndef Z7_ST if (p->props.numBlockThreads_Reduced > 1) { @@ -772,7 +773,7 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle pp, return SZ_ERROR_PARAM; /* SZ_ERROR_MEM */ { - size_t destBlockSize = p->mtCoder.blockSize + (p->mtCoder.blockSize >> 10) + 16; + const size_t destBlockSize = p->mtCoder.blockSize + (p->mtCoder.blockSize >> 10) + 16; if (destBlockSize < p->mtCoder.blockSize) return SZ_ERROR_PARAM; if (p->outBufSize != destBlockSize) @@ -780,13 +781,14 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle pp, p->outBufSize = destBlockSize; } - p->mtCoder.numThreadsMax = p->props.numBlockThreads_Max; + p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max; + p->mtCoder.numThreadGroups = p->props.numThreadGroups; p->mtCoder.expectedDataSize = p->expectedDataSize; { - SRes res = MtCoder_Code(&p->mtCoder); + const SRes res = MtCoder_Code(&p->mtCoder); if (!outStream) - *outBufSize = p->outBuf - outBuf; + *outBufSize = (size_t)(p->outBuf - outBuf); return res; } } @@ -801,3 +803,5 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle pp, True, /* finished */ progress); } + +#undef PRF diff --git a/src/sdk/C/Lzma2Enc.h b/src/sdk/C/Lzma2Enc.h index 6a6110f..1e6b50c 100644 --- a/src/sdk/C/Lzma2Enc.h +++ b/src/sdk/C/Lzma2Enc.h @@ -1,15 +1,15 @@ /* Lzma2Enc.h -- LZMA2 Encoder -2017-07-27 : Igor Pavlov : Public domain */ +2023-04-13 : Igor Pavlov : Public domain */ -#ifndef __LZMA2_ENC_H -#define __LZMA2_ENC_H +#ifndef ZIP7_INC_LZMA2_ENC_H +#define ZIP7_INC_LZMA2_ENC_H #include "LzmaEnc.h" EXTERN_C_BEGIN -#define LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO 0 -#define LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID ((UInt64)(Int64)-1) +#define LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO 0 +#define LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID ((UInt64)(Int64)-1) typedef struct { @@ -18,6 +18,7 @@ typedef struct int numBlockThreads_Reduced; int numBlockThreads_Max; int numTotalThreads; + unsigned numThreadGroups; // 0 : no groups } CLzma2EncProps; void Lzma2EncProps_Init(CLzma2EncProps *p); @@ -36,7 +37,9 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p); SZ_ERROR_THREAD - error in multithreading functions (only for Mt version) */ -typedef void * CLzma2EncHandle; +typedef struct CLzma2Enc CLzma2Enc; +typedef CLzma2Enc * CLzma2EncHandle; +// Z7_DECLARE_HANDLE(CLzma2EncHandle) CLzma2EncHandle Lzma2Enc_Create(ISzAllocPtr alloc, ISzAllocPtr allocBig); void Lzma2Enc_Destroy(CLzma2EncHandle p); @@ -44,11 +47,11 @@ SRes Lzma2Enc_SetProps(CLzma2EncHandle p, const CLzma2EncProps *props); void Lzma2Enc_SetDataSize(CLzma2EncHandle p, UInt64 expectedDataSiize); Byte Lzma2Enc_WriteProperties(CLzma2EncHandle p); SRes Lzma2Enc_Encode2(CLzma2EncHandle p, - ISeqOutStream *outStream, + ISeqOutStreamPtr outStream, Byte *outBuf, size_t *outBufSize, - ISeqInStream *inStream, + ISeqInStreamPtr inStream, const Byte *inData, size_t inDataSize, - ICompressProgress *progress); + ICompressProgressPtr progress); EXTERN_C_END diff --git a/src/sdk/C/Lzma86.h b/src/sdk/C/Lzma86.h index bebed5c..e7707e2 100644 --- a/src/sdk/C/Lzma86.h +++ b/src/sdk/C/Lzma86.h @@ -1,8 +1,8 @@ /* Lzma86.h -- LZMA + x86 (BCJ) Filter -2013-01-18 : Igor Pavlov : Public domain */ +2023-03-03 : Igor Pavlov : Public domain */ -#ifndef __LZMA86_H -#define __LZMA86_H +#ifndef ZIP7_INC_LZMA86_H +#define ZIP7_INC_LZMA86_H #include 
"7zTypes.h" diff --git a/src/sdk/C/Lzma86Dec.c b/src/sdk/C/Lzma86Dec.c index 2103174..f094d4c 100644 --- a/src/sdk/C/Lzma86Dec.c +++ b/src/sdk/C/Lzma86Dec.c @@ -1,5 +1,5 @@ /* Lzma86Dec.c -- LZMA + x86 (BCJ) Filter Decoder -2016-05-16 : Igor Pavlov : Public domain */ +2023-03-03 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -46,9 +46,8 @@ SRes Lzma86_Decode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen) return res; if (useFilter == 1) { - UInt32 x86State; - x86_Convert_Init(x86State); - x86_Convert(dest, *destLen, 0, &x86State, 0); + UInt32 x86State = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL; + z7_BranchConvSt_X86_Dec(dest, *destLen, 0, &x86State); } return SZ_OK; } diff --git a/src/sdk/C/Lzma86Enc.c b/src/sdk/C/Lzma86Enc.c index 2617bab..0cdde1c 100644 --- a/src/sdk/C/Lzma86Enc.c +++ b/src/sdk/C/Lzma86Enc.c @@ -1,5 +1,5 @@ /* Lzma86Enc.c -- LZMA + x86 (BCJ) Filter Encoder -2018-07-04 : Igor Pavlov : Public domain */ +2023-03-03 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -11,8 +11,6 @@ #include "Bra.h" #include "LzmaEnc.h" -#define SZE_OUT_OVERFLOW SZE_DATA_ERROR - int Lzma86_Encode(Byte *dest, size_t *destLen, const Byte *src, size_t srcLen, int level, UInt32 dictSize, int filterMode) { @@ -48,9 +46,8 @@ int Lzma86_Encode(Byte *dest, size_t *destLen, const Byte *src, size_t srcLen, memcpy(filteredStream, src, srcLen); } { - UInt32 x86State; - x86_Convert_Init(x86State); - x86_Convert(filteredStream, srcLen, 0, &x86State, 1); + UInt32 x86State = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL; + z7_BranchConvSt_X86_Enc(filteredStream, srcLen, 0, &x86State); } } diff --git a/src/sdk/C/LzmaDec.c b/src/sdk/C/LzmaDec.c index ba3e1dd..69bb8bb 100644 --- a/src/sdk/C/LzmaDec.c +++ b/src/sdk/C/LzmaDec.c @@ -1,5 +1,5 @@ /* LzmaDec.c -- LZMA Decoder -2018-07-04 : Igor Pavlov : Public domain */ +2023-04-07 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -8,29 +8,31 @@ /* #include "CpuArch.h" */ #include "LzmaDec.h" -#define kNumTopBits 24 -#define kTopValue ((UInt32)1 << kNumTopBits) +// #define kNumTopBits 24 +#define kTopValue ((UInt32)1 << 24) #define kNumBitModelTotalBits 11 #define kBitModelTotal (1 << kNumBitModelTotalBits) -#define kNumMoveBits 5 #define RC_INIT_SIZE 5 +#ifndef Z7_LZMA_DEC_OPT + +#define kNumMoveBits 5 #define NORMALIZE if (range < kTopValue) { range <<= 8; code = (code << 8) | (*buf++); } #define IF_BIT_0(p) ttt = *(p); NORMALIZE; bound = (range >> kNumBitModelTotalBits) * (UInt32)ttt; if (code < bound) #define UPDATE_0(p) range = bound; *(p) = (CLzmaProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); #define UPDATE_1(p) range -= bound; code -= bound; *(p) = (CLzmaProb)(ttt - (ttt >> kNumMoveBits)); #define GET_BIT2(p, i, A0, A1) IF_BIT_0(p) \ - { UPDATE_0(p); i = (i + i); A0; } else \ - { UPDATE_1(p); i = (i + i) + 1; A1; } + { UPDATE_0(p) i = (i + i); A0; } else \ + { UPDATE_1(p) i = (i + i) + 1; A1; } #define TREE_GET_BIT(probs, i) { GET_BIT2(probs + i, i, ;, ;); } #define REV_BIT(p, i, A0, A1) IF_BIT_0(p + i) \ - { UPDATE_0(p + i); A0; } else \ - { UPDATE_1(p + i); A1; } + { UPDATE_0(p + i) A0; } else \ + { UPDATE_1(p + i) A1; } #define REV_BIT_VAR( p, i, m) REV_BIT(p, i, i += m; m += m, m += m; i += m; ) #define REV_BIT_CONST(p, i, m) REV_BIT(p, i, i += m; , i += m * 2; ) #define REV_BIT_LAST( p, i, m) REV_BIT(p, i, i -= m , ; ) @@ -38,19 +40,19 @@ #define TREE_DECODE(probs, limit, i) \ { i = 1; do { TREE_GET_BIT(probs, i); } while (i < limit); i -= limit; } -/* #define _LZMA_SIZE_OPT */ +/* #define Z7_LZMA_SIZE_OPT */ -#ifdef 
_LZMA_SIZE_OPT +#ifdef Z7_LZMA_SIZE_OPT #define TREE_6_DECODE(probs, i) TREE_DECODE(probs, (1 << 6), i) #else #define TREE_6_DECODE(probs, i) \ { i = 1; \ - TREE_GET_BIT(probs, i); \ - TREE_GET_BIT(probs, i); \ - TREE_GET_BIT(probs, i); \ - TREE_GET_BIT(probs, i); \ - TREE_GET_BIT(probs, i); \ - TREE_GET_BIT(probs, i); \ + TREE_GET_BIT(probs, i) \ + TREE_GET_BIT(probs, i) \ + TREE_GET_BIT(probs, i) \ + TREE_GET_BIT(probs, i) \ + TREE_GET_BIT(probs, i) \ + TREE_GET_BIT(probs, i) \ i -= 0x40; } #endif @@ -62,24 +64,25 @@ probLit = prob + (offs + bit + symbol); \ GET_BIT2(probLit, symbol, offs ^= bit; , ;) +#endif // Z7_LZMA_DEC_OPT -#define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_ERROR; range <<= 8; code = (code << 8) | (*buf++); } +#define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_INPUT_EOF; range <<= 8; code = (code << 8) | (*buf++); } -#define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK; bound = (range >> kNumBitModelTotalBits) * (UInt32)ttt; if (code < bound) +#define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK bound = (range >> kNumBitModelTotalBits) * (UInt32)ttt; if (code < bound) #define UPDATE_0_CHECK range = bound; #define UPDATE_1_CHECK range -= bound; code -= bound; #define GET_BIT2_CHECK(p, i, A0, A1) IF_BIT_0_CHECK(p) \ - { UPDATE_0_CHECK; i = (i + i); A0; } else \ - { UPDATE_1_CHECK; i = (i + i) + 1; A1; } + { UPDATE_0_CHECK i = (i + i); A0; } else \ + { UPDATE_1_CHECK i = (i + i) + 1; A1; } #define GET_BIT_CHECK(p, i) GET_BIT2_CHECK(p, i, ; , ;) #define TREE_DECODE_CHECK(probs, limit, i) \ { i = 1; do { GET_BIT_CHECK(probs + i, i) } while (i < limit); i -= limit; } #define REV_BIT_CHECK(p, i, m) IF_BIT_0_CHECK(p + i) \ - { UPDATE_0_CHECK; i += m; m += m; } else \ - { UPDATE_1_CHECK; m += m; i += m; } + { UPDATE_0_CHECK i += m; m += m; } else \ + { UPDATE_1_CHECK m += m; i += m; } #define kNumPosBitsMax 4 @@ -114,6 +117,9 @@ #define kMatchMinLen 2 #define kMatchSpecLenStart (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols) +#define kMatchSpecLen_Error_Data (1 << 9) +#define kMatchSpecLen_Error_Fail (kMatchSpecLen_Error_Data - 1) + /* External ASM code needs same CLzmaProb array layout. So don't change it. */ /* (probs_1664) is faster and better for code size at some platforms */ @@ -166,10 +172,12 @@ /* p->remainLen : shows status of LZMA decoder: - < kMatchSpecLenStart : normal remain - = kMatchSpecLenStart : finished - = kMatchSpecLenStart + 1 : need init range coder - = kMatchSpecLenStart + 2 : need init range coder and state + < kMatchSpecLenStart : the number of bytes to be copied with (p->rep0) offset + = kMatchSpecLenStart : the LZMA stream was finished with end mark + = kMatchSpecLenStart + 1 : need init range coder + = kMatchSpecLenStart + 2 : need init range coder and state + = kMatchSpecLen_Error_Fail : Internal Code Failure + = kMatchSpecLen_Error_Data + [0 ... 273] : LZMA Data Error */ /* ---------- LZMA_DECODE_REAL ---------- */ @@ -188,34 +196,42 @@ LZMA_DECODE_REAL() { LzmaDec_TryDummy() was called before to exclude LITERAL and MATCH-REP cases. So first symbol can be only MATCH-NON-REP. And if that MATCH-NON-REP symbol - is not END_OF_PAYALOAD_MARKER, then function returns error code. + is not END_OF_PAYALOAD_MARKER, then the function doesn't write any byte to dictionary, + the function returns SZ_OK, and the caller can use (p->remainLen) and (p->reps[0]) later. 
} Processing: - first LZMA symbol will be decoded in any case - All checks for limits are at the end of main loop, - It will decode new LZMA-symbols while (p->buf < bufLimit && dicPos < limit), + The first LZMA symbol will be decoded in any case. + All main checks for limits are at the end of main loop, + It decodes additional LZMA-symbols while (p->buf < bufLimit && dicPos < limit), RangeCoder is still without last normalization when (p->buf < bufLimit) is being checked. + But if (p->buf < bufLimit), the caller provided at least (LZMA_REQUIRED_INPUT_MAX + 1) bytes for + next iteration before limit (bufLimit + LZMA_REQUIRED_INPUT_MAX), + that is enough for worst case LZMA symbol with one additional RangeCoder normalization for one bit. + So that function never reads bufLimit [LZMA_REQUIRED_INPUT_MAX] byte. Out: RangeCoder is normalized Result: SZ_OK - OK - SZ_ERROR_DATA - Error - p->remainLen: - < kMatchSpecLenStart : normal remain - = kMatchSpecLenStart : finished + p->remainLen: + < kMatchSpecLenStart : the number of bytes to be copied with (p->reps[0]) offset + = kMatchSpecLenStart : the LZMA stream was finished with end mark + + SZ_ERROR_DATA - error, when the MATCH-Symbol refers out of dictionary + p->remainLen : undefined + p->reps[*] : undefined */ -#ifdef _LZMA_DEC_OPT +#ifdef Z7_LZMA_DEC_OPT -int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit); +int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit); #else static -int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit) +int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit) { CLzmaProb *probs = GET_PROBS; unsigned state = (unsigned)p->state; @@ -247,7 +263,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit IF_BIT_0(prob) { unsigned symbol; - UPDATE_0(prob); + UPDATE_0(prob) prob = probs + Literal; if (processedPos != 0 || checkDicSize != 0) prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc); @@ -257,7 +273,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit { state -= (state < 4) ? state : 3; symbol = 1; - #ifdef _LZMA_SIZE_OPT + #ifdef Z7_LZMA_SIZE_OPT do { NORMAL_LITER_DEC } while (symbol < 0x100); #else NORMAL_LITER_DEC @@ -276,7 +292,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit unsigned offs = 0x100; state -= (state < 10) ? 3 : 6; symbol = 1; - #ifdef _LZMA_SIZE_OPT + #ifdef Z7_LZMA_SIZE_OPT do { unsigned bit; @@ -305,60 +321,62 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit } { - UPDATE_1(prob); + UPDATE_1(prob) prob = probs + IsRep + state; IF_BIT_0(prob) { - UPDATE_0(prob); + UPDATE_0(prob) state += kNumStates; prob = probs + LenCoder; } else { - UPDATE_1(prob); - /* - // that case was checked before with kBadRepCode - if (checkDicSize == 0 && processedPos == 0) - return SZ_ERROR_DATA; - */ + UPDATE_1(prob) prob = probs + IsRepG0 + state; IF_BIT_0(prob) { - UPDATE_0(prob); + UPDATE_0(prob) prob = probs + IsRep0Long + COMBINED_PS_STATE; IF_BIT_0(prob) { - UPDATE_0(prob); + UPDATE_0(prob) + + // that case was checked before with kBadRepCode + // if (checkDicSize == 0 && processedPos == 0) { len = kMatchSpecLen_Error_Data + 1; break; } + // The caller doesn't allow (dicPos == limit) case here + // so we don't need the following check: + // if (dicPos == limit) { state = state < kNumLitStates ? 
9 : 11; len = 1; break; } + dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; dicPos++; processedPos++; state = state < kNumLitStates ? 9 : 11; continue; } - UPDATE_1(prob); + UPDATE_1(prob) } else { UInt32 distance; - UPDATE_1(prob); + UPDATE_1(prob) prob = probs + IsRepG1 + state; IF_BIT_0(prob) { - UPDATE_0(prob); + UPDATE_0(prob) distance = rep1; } else { - UPDATE_1(prob); + UPDATE_1(prob) prob = probs + IsRepG2 + state; IF_BIT_0(prob) { - UPDATE_0(prob); + UPDATE_0(prob) distance = rep2; } else { - UPDATE_1(prob); + UPDATE_1(prob) distance = rep3; rep3 = rep2; } @@ -371,37 +389,37 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit prob = probs + RepLenCoder; } - #ifdef _LZMA_SIZE_OPT + #ifdef Z7_LZMA_SIZE_OPT { unsigned lim, offset; CLzmaProb *probLen = prob + LenChoice; IF_BIT_0(probLen) { - UPDATE_0(probLen); + UPDATE_0(probLen) probLen = prob + LenLow + GET_LEN_STATE; offset = 0; lim = (1 << kLenNumLowBits); } else { - UPDATE_1(probLen); + UPDATE_1(probLen) probLen = prob + LenChoice2; IF_BIT_0(probLen) { - UPDATE_0(probLen); + UPDATE_0(probLen) probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits); offset = kLenNumLowSymbols; lim = (1 << kLenNumLowBits); } else { - UPDATE_1(probLen); + UPDATE_1(probLen) probLen = prob + LenHigh; offset = kLenNumLowSymbols * 2; lim = (1 << kLenNumHighBits); } } - TREE_DECODE(probLen, lim, len); + TREE_DECODE(probLen, lim, len) len += offset; } #else @@ -409,32 +427,32 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit CLzmaProb *probLen = prob + LenChoice; IF_BIT_0(probLen) { - UPDATE_0(probLen); + UPDATE_0(probLen) probLen = prob + LenLow + GET_LEN_STATE; len = 1; - TREE_GET_BIT(probLen, len); - TREE_GET_BIT(probLen, len); - TREE_GET_BIT(probLen, len); + TREE_GET_BIT(probLen, len) + TREE_GET_BIT(probLen, len) + TREE_GET_BIT(probLen, len) len -= 8; } else { - UPDATE_1(probLen); + UPDATE_1(probLen) probLen = prob + LenChoice2; IF_BIT_0(probLen) { - UPDATE_0(probLen); + UPDATE_0(probLen) probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits); len = 1; - TREE_GET_BIT(probLen, len); - TREE_GET_BIT(probLen, len); - TREE_GET_BIT(probLen, len); + TREE_GET_BIT(probLen, len) + TREE_GET_BIT(probLen, len) + TREE_GET_BIT(probLen, len) } else { - UPDATE_1(probLen); + UPDATE_1(probLen) probLen = prob + LenHigh; - TREE_DECODE(probLen, (1 << kLenNumHighBits), len); + TREE_DECODE(probLen, (1 << kLenNumHighBits), len) len += kLenNumLowSymbols * 2; } } @@ -446,7 +464,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit UInt32 distance; prob = probs + PosSlot + ((len < kNumLenToPosStates ? 
len : kNumLenToPosStates - 1) << kNumPosSlotBits); - TREE_6_DECODE(prob, distance); + TREE_6_DECODE(prob, distance) if (distance >= kStartPosModelIndex) { unsigned posSlot = (unsigned)distance; @@ -461,7 +479,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit distance++; do { - REV_BIT_VAR(prob, distance, m); + REV_BIT_VAR(prob, distance, m) } while (--numDirectBits); distance -= m; @@ -496,10 +514,10 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit distance <<= kNumAlignBits; { unsigned i = 1; - REV_BIT_CONST(prob, i, 1); - REV_BIT_CONST(prob, i, 2); - REV_BIT_CONST(prob, i, 4); - REV_BIT_LAST (prob, i, 8); + REV_BIT_CONST(prob, i, 1) + REV_BIT_CONST(prob, i, 2) + REV_BIT_CONST(prob, i, 4) + REV_BIT_LAST (prob, i, 8) distance |= i; } if (distance == (UInt32)0xFFFFFFFF) @@ -518,8 +536,10 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize)) { - p->dicPos = dicPos; - return SZ_ERROR_DATA; + len += kMatchSpecLen_Error_Data + kMatchMinLen; + // len = kMatchSpecLen_Error_Data; + // len += kMatchMinLen; + break; } } @@ -532,8 +552,13 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit if ((rem = limit - dicPos) == 0) { - p->dicPos = dicPos; - return SZ_ERROR_DATA; + /* + We stop decoding and return SZ_OK, and we can resume decoding later. + Any error conditions can be tested later in caller code. + For more strict mode we can stop decoding with error + // len += kMatchSpecLen_Error_Data; + */ + break; } curLen = ((rem < len) ? (unsigned)rem : len); @@ -567,12 +592,12 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit } while (dicPos < limit && buf < bufLimit); - NORMALIZE; + NORMALIZE p->buf = buf; p->range = range; p->code = code; - p->remainLen = (UInt32)len; + p->remainLen = (UInt32)len; // & (kMatchSpecLen_Error_Data - 1); // we can write real length for error matches too. 
p->dicPos = dicPos; p->processedPos = processedPos; p->reps[0] = rep0; @@ -580,40 +605,61 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit p->reps[2] = rep2; p->reps[3] = rep3; p->state = (UInt32)state; - + if (len >= kMatchSpecLen_Error_Data) + return SZ_ERROR_DATA; return SZ_OK; } #endif -static void MY_FAST_CALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit) + + +static void Z7_FASTCALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit) { - if (p->remainLen != 0 && p->remainLen < kMatchSpecLenStart) + unsigned len = (unsigned)p->remainLen; + if (len == 0 /* || len >= kMatchSpecLenStart */) + return; { - Byte *dic = p->dic; SizeT dicPos = p->dicPos; - SizeT dicBufSize = p->dicBufSize; - unsigned len = (unsigned)p->remainLen; - SizeT rep0 = p->reps[0]; /* we use SizeT to avoid the BUG of VC14 for AMD64 */ - SizeT rem = limit - dicPos; - if (rem < len) - len = (unsigned)(rem); + Byte *dic; + SizeT dicBufSize; + SizeT rep0; /* we use SizeT to avoid the BUG of VC14 for AMD64 */ + { + SizeT rem = limit - dicPos; + if (rem < len) + { + len = (unsigned)(rem); + if (len == 0) + return; + } + } if (p->checkDicSize == 0 && p->prop.dicSize - p->processedPos <= len) p->checkDicSize = p->prop.dicSize; p->processedPos += (UInt32)len; p->remainLen -= (UInt32)len; - while (len != 0) + dic = p->dic; + rep0 = p->reps[0]; + dicBufSize = p->dicBufSize; + do { - len--; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; dicPos++; } + while (--len); p->dicPos = dicPos; } } +/* +At staring of new stream we have one of the following symbols: + - Literal - is allowed + - Non-Rep-Match - is allowed only if it's end marker symbol + - Rep-Match - is not allowed +We use early check of (RangeCoder:Code) over kBadRepCode to simplify main decoding code +*/ + #define kRange0 0xFFFFFFFF #define kBound0 ((kRange0 >> kNumBitModelTotalBits) << (kNumBitModelTotalBits - 1)) #define kBadRepCode (kBound0 + (((kRange0 - kBound0) >> kNumBitModelTotalBits) << (kNumBitModelTotalBits - 1))) @@ -621,69 +667,77 @@ static void MY_FAST_CALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit) #error Stop_Compiling_Bad_LZMA_Check #endif -static int MY_FAST_CALL LzmaDec_DecodeReal2(CLzmaDec *p, SizeT limit, const Byte *bufLimit) -{ - do - { - SizeT limit2 = limit; - if (p->checkDicSize == 0) - { - UInt32 rem = p->prop.dicSize - p->processedPos; - if (limit - p->dicPos > rem) - limit2 = p->dicPos + rem; - if (p->processedPos == 0) - if (p->code >= kBadRepCode) - return SZ_ERROR_DATA; - } +/* +LzmaDec_DecodeReal2(): + It calls LZMA_DECODE_REAL() and it adjusts limit according (p->checkDicSize). 
- RINOK(LZMA_DECODE_REAL(p, limit2, bufLimit)); - +We correct (p->checkDicSize) after LZMA_DECODE_REAL() and in LzmaDec_WriteRem(), +and we support the following state of (p->checkDicSize): + if (total_processed < p->prop.dicSize) then + { + (total_processed == p->processedPos) + (p->checkDicSize == 0) + } + else + (p->checkDicSize == p->prop.dicSize) +*/ + +static int Z7_FASTCALL LzmaDec_DecodeReal2(CLzmaDec *p, SizeT limit, const Byte *bufLimit) +{ + if (p->checkDicSize == 0) + { + UInt32 rem = p->prop.dicSize - p->processedPos; + if (limit - p->dicPos > rem) + limit = p->dicPos + rem; + } + { + int res = LZMA_DECODE_REAL(p, limit, bufLimit); if (p->checkDicSize == 0 && p->processedPos >= p->prop.dicSize) p->checkDicSize = p->prop.dicSize; - - LzmaDec_WriteRem(p, limit); + return res; } - while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart); - - return 0; } + + typedef enum { - DUMMY_ERROR, /* unexpected end of input stream */ + DUMMY_INPUT_EOF, /* need more input data */ DUMMY_LIT, DUMMY_MATCH, DUMMY_REP } ELzmaDummy; -static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inSize) + +#define IS_DUMMY_END_MARKER_POSSIBLE(dummyRes) ((dummyRes) == DUMMY_MATCH) + +static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, const Byte **bufOut) { UInt32 range = p->range; UInt32 code = p->code; - const Byte *bufLimit = buf + inSize; + const Byte *bufLimit = *bufOut; const CLzmaProb *probs = GET_PROBS; unsigned state = (unsigned)p->state; ELzmaDummy res; + for (;;) { const CLzmaProb *prob; UInt32 bound; unsigned ttt; - unsigned posState = CALC_POS_STATE(p->processedPos, (1 << p->prop.pb) - 1); + unsigned posState = CALC_POS_STATE(p->processedPos, ((unsigned)1 << p->prop.pb) - 1); prob = probs + IsMatch + COMBINED_PS_STATE; IF_BIT_0_CHECK(prob) { UPDATE_0_CHECK - /* if (bufLimit - buf >= 7) return DUMMY_LIT; */ - prob = probs + Literal; if (p->checkDicSize != 0 || p->processedPos != 0) prob += ((UInt32)LZMA_LIT_SIZE * - ((((p->processedPos) & ((1 << (p->prop.lp)) - 1)) << p->prop.lc) + - (p->dic[(p->dicPos == 0 ? p->dicBufSize : p->dicPos) - 1] >> (8 - p->prop.lc)))); + ((((p->processedPos) & (((unsigned)1 << (p->prop.lp)) - 1)) << p->prop.lc) + + ((unsigned)p->dic[(p->dicPos == 0 ? 
p->dicBufSize : p->dicPos) - 1] >> (8 - p->prop.lc)))); if (state < kNumLitStates) { @@ -713,55 +767,54 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS else { unsigned len; - UPDATE_1_CHECK; + UPDATE_1_CHECK prob = probs + IsRep + state; IF_BIT_0_CHECK(prob) { - UPDATE_0_CHECK; + UPDATE_0_CHECK state = 0; prob = probs + LenCoder; res = DUMMY_MATCH; } else { - UPDATE_1_CHECK; + UPDATE_1_CHECK res = DUMMY_REP; prob = probs + IsRepG0 + state; IF_BIT_0_CHECK(prob) { - UPDATE_0_CHECK; + UPDATE_0_CHECK prob = probs + IsRep0Long + COMBINED_PS_STATE; IF_BIT_0_CHECK(prob) { - UPDATE_0_CHECK; - NORMALIZE_CHECK; - return DUMMY_REP; + UPDATE_0_CHECK + break; } else { - UPDATE_1_CHECK; + UPDATE_1_CHECK } } else { - UPDATE_1_CHECK; + UPDATE_1_CHECK prob = probs + IsRepG1 + state; IF_BIT_0_CHECK(prob) { - UPDATE_0_CHECK; + UPDATE_0_CHECK } else { - UPDATE_1_CHECK; + UPDATE_1_CHECK prob = probs + IsRepG2 + state; IF_BIT_0_CHECK(prob) { - UPDATE_0_CHECK; + UPDATE_0_CHECK } else { - UPDATE_1_CHECK; + UPDATE_1_CHECK } } } @@ -773,31 +826,31 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS const CLzmaProb *probLen = prob + LenChoice; IF_BIT_0_CHECK(probLen) { - UPDATE_0_CHECK; + UPDATE_0_CHECK probLen = prob + LenLow + GET_LEN_STATE; offset = 0; limit = 1 << kLenNumLowBits; } else { - UPDATE_1_CHECK; + UPDATE_1_CHECK probLen = prob + LenChoice2; IF_BIT_0_CHECK(probLen) { - UPDATE_0_CHECK; + UPDATE_0_CHECK probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits); offset = kLenNumLowSymbols; limit = 1 << kLenNumLowBits; } else { - UPDATE_1_CHECK; + UPDATE_1_CHECK probLen = prob + LenHigh; offset = kLenNumLowSymbols * 2; limit = 1 << kLenNumHighBits; } } - TREE_DECODE_CHECK(probLen, limit, len); + TREE_DECODE_CHECK(probLen, limit, len) len += offset; } @@ -807,13 +860,11 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS prob = probs + PosSlot + ((len < kNumLenToPosStates - 1 ? len : kNumLenToPosStates - 1) << kNumPosSlotBits); - TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); + TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot) if (posSlot >= kStartPosModelIndex) { unsigned numDirectBits = ((posSlot >> 1) - 1); - /* if (bufLimit - buf >= 8) return DUMMY_MATCH; */ - if (posSlot < kEndPosModelIndex) { prob = probs + SpecPos + ((2 | (posSlot & 1)) << numDirectBits); @@ -837,19 +888,22 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, SizeT inS unsigned m = 1; do { - REV_BIT_CHECK(prob, i, m); + REV_BIT_CHECK(prob, i, m) } while (--numDirectBits); } } } } + break; } - NORMALIZE_CHECK; + NORMALIZE_CHECK + + *bufOut = buf; return res; } - +void LzmaDec_InitDicAndState(CLzmaDec *p, BoolInt initDic, BoolInt initState); void LzmaDec_InitDicAndState(CLzmaDec *p, BoolInt initDic, BoolInt initState) { p->remainLen = kMatchSpecLenStart + 1; @@ -872,16 +926,41 @@ void LzmaDec_Init(CLzmaDec *p) } +/* +LZMA supports optional end_marker. +So the decoder can lookahead for one additional LZMA-Symbol to check end_marker. +That additional LZMA-Symbol can require up to LZMA_REQUIRED_INPUT_MAX bytes in input stream. 
+When the decoder reaches dicLimit, it looks (finishMode) parameter: + if (finishMode == LZMA_FINISH_ANY), the decoder doesn't lookahead + if (finishMode != LZMA_FINISH_ANY), the decoder lookahead, if end_marker is possible for current position + +When the decoder lookahead, and the lookahead symbol is not end_marker, we have two ways: + 1) Strict mode (default) : the decoder returns SZ_ERROR_DATA. + 2) The relaxed mode (alternative mode) : we could return SZ_OK, and the caller + must check (status) value. The caller can show the error, + if the end of stream is expected, and the (status) is noit + LZMA_STATUS_FINISHED_WITH_MARK or LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK. +*/ + + +#define RETURN_NOT_FINISHED_FOR_FINISH \ + *status = LZMA_STATUS_NOT_FINISHED; \ + return SZ_ERROR_DATA; // for strict mode + // return SZ_OK; // for relaxed mode + + SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status) { SizeT inSize = *srcLen; (*srcLen) = 0; - *status = LZMA_STATUS_NOT_SPECIFIED; if (p->remainLen > kMatchSpecLenStart) { + if (p->remainLen > kMatchSpecLenStart + 2) + return p->remainLen == kMatchSpecLen_Error_Fail ? SZ_ERROR_FAIL : SZ_ERROR_DATA; + for (; inSize > 0 && p->tempBufSize < RC_INIT_SIZE; (*srcLen)++, inSize--) p->tempBuf[p->tempBufSize++] = *src++; if (p->tempBufSize != 0 && p->tempBuf[0] != 0) @@ -896,6 +975,12 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr | ((UInt32)p->tempBuf[2] << 16) | ((UInt32)p->tempBuf[3] << 8) | ((UInt32)p->tempBuf[4]); + + if (p->checkDicSize == 0 + && p->processedPos == 0 + && p->code >= kBadRepCode) + return SZ_ERROR_DATA; + p->range = 0xFFFFFFFF; p->tempBufSize = 0; @@ -913,10 +998,21 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr p->remainLen = 0; } - LzmaDec_WriteRem(p, dicLimit); - - while (p->remainLen != kMatchSpecLenStart) + for (;;) { + if (p->remainLen == kMatchSpecLenStart) + { + if (p->code != 0) + return SZ_ERROR_DATA; + *status = LZMA_STATUS_FINISHED_WITH_MARK; + return SZ_OK; + } + + LzmaDec_WriteRem(p, dicLimit); + + { + // (p->remainLen == 0 || p->dicPos == dicLimit) + int checkEndMarkNow = 0; if (p->dicPos >= dicLimit) @@ -933,92 +1029,174 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr } if (p->remainLen != 0) { - *status = LZMA_STATUS_NOT_FINISHED; - return SZ_ERROR_DATA; + RETURN_NOT_FINISHED_FOR_FINISH } checkEndMarkNow = 1; } + // (p->remainLen == 0) + if (p->tempBufSize == 0) { - SizeT processed; const Byte *bufLimit; + int dummyProcessed = -1; + if (inSize < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) { - int dummyRes = LzmaDec_TryDummy(p, src, inSize); - if (dummyRes == DUMMY_ERROR) + const Byte *bufOut = src + inSize; + + ELzmaDummy dummyRes = LzmaDec_TryDummy(p, src, &bufOut); + + if (dummyRes == DUMMY_INPUT_EOF) { - memcpy(p->tempBuf, src, inSize); - p->tempBufSize = (unsigned)inSize; + size_t i; + if (inSize >= LZMA_REQUIRED_INPUT_MAX) + break; (*srcLen) += inSize; + p->tempBufSize = (unsigned)inSize; + for (i = 0; i < inSize; i++) + p->tempBuf[i] = src[i]; *status = LZMA_STATUS_NEEDS_MORE_INPUT; return SZ_OK; } - if (checkEndMarkNow && dummyRes != DUMMY_MATCH) + + dummyProcessed = (int)(bufOut - src); + if ((unsigned)dummyProcessed > LZMA_REQUIRED_INPUT_MAX) + break; + + if (checkEndMarkNow && !IS_DUMMY_END_MARKER_POSSIBLE(dummyRes)) { - *status = LZMA_STATUS_NOT_FINISHED; - return SZ_ERROR_DATA; + unsigned i; + (*srcLen) += 
(unsigned)dummyProcessed; + p->tempBufSize = (unsigned)dummyProcessed; + for (i = 0; i < (unsigned)dummyProcessed; i++) + p->tempBuf[i] = src[i]; + // p->remainLen = kMatchSpecLen_Error_Data; + RETURN_NOT_FINISHED_FOR_FINISH } + bufLimit = src; + // we will decode only one iteration } else bufLimit = src + inSize - LZMA_REQUIRED_INPUT_MAX; + p->buf = src; - if (LzmaDec_DecodeReal2(p, dicLimit, bufLimit) != 0) - return SZ_ERROR_DATA; - processed = (SizeT)(p->buf - src); - (*srcLen) += processed; - src += processed; - inSize -= processed; + + { + int res = LzmaDec_DecodeReal2(p, dicLimit, bufLimit); + + SizeT processed = (SizeT)(p->buf - src); + + if (dummyProcessed < 0) + { + if (processed > inSize) + break; + } + else if ((unsigned)dummyProcessed != processed) + break; + + src += processed; + inSize -= processed; + (*srcLen) += processed; + + if (res != SZ_OK) + { + p->remainLen = kMatchSpecLen_Error_Data; + return SZ_ERROR_DATA; + } + } + continue; } - else + { - unsigned rem = p->tempBufSize, lookAhead = 0; - while (rem < LZMA_REQUIRED_INPUT_MAX && lookAhead < inSize) - p->tempBuf[rem++] = src[lookAhead++]; - p->tempBufSize = rem; + // we have some data in (p->tempBuf) + // in strict mode: tempBufSize is not enough for one Symbol decoding. + // in relaxed mode: tempBufSize not larger than required for one Symbol decoding. + + unsigned rem = p->tempBufSize; + unsigned ahead = 0; + int dummyProcessed = -1; + + while (rem < LZMA_REQUIRED_INPUT_MAX && ahead < inSize) + p->tempBuf[rem++] = src[ahead++]; + + // ahead - the size of new data copied from (src) to (p->tempBuf) + // rem - the size of temp buffer including new data from (src) + if (rem < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) { - int dummyRes = LzmaDec_TryDummy(p, p->tempBuf, (SizeT)rem); - if (dummyRes == DUMMY_ERROR) + const Byte *bufOut = p->tempBuf + rem; + + ELzmaDummy dummyRes = LzmaDec_TryDummy(p, p->tempBuf, &bufOut); + + if (dummyRes == DUMMY_INPUT_EOF) { - (*srcLen) += (SizeT)lookAhead; + if (rem >= LZMA_REQUIRED_INPUT_MAX) + break; + p->tempBufSize = rem; + (*srcLen) += (SizeT)ahead; *status = LZMA_STATUS_NEEDS_MORE_INPUT; return SZ_OK; } - if (checkEndMarkNow && dummyRes != DUMMY_MATCH) + + dummyProcessed = (int)(bufOut - p->tempBuf); + + if ((unsigned)dummyProcessed < p->tempBufSize) + break; + + if (checkEndMarkNow && !IS_DUMMY_END_MARKER_POSSIBLE(dummyRes)) { - *status = LZMA_STATUS_NOT_FINISHED; - return SZ_ERROR_DATA; + (*srcLen) += (unsigned)dummyProcessed - p->tempBufSize; + p->tempBufSize = (unsigned)dummyProcessed; + // p->remainLen = kMatchSpecLen_Error_Data; + RETURN_NOT_FINISHED_FOR_FINISH } } + p->buf = p->tempBuf; - if (LzmaDec_DecodeReal2(p, dicLimit, p->buf) != 0) - return SZ_ERROR_DATA; { - unsigned kkk = (unsigned)(p->buf - p->tempBuf); - if (rem < kkk) - return SZ_ERROR_FAIL; /* some internal error */ - rem -= kkk; - if (lookAhead < rem) - return SZ_ERROR_FAIL; /* some internal error */ - lookAhead -= rem; + // we decode one symbol from (p->tempBuf) here, so the (bufLimit) is equal to (p->buf) + int res = LzmaDec_DecodeReal2(p, dicLimit, p->buf); + + SizeT processed = (SizeT)(p->buf - p->tempBuf); + rem = p->tempBufSize; + + if (dummyProcessed < 0) + { + if (processed > LZMA_REQUIRED_INPUT_MAX) + break; + if (processed < rem) + break; + } + else if ((unsigned)dummyProcessed != processed) + break; + + processed -= rem; + + src += processed; + inSize -= processed; + (*srcLen) += processed; + p->tempBufSize = 0; + + if (res != SZ_OK) + { + p->remainLen = kMatchSpecLen_Error_Data; + return 
SZ_ERROR_DATA; + } } - (*srcLen) += (SizeT)lookAhead; - src += lookAhead; - inSize -= (SizeT)lookAhead; - p->tempBufSize = 0; } + } } - - if (p->code != 0) - return SZ_ERROR_DATA; - *status = LZMA_STATUS_FINISHED_WITH_MARK; - return SZ_OK; + + /* Some unexpected error: internal error of code, memory corruption or hardware failure */ + p->remainLen = kMatchSpecLen_Error_Fail; + return SZ_ERROR_FAIL; } + SRes LzmaDec_DecodeToBuf(CLzmaDec *p, Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status) { SizeT outSize = *destLen; @@ -1121,8 +1299,8 @@ static SRes LzmaDec_AllocateProbs2(CLzmaDec *p, const CLzmaProps *propNew, ISzAl SRes LzmaDec_AllocateProbs(CLzmaDec *p, const Byte *props, unsigned propsSize, ISzAllocPtr alloc) { CLzmaProps propNew; - RINOK(LzmaProps_Decode(&propNew, props, propsSize)); - RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); + RINOK(LzmaProps_Decode(&propNew, props, propsSize)) + RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)) p->prop = propNew; return SZ_OK; } @@ -1131,14 +1309,14 @@ SRes LzmaDec_Allocate(CLzmaDec *p, const Byte *props, unsigned propsSize, ISzAll { CLzmaProps propNew; SizeT dicBufSize; - RINOK(LzmaProps_Decode(&propNew, props, propsSize)); - RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); + RINOK(LzmaProps_Decode(&propNew, props, propsSize)) + RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)) { UInt32 dictSize = propNew.dicSize; SizeT mask = ((UInt32)1 << 12) - 1; if (dictSize >= ((UInt32)1 << 30)) mask = ((UInt32)1 << 22) - 1; - else if (dictSize >= ((UInt32)1 << 22)) mask = ((UInt32)1 << 20) - 1;; + else if (dictSize >= ((UInt32)1 << 22)) mask = ((UInt32)1 << 20) - 1; dicBufSize = ((SizeT)dictSize + mask) & ~mask; if (dicBufSize < dictSize) dicBufSize = dictSize; @@ -1170,8 +1348,8 @@ SRes LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, *status = LZMA_STATUS_NOT_SPECIFIED; if (inSize < RC_INIT_SIZE) return SZ_ERROR_INPUT_EOF; - LzmaDec_Construct(&p); - RINOK(LzmaDec_AllocateProbs(&p, propData, propSize, alloc)); + LzmaDec_CONSTRUCT(&p) + RINOK(LzmaDec_AllocateProbs(&p, propData, propSize, alloc)) p.dic = dest; p.dicBufSize = outSize; LzmaDec_Init(&p); diff --git a/src/sdk/C/LzmaDec.h b/src/sdk/C/LzmaDec.h index 1f0927a..b0ce28f 100644 --- a/src/sdk/C/LzmaDec.h +++ b/src/sdk/C/LzmaDec.h @@ -1,19 +1,19 @@ /* LzmaDec.h -- LZMA Decoder -2018-04-21 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ -#ifndef __LZMA_DEC_H -#define __LZMA_DEC_H +#ifndef ZIP7_INC_LZMA_DEC_H +#define ZIP7_INC_LZMA_DEC_H #include "7zTypes.h" EXTERN_C_BEGIN -/* #define _LZMA_PROB32 */ -/* _LZMA_PROB32 can increase the speed on some CPUs, +/* #define Z7_LZMA_PROB32 */ +/* Z7_LZMA_PROB32 can increase the speed on some CPUs, but memory usage for CLzmaDec::probs will be doubled in that case */ typedef -#ifdef _LZMA_PROB32 +#ifdef Z7_LZMA_PROB32 UInt32 #else UInt16 @@ -25,7 +25,7 @@ typedef #define LZMA_PROPS_SIZE 5 -typedef struct _CLzmaProps +typedef struct { Byte lc; Byte lp; @@ -73,7 +73,8 @@ typedef struct Byte tempBuf[LZMA_REQUIRED_INPUT_MAX]; } CLzmaDec; -#define LzmaDec_Construct(p) { (p)->dic = NULL; (p)->probs = NULL; } +#define LzmaDec_CONSTRUCT(p) { (p)->dic = NULL; (p)->probs = NULL; } +#define LzmaDec_Construct(p) LzmaDec_CONSTRUCT(p) void LzmaDec_Init(CLzmaDec *p); @@ -181,6 +182,7 @@ void LzmaDec_Free(CLzmaDec *p, ISzAllocPtr alloc); LZMA_STATUS_NEEDS_MORE_INPUT LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK SZ_ERROR_DATA - Data error + SZ_ERROR_FAIL - Some 
unexpected error: internal error of code, memory corruption or hardware failure */ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, @@ -223,6 +225,7 @@ SRes LzmaDec_DecodeToBuf(CLzmaDec *p, Byte *dest, SizeT *destLen, SZ_ERROR_MEM - Memory allocation error SZ_ERROR_UNSUPPORTED - Unsupported properties SZ_ERROR_INPUT_EOF - It needs more bytes in input buffer (src). + SZ_ERROR_FAIL - Some unexpected error: internal error of code, memory corruption or hardware failure */ SRes LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, diff --git a/src/sdk/C/LzmaEnc.c b/src/sdk/C/LzmaEnc.c index 1159ca2..d6b9e6e 100644 --- a/src/sdk/C/LzmaEnc.c +++ b/src/sdk/C/LzmaEnc.c @@ -1,5 +1,5 @@ /* LzmaEnc.c -- LZMA Encoder -2019-01-10: Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #include "Precomp.h" @@ -12,22 +12,36 @@ #include <stdio.h> #endif +#include "CpuArch.h" #include "LzmaEnc.h" #include "LzFind.h" -#ifndef _7ZIP_ST +#ifndef Z7_ST #include "LzFindMt.h" #endif +/* the following LzmaEnc_* declarations is internal LZMA interface for LZMA2 encoder */ + +SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle p, ISeqInStreamPtr inStream, UInt32 keepWindowSize, + ISzAllocPtr alloc, ISzAllocPtr allocBig); +SRes LzmaEnc_MemPrepare(CLzmaEncHandle p, const Byte *src, SizeT srcLen, + UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig); +SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle p, BoolInt reInit, + Byte *dest, size_t *destLen, UInt32 desiredPackSize, UInt32 *unpackSize); +const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle p); +void LzmaEnc_Finish(CLzmaEncHandle p); +void LzmaEnc_SaveState(CLzmaEncHandle p); +void LzmaEnc_RestoreState(CLzmaEncHandle p); + #ifdef SHOW_STAT static unsigned g_STAT_OFFSET = 0; #endif -#define kLzmaMaxHistorySize ((UInt32)3 << 29) -/* #define kLzmaMaxHistorySize ((UInt32)7 << 29) */ +/* for good normalization speed we still reserve 256 MB before 4 GB range */ +#define kLzmaMaxHistorySize ((UInt32)15 << 28) -#define kNumTopBits 24 -#define kTopValue ((UInt32)1 << kNumTopBits) +// #define kNumTopBits 24 +#define kTopValue ((UInt32)1 << 24) #define kNumBitModelTotalBits 11 #define kBitModelTotal (1 << kNumBitModelTotalBits) @@ -36,7 +50,7 @@ static unsigned g_STAT_OFFSET = 0; #define kNumMoveReducingBits 4 #define kNumBitPriceShiftBits 4 -#define kBitPrice (1 << kNumBitPriceShiftBits) +// #define kBitPrice (1 << kNumBitPriceShiftBits) #define REP_LEN_COUNT 64 @@ -46,7 +60,11 @@ void LzmaEncProps_Init(CLzmaEncProps *p) { p->dictSize = p->mc = 0; p->reduceSize = (UInt64)(Int64)-1; p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1; + p->numHashOutBits = 0; p->writeEndMark = 0; + p->affinityGroup = -1; + p->affinity = 0; + p->affinityInGroup = 0; } void LzmaEncProps_Normalize(CLzmaEncProps *p) @@ -55,31 +73,36 @@ if (level < 0) level = 5; p->level = level; - if (p->dictSize == 0) p->dictSize = (level <= 5 ? (1 << (level * 2 + 14)) : (level <= 7 ? (1 << 25) : (1 << 26))); + if (p->dictSize == 0) + p->dictSize = (unsigned)level <= 4 ? + (UInt32)1 << (level * 2 + 16) : + (unsigned)level <= sizeof(size_t) / 2 + 4 ?
+ (UInt32)1 << (level + 20) : + (UInt32)1 << (sizeof(size_t) / 2 + 24); + if (p->dictSize > p->reduceSize) { - unsigned i; - UInt32 reduceSize = (UInt32)p->reduceSize; - for (i = 11; i <= 30; i++) - { - if (reduceSize <= ((UInt32)2 << i)) { p->dictSize = ((UInt32)2 << i); break; } - if (reduceSize <= ((UInt32)3 << i)) { p->dictSize = ((UInt32)3 << i); break; } - } + UInt32 v = (UInt32)p->reduceSize; + const UInt32 kReduceMin = ((UInt32)1 << 12); + if (v < kReduceMin) + v = kReduceMin; + if (p->dictSize > v) + p->dictSize = v; } if (p->lc < 0) p->lc = 3; if (p->lp < 0) p->lp = 0; if (p->pb < 0) p->pb = 2; - if (p->algo < 0) p->algo = (level < 5 ? 0 : 1); - if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); + if (p->algo < 0) p->algo = (unsigned)level < 5 ? 0 : 1; + if (p->fb < 0) p->fb = (unsigned)level < 7 ? 32 : 64; if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); - if (p->numHashBytes < 0) p->numHashBytes = 4; - if (p->mc == 0) p->mc = (16 + (p->fb >> 1)) >> (p->btMode ? 0 : 1); + if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5); + if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1); if (p->numThreads < 0) p->numThreads = - #ifndef _7ZIP_ST + #ifndef Z7_ST ((p->btMode && p->algo) ? 2 : 1); #else 1; @@ -93,30 +116,97 @@ UInt32 LzmaEncProps_GetDictSize(const CLzmaEncProps *props2) return props.dictSize; } -#if (_MSC_VER >= 1400) -/* BSR code is fast for some new CPUs */ -/* #define LZMA_LOG_BSR */ + +/* +x86/x64: + +BSR: + IF (SRC == 0) ZF = 1, DEST is undefined; + AMD : DEST is unchanged; + IF (SRC != 0) ZF = 0; DEST is index of top non-zero bit + BSR is slow in some processors + +LZCNT: + IF (SRC == 0) CF = 1, DEST is size_in_bits_of_register(src) (32 or 64) + IF (SRC != 0) CF = 0, DEST = num_lead_zero_bits + IF (DEST == 0) ZF = 1; + +LZCNT works only in new processors starting from Haswell. +if LZCNT is not supported by processor, then it's executed as BSR. +LZCNT can be faster than BSR, if supported. 
+*/ + +// #define LZMA_LOG_BSR + +#if defined(MY_CPU_ARM_OR_ARM64) /* || defined(MY_CPU_X86_OR_AMD64) */ + + #if (defined(__clang__) && (__clang_major__ >= 6)) \ + || (defined(__GNUC__) && (__GNUC__ >= 6)) + #define LZMA_LOG_BSR + #elif defined(_MSC_VER) && (_MSC_VER >= 1300) + // #if defined(MY_CPU_ARM_OR_ARM64) + #define LZMA_LOG_BSR + // #endif + #endif #endif +// #include + #ifdef LZMA_LOG_BSR -#define kDicLogSizeMaxCompress 32 +#if defined(__clang__) \ + || defined(__GNUC__) + +/* + C code: : (30 - __builtin_clz(x)) + gcc9/gcc10 for x64 /x86 : 30 - (bsr(x) xor 31) + clang10 for x64 : 31 + (bsr(x) xor -32) +*/ + + #define MY_clz(x) ((unsigned)__builtin_clz(x)) + // __lzcnt32 + // __builtin_ia32_lzcnt_u32 -#define BSR2_RET(pos, res) { unsigned long zz; _BitScanReverse(&zz, (pos)); res = (zz + zz) + ((pos >> (zz - 1)) & 1); } +#else // #if defined(_MSC_VER) -static unsigned GetPosSlot1(UInt32 pos) + #ifdef MY_CPU_ARM_OR_ARM64 + + #define MY_clz _CountLeadingZeros + + #else // if defined(MY_CPU_X86_OR_AMD64) + + // #define MY_clz __lzcnt // we can use lzcnt (unsupported by old CPU) + // _BitScanReverse code is not optimal for some MSVC compilers + #define BSR2_RET(pos, res) { unsigned long zz; _BitScanReverse(&zz, (pos)); zz--; \ + res = (zz + zz) + (pos >> zz); } + + #endif // MY_CPU_X86_OR_AMD64 + +#endif // _MSC_VER + + +#ifndef BSR2_RET + + #define BSR2_RET(pos, res) { unsigned zz = 30 - MY_clz(pos); \ + res = (zz + zz) + (pos >> zz); } + +#endif + + +unsigned GetPosSlot1(UInt32 pos); +unsigned GetPosSlot1(UInt32 pos) { unsigned res; - BSR2_RET(pos, res); + BSR2_RET(pos, res) return res; } -#define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } -#define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res); } +#define GetPosSlot2(pos, res) { BSR2_RET(pos, res) } +#define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res) } -#else -#define kNumLogBits (9 + sizeof(size_t) / 2) -/* #define kNumLogBits (11 + sizeof(size_t) / 8 * 3) */ +#else // ! 
LZMA_LOG_BSR + +#define kNumLogBits (11 + sizeof(size_t) / 8 * 3) #define kDicLogSizeMaxCompress ((kNumLogBits - 1) * 2 + 7) @@ -163,7 +253,7 @@ static void LzmaEnc_FastPosInit(Byte *g_FastPos) #define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } #define GetPosSlot(pos, res) { if (pos < kNumFullDistances) res = p->g_FastPos[pos & (kNumFullDistances - 1)]; else BSR2_RET(pos, res); } -#endif +#endif // LZMA_LOG_BSR #define LZMA_NUM_REPS 4 @@ -193,7 +283,7 @@ typedef struct #define kNumLenToPosStates 4 #define kNumPosSlotBits 6 -#define kDicLogSizeMin 0 +// #define kDicLogSizeMin 0 #define kDicLogSizeMax 32 #define kDistTableSizeMax (kDicLogSizeMax * 2) @@ -206,7 +296,7 @@ typedef struct #define kNumFullDistances (1 << (kEndPosModelIndex >> 1)) typedef -#ifdef _LZMA_PROB32 +#ifdef Z7_LZMA_PROB32 UInt32 #else UInt16 @@ -263,7 +353,7 @@ typedef struct Byte *buf; Byte *bufLim; Byte *bufBase; - ISeqOutStream *outStream; + ISeqOutStreamPtr outStream; UInt64 processed; SRes res; } CRangeEnc; @@ -296,10 +386,10 @@ typedef struct typedef UInt32 CProbPrice; -typedef struct +struct CLzmaEnc { void *matchFinderObj; - IMatchFinder matchFinder; + IMatchFinder2 matchFinder; unsigned optCur; unsigned optEnd; @@ -339,24 +429,30 @@ typedef struct UInt32 dictSize; SRes result; - #ifndef _7ZIP_ST + #ifndef Z7_ST BoolInt mtMode; // begin of CMatchFinderMt is used in LZ thread CMatchFinderMt matchFinderMt; // end of CMatchFinderMt is used in BT and HASH threads + // #else + // CMatchFinder matchFinderBase; #endif - CMatchFinder matchFinderBase; - #ifndef _7ZIP_ST + + // we suppose that we have 8-bytes alignment after CMatchFinder + + #ifndef Z7_ST Byte pad[128]; #endif // LZ thread CProbPrice ProbPrices[kBitModelTotal >> kNumMoveReducingBits]; - UInt32 matches[LZMA_MATCH_LEN_MAX * 2 + 2 + 1]; + // we want {len , dist} pairs to be 8-bytes aligned in matches array + UInt32 matches[LZMA_MATCH_LEN_MAX * 2 + 2]; + // we want 8-bytes alignment here UInt32 alignPrices[kAlignTableSize]; UInt32 posSlotPrices[kNumLenToPosStates][kDistTableSizeMax]; UInt32 distancesPrices[kNumLenToPosStates][kNumFullDistances]; @@ -385,113 +481,115 @@ typedef struct CSaveState saveState; - #ifndef _7ZIP_ST + // BoolInt mf_Failure; + #ifndef Z7_ST Byte pad2[128]; #endif -} CLzmaEnc; - +}; -#define COPY_ARR(dest, src, arr) memcpy(dest->arr, src->arr, sizeof(src->arr)); +#define MFB (p->matchFinderBase) +/* +#ifndef Z7_ST +#define MFB (p->matchFinderMt.MatchFinder) +#endif +*/ -void LzmaEnc_SaveState(CLzmaEncHandle pp) -{ - CLzmaEnc *p = (CLzmaEnc *)pp; - CSaveState *dest = &p->saveState; - - dest->state = p->state; - - dest->lenProbs = p->lenProbs; - dest->repLenProbs = p->repLenProbs; - - COPY_ARR(dest, p, reps); - - COPY_ARR(dest, p, posAlignEncoder); - COPY_ARR(dest, p, isRep); - COPY_ARR(dest, p, isRepG0); - COPY_ARR(dest, p, isRepG1); - COPY_ARR(dest, p, isRepG2); - COPY_ARR(dest, p, isMatch); - COPY_ARR(dest, p, isRep0Long); - COPY_ARR(dest, p, posSlotEncoder); - COPY_ARR(dest, p, posEncoders); - - memcpy(dest->litProbs, p->litProbs, ((UInt32)0x300 << p->lclp) * sizeof(CLzmaProb)); +// #define GET_CLzmaEnc_p CLzmaEnc *p = (CLzmaEnc*)(void *)p; +// #define GET_const_CLzmaEnc_p const CLzmaEnc *p = (const CLzmaEnc*)(const void *)p; + +#define COPY_ARR(dest, src, arr) memcpy((dest)->arr, (src)->arr, sizeof((src)->arr)); + +#define COPY_LZMA_ENC_STATE(d, s, p) \ + (d)->state = (s)->state; \ + COPY_ARR(d, s, reps) \ + COPY_ARR(d, s, posAlignEncoder) \ + COPY_ARR(d, s, isRep) \ + COPY_ARR(d, s, isRepG0) \ + COPY_ARR(d, s, isRepG1) \ + 
COPY_ARR(d, s, isRepG2) \ + COPY_ARR(d, s, isMatch) \ + COPY_ARR(d, s, isRep0Long) \ + COPY_ARR(d, s, posSlotEncoder) \ + COPY_ARR(d, s, posEncoders) \ + (d)->lenProbs = (s)->lenProbs; \ + (d)->repLenProbs = (s)->repLenProbs; \ + memcpy((d)->litProbs, (s)->litProbs, ((size_t)0x300 * sizeof(CLzmaProb)) << (p)->lclp); + +void LzmaEnc_SaveState(CLzmaEncHandle p) +{ + // GET_CLzmaEnc_p + CSaveState *v = &p->saveState; + COPY_LZMA_ENC_STATE(v, p, p) } - -void LzmaEnc_RestoreState(CLzmaEncHandle pp) +void LzmaEnc_RestoreState(CLzmaEncHandle p) { - CLzmaEnc *dest = (CLzmaEnc *)pp; - const CSaveState *p = &dest->saveState; - - dest->state = p->state; - - dest->lenProbs = p->lenProbs; - dest->repLenProbs = p->repLenProbs; - - COPY_ARR(dest, p, reps); - - COPY_ARR(dest, p, posAlignEncoder); - COPY_ARR(dest, p, isRep); - COPY_ARR(dest, p, isRepG0); - COPY_ARR(dest, p, isRepG1); - COPY_ARR(dest, p, isRepG2); - COPY_ARR(dest, p, isMatch); - COPY_ARR(dest, p, isRep0Long); - COPY_ARR(dest, p, posSlotEncoder); - COPY_ARR(dest, p, posEncoders); - - memcpy(dest->litProbs, p->litProbs, ((UInt32)0x300 << dest->lclp) * sizeof(CLzmaProb)); + // GET_CLzmaEnc_p + const CSaveState *v = &p->saveState; + COPY_LZMA_ENC_STATE(p, v, p) } - -SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) +Z7_NO_INLINE +SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props2) { - CLzmaEnc *p = (CLzmaEnc *)pp; + // GET_CLzmaEnc_p CLzmaEncProps props = *props2; LzmaEncProps_Normalize(&props); if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX - || props.pb > LZMA_PB_MAX - || props.dictSize > ((UInt64)1 << kDicLogSizeMaxCompress) - || props.dictSize > kLzmaMaxHistorySize) + || props.pb > LZMA_PB_MAX) return SZ_ERROR_PARAM; + + if (props.dictSize > kLzmaMaxHistorySize) + props.dictSize = kLzmaMaxHistorySize; + + #ifndef LZMA_LOG_BSR + { + const UInt64 dict64 = props.dictSize; + if (dict64 > ((UInt64)1 << kDicLogSizeMaxCompress)) + return SZ_ERROR_PARAM; + } + #endif + p->dictSize = props.dictSize; { - unsigned fb = props.fb; + unsigned fb = (unsigned)props.fb; if (fb < 5) fb = 5; if (fb > LZMA_MATCH_LEN_MAX) fb = LZMA_MATCH_LEN_MAX; p->numFastBytes = fb; } - p->lc = props.lc; - p->lp = props.lp; - p->pb = props.pb; + p->lc = (unsigned)props.lc; + p->lp = (unsigned)props.lp; + p->pb = (unsigned)props.pb; p->fastMode = (props.algo == 0); // p->_maxMode = True; - p->matchFinderBase.btMode = (Byte)(props.btMode ? 1 : 0); + MFB.btMode = (Byte)(props.btMode ? 
1 : 0); + // MFB.btMode = (Byte)(props.btMode); { unsigned numHashBytes = 4; if (props.btMode) { - if (props.numHashBytes < 2) - numHashBytes = 2; - else if (props.numHashBytes < 4) - numHashBytes = props.numHashBytes; + if (props.numHashBytes < 2) numHashBytes = 2; + else if (props.numHashBytes < 4) numHashBytes = (unsigned)props.numHashBytes; } - p->matchFinderBase.numHashBytes = numHashBytes; + if (props.numHashBytes >= 5) numHashBytes = 5; + + MFB.numHashBytes = numHashBytes; + // MFB.numHashBytes_Min = 2; + MFB.numHashOutBits = (Byte)props.numHashOutBits; } - p->matchFinderBase.cutValue = props.mc; + MFB.cutValue = props.mc; - p->writeEndMark = props.writeEndMark; + p->writeEndMark = (BoolInt)props.writeEndMark; - #ifndef _7ZIP_ST + #ifndef Z7_ST /* if (newMultiThread != _multiThread) { @@ -500,16 +598,22 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) } */ p->multiThread = (props.numThreads > 1); + p->matchFinderMt.btSync.affinity = + p->matchFinderMt.hashSync.affinity = props.affinity; + p->matchFinderMt.btSync.affinityGroup = + p->matchFinderMt.hashSync.affinityGroup = props.affinityGroup; + p->matchFinderMt.btSync.affinityInGroup = + p->matchFinderMt.hashSync.affinityInGroup = props.affinityInGroup; #endif return SZ_OK; } -void LzmaEnc_SetDataSize(CLzmaEncHandle pp, UInt64 expectedDataSiize) +void LzmaEnc_SetDataSize(CLzmaEncHandle p, UInt64 expectedDataSiize) { - CLzmaEnc *p = (CLzmaEnc *)pp; - p->matchFinderBase.expectedDataSize = expectedDataSiize; + // GET_CLzmaEnc_p + MFB.expectedDataSize = expectedDataSiize; } @@ -536,8 +640,8 @@ static void RangeEnc_Construct(CRangeEnc *p) p->bufBase = NULL; } -#define RangeEnc_GetProcessed(p) ((p)->processed + ((p)->buf - (p)->bufBase) + (p)->cacheSize) -#define RangeEnc_GetProcessed_sizet(p) ((size_t)(p)->processed + ((p)->buf - (p)->bufBase) + (size_t)(p)->cacheSize) +#define RangeEnc_GetProcessed(p) ( (p)->processed + (size_t)((p)->buf - (p)->bufBase) + (p)->cacheSize) +#define RangeEnc_GetProcessed_sizet(p) ((size_t)(p)->processed + (size_t)((p)->buf - (p)->bufBase) + (size_t)(p)->cacheSize) #define RC_BUF_SIZE (1 << 16) @@ -556,12 +660,11 @@ static int RangeEnc_Alloc(CRangeEnc *p, ISzAllocPtr alloc) static void RangeEnc_Free(CRangeEnc *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->bufBase); - p->bufBase = 0; + p->bufBase = NULL; } static void RangeEnc_Init(CRangeEnc *p) { - /* Stream.Init(); */ p->range = 0xFFFFFFFF; p->cache = 0; p->low = 0; @@ -573,19 +676,19 @@ static void RangeEnc_Init(CRangeEnc *p) p->res = SZ_OK; } -MY_NO_INLINE static void RangeEnc_FlushStream(CRangeEnc *p) +Z7_NO_INLINE static void RangeEnc_FlushStream(CRangeEnc *p) { - size_t num; - if (p->res != SZ_OK) - return; - num = p->buf - p->bufBase; - if (num != ISeqOutStream_Write(p->outStream, p->bufBase, num)) - p->res = SZ_ERROR_WRITE; + const size_t num = (size_t)(p->buf - p->bufBase); + if (p->res == SZ_OK) + { + if (num != ISeqOutStream_Write(p->outStream, p->bufBase, num)) + p->res = SZ_ERROR_WRITE; + } p->processed += num; p->buf = p->bufBase; } -MY_NO_INLINE static void MY_FAST_CALL RangeEnc_ShiftLow(CRangeEnc *p) +Z7_NO_INLINE static void Z7_FASTCALL RangeEnc_ShiftLow(CRangeEnc *p) { UInt32 low = (UInt32)p->low; unsigned high = (unsigned)(p->low >> 32); @@ -630,9 +733,9 @@ static void RangeEnc_FlushData(CRangeEnc *p) ttt = *(prob); \ newBound = (range >> kNumBitModelTotalBits) * ttt; -// #define _LZMA_ENC_USE_BRANCH +// #define Z7_LZMA_ENC_USE_BRANCH -#ifdef _LZMA_ENC_USE_BRANCH +#ifdef Z7_LZMA_ENC_USE_BRANCH #define RC_BIT(p, 
prob, bit) { \ RC_BIT_PRE(p, prob) \ @@ -656,7 +759,7 @@ static void RangeEnc_FlushData(CRangeEnc *p) range += newBound & mask; \ mask &= (kBitModelTotal - ((1 << kNumMoveBits) - 1)); \ mask += ((1 << kNumMoveBits) - 1); \ - ttt += (Int32)(mask - ttt) >> kNumMoveBits; \ + ttt += (UInt32)((Int32)(mask - ttt) >> kNumMoveBits); \ *(prob) = (CLzmaProb)ttt; \ RC_NORM(p) \ } @@ -700,7 +803,7 @@ static void LitEnc_Encode(CRangeEnc *p, CLzmaProb *probs, UInt32 sym) CLzmaProb *prob = probs + (sym >> 8); UInt32 bit = (sym >> 7) & 1; sym <<= 1; - RC_BIT(p, prob, bit); + RC_BIT(p, prob, bit) } while (sym < 0x10000); p->range = range; @@ -722,7 +825,7 @@ static void LitEnc_EncodeMatched(CRangeEnc *p, CLzmaProb *probs, UInt32 sym, UIn bit = (sym >> 7) & 1; sym <<= 1; offs &= ~(matchByte ^ sym); - RC_BIT(p, prob, bit); + RC_BIT(p, prob, bit) } while (sym < 0x10000); p->range = range; @@ -749,17 +852,17 @@ static void LzmaEnc_InitPriceTables(CProbPrice *ProbPrices) bitCount++; } } - ProbPrices[i] = (CProbPrice)((kNumBitModelTotalBits << kCyclesBits) - 15 - bitCount); + ProbPrices[i] = (CProbPrice)(((unsigned)kNumBitModelTotalBits << kCyclesBits) - 15 - bitCount); // printf("\n%3d: %5d", i, ProbPrices[i]); } } #define GET_PRICE(prob, bit) \ - p->ProbPrices[((prob) ^ (unsigned)(((-(int)(bit))) & (kBitModelTotal - 1))) >> kNumMoveReducingBits]; + p->ProbPrices[((prob) ^ (unsigned)(((-(int)(bit))) & (kBitModelTotal - 1))) >> kNumMoveReducingBits] #define GET_PRICEa(prob, bit) \ - ProbPrices[((prob) ^ (unsigned)((-((int)(bit))) & (kBitModelTotal - 1))) >> kNumMoveReducingBits]; + ProbPrices[((prob) ^ (unsigned)((-((int)(bit))) & (kBitModelTotal - 1))) >> kNumMoveReducingBits] #define GET_PRICE_0(prob) p->ProbPrices[(prob) >> kNumMoveReducingBits] #define GET_PRICE_1(prob) p->ProbPrices[((prob) ^ (kBitModelTotal - 1)) >> kNumMoveReducingBits] @@ -810,7 +913,7 @@ static void RcTree_ReverseEncode(CRangeEnc *rc, CLzmaProb *probs, unsigned numBi unsigned bit = sym & 1; // RangeEnc_EncodeBit(rc, probs + m, bit); sym >>= 1; - RC_BIT(rc, probs + m, bit); + RC_BIT(rc, probs + m, bit) m = (m << 1) | bit; } while (--numBits); @@ -833,15 +936,15 @@ static void LenEnc_Encode(CLenEnc *p, CRangeEnc *rc, unsigned sym, unsigned posS UInt32 range, ttt, newBound; CLzmaProb *probs = p->low; range = rc->range; - RC_BIT_PRE(rc, probs); + RC_BIT_PRE(rc, probs) if (sym >= kLenNumLowSymbols) { - RC_BIT_1(rc, probs); + RC_BIT_1(rc, probs) probs += kLenNumLowSymbols; - RC_BIT_PRE(rc, probs); + RC_BIT_PRE(rc, probs) if (sym >= kLenNumLowSymbols * 2) { - RC_BIT_1(rc, probs); + RC_BIT_1(rc, probs) rc->range = range; // RcTree_Encode(rc, p->high, kLenNumHighBits, sym - kLenNumLowSymbols * 2); LitEnc_Encode(rc, p->high, sym - kLenNumLowSymbols * 2); @@ -854,11 +957,11 @@ static void LenEnc_Encode(CLenEnc *p, CRangeEnc *rc, unsigned sym, unsigned posS { unsigned m; unsigned bit; - RC_BIT_0(rc, probs); + RC_BIT_0(rc, probs) probs += (posState << (1 + kLenNumLowBits)); - bit = (sym >> 2) ; RC_BIT(rc, probs + 1, bit); m = (1 << 1) + bit; - bit = (sym >> 1) & 1; RC_BIT(rc, probs + m, bit); m = (m << 1) + bit; - bit = sym & 1; RC_BIT(rc, probs + m, bit); + bit = (sym >> 2) ; RC_BIT(rc, probs + 1, bit) m = (1 << 1) + bit; + bit = (sym >> 1) & 1; RC_BIT(rc, probs + m, bit) m = (m << 1) + bit; + bit = sym & 1; RC_BIT(rc, probs + m, bit) rc->range = range; } } @@ -879,7 +982,7 @@ static void SetPrices_3(const CLzmaProb *probs, UInt32 startPrice, UInt32 *price } -MY_NO_INLINE static void MY_FAST_CALL LenPriceEnc_UpdateTables( +Z7_NO_INLINE static void 
Z7_FASTCALL LenPriceEnc_UpdateTables( CLenPriceEnc *p, unsigned numPosStates, const CLenEnc *enc, @@ -943,14 +1046,14 @@ MY_NO_INLINE static void MY_FAST_CALL LenPriceEnc_UpdateTables( UInt32 price = b; do { - unsigned bit = sym & 1; + const unsigned bit = sym & 1; sym >>= 1; price += GET_PRICEa(probs[sym], bit); } while (sym >= 2); { - unsigned prob = probs[(size_t)i + (1 << (kLenNumHighBits - 1))]; + const unsigned prob = probs[(size_t)i + (1 << (kLenNumHighBits - 1))]; prices[(size_t)i * 2 ] = price + GET_PRICEa_0(prob); prices[(size_t)i * 2 + 1] = price + GET_PRICEa_1(prob); } @@ -959,7 +1062,7 @@ MY_NO_INLINE static void MY_FAST_CALL LenPriceEnc_UpdateTables( { unsigned posState; - size_t num = (p->tableSize - kLenNumLowSymbols * 2) * sizeof(p->prices[0][0]); + const size_t num = (p->tableSize - kLenNumLowSymbols * 2) * sizeof(p->prices[0][0]); for (posState = 1; posState < numPosStates; posState++) memcpy(p->prices[posState] + kLenNumLowSymbols * 2, p->prices[0] + kLenNumLowSymbols * 2, num); } @@ -985,7 +1088,11 @@ static unsigned ReadMatchDistances(CLzmaEnc *p, unsigned *numPairsRes) p->additionalOffset++; p->numAvail = p->matchFinder.GetNumAvailableBytes(p->matchFinderObj); - numPairs = p->matchFinder.GetMatches(p->matchFinderObj, p->matches); + { + const UInt32 *d = p->matchFinder.GetMatches(p->matchFinderObj, p->matches); + // if (!d) { p->mf_Failure = True; *numPairsRes = 0; return 0; } + numPairs = (unsigned)(d - p->matches); + } *numPairsRes = numPairs; #ifdef SHOW_STAT @@ -1001,7 +1108,7 @@ static unsigned ReadMatchDistances(CLzmaEnc *p, unsigned *numPairsRes) if (numPairs == 0) return 0; { - unsigned len = p->matches[(size_t)numPairs - 2]; + const unsigned len = p->matches[(size_t)numPairs - 2]; if (len != p->numFastBytes) return len; { @@ -1011,7 +1118,7 @@ static unsigned ReadMatchDistances(CLzmaEnc *p, unsigned *numPairsRes) { const Byte *p1 = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1; const Byte *p2 = p1 + len; - ptrdiff_t dif = (ptrdiff_t)-1 - p->matches[(size_t)numPairs - 1]; + const ptrdiff_t dif = (ptrdiff_t)-1 - (ptrdiff_t)p->matches[(size_t)numPairs - 1]; const Byte *lim = p1 + numAvail; for (; p2 != lim && *p2 == p2[dif]; p2++) {} @@ -1037,7 +1144,7 @@ static unsigned ReadMatchDistances(CLzmaEnc *p, unsigned *numPairsRes) + GET_PRICE_1(p->isRep[state]) \ + GET_PRICE_0(p->isRepG0[state]) -MY_FORCE_INLINE +Z7_FORCE_INLINE static UInt32 GetPrice_PureRep(const CLzmaEnc *p, unsigned repIndex, size_t state, size_t posState) { UInt32 price; @@ -1167,6 +1274,8 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) repLens[i] = len; if (len > repLens[repMaxIndex]) repMaxIndex = i; + if (len == LZMA_MATCH_LEN_MAX) // 21.03 : optimization + break; } if (repLens[repMaxIndex] >= p->numFastBytes) @@ -1179,10 +1288,12 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) } matches = p->matches; + #define MATCHES matches + // #define MATCHES p->matches if (mainLen >= p->numFastBytes) { - p->backRes = matches[(size_t)numPairs - 1] + LZMA_NUM_REPS; + p->backRes = MATCHES[(size_t)numPairs - 1] + LZMA_NUM_REPS; MOVE_POS(p, mainLen - 1) return mainLen; } @@ -1212,7 +1323,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) LitEnc_GetPrice(probs, curByte, p->ProbPrices)); } - MakeAs_Lit(&p->opt[1]); + MakeAs_Lit(&p->opt[1]) matchPrice = GET_PRICE_1(p->isMatch[p->state][posState]); repMatchPrice = matchPrice + GET_PRICE_1(p->isRep[p->state]); @@ -1224,7 +1335,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) if (shortRepPrice < 
p->opt[1].price) { p->opt[1].price = shortRepPrice; - MakeAs_ShortRep(&p->opt[1]); + MakeAs_ShortRep(&p->opt[1]) } if (last < 2) { @@ -1276,13 +1387,13 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) if (len < 2) len = 2; else - while (len > matches[offs]) + while (len > MATCHES[offs]) offs += 2; for (; ; len++) { COptimal *opt; - UInt32 dist = matches[(size_t)offs + 1]; + UInt32 dist = MATCHES[(size_t)offs + 1]; UInt32 price = normalMatchPrice + GET_PRICE_LEN(&p->lenEnc, posState, len); unsigned lenToPosState = GetLenToPosState(len); @@ -1291,7 +1402,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) else { unsigned slot; - GetPosSlot2(dist, slot); + GetPosSlot2(dist, slot) price += p->alignPrices[dist & kAlignMask]; price += p->posSlotPrices[lenToPosState][slot]; } @@ -1306,7 +1417,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) opt->extra = 0; } - if (len == matches[offs]) + if (len == MATCHES[offs]) { offs += 2; if (offs == numPairs) @@ -1367,7 +1478,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) unsigned delta = best - cur; if (delta != 0) { - MOVE_POS(p, delta); + MOVE_POS(p, delta) } } cur = best; @@ -1514,7 +1625,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) { nextOpt->price = litPrice; nextOpt->len = 1; - MakeAs_Lit(nextOpt); + MakeAs_Lit(nextOpt) nextIsLit = True; } } @@ -1548,7 +1659,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) { nextOpt->price = shortRepPrice; nextOpt->len = 1; - MakeAs_ShortRep(nextOpt); + MakeAs_ShortRep(nextOpt) nextIsLit = False; } } @@ -1727,8 +1838,8 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) if (newLen > numAvail) { newLen = numAvail; - for (numPairs = 0; newLen > matches[numPairs]; numPairs += 2); - matches[numPairs] = (UInt32)newLen; + for (numPairs = 0; newLen > MATCHES[numPairs]; numPairs += 2); + MATCHES[numPairs] = (UInt32)newLen; numPairs += 2; } @@ -1747,12 +1858,12 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) } offs = 0; - while (startLen > matches[offs]) + while (startLen > MATCHES[offs]) offs += 2; - dist = matches[(size_t)offs + 1]; + dist = MATCHES[(size_t)offs + 1]; // if (dist >= kNumFullDistances) - GetPosSlot2(dist, posSlot); + GetPosSlot2(dist, posSlot) for (len = /*2*/ startLen; ; len++) { @@ -1776,7 +1887,7 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) } } - if (len == matches[offs]) + if (len == MATCHES[offs]) { // if (p->_maxMode) { // MATCH : LIT : REP_0 @@ -1841,9 +1952,9 @@ static unsigned GetOptimum(CLzmaEnc *p, UInt32 position) offs += 2; if (offs == numPairs) break; - dist = matches[(size_t)offs + 1]; + dist = MATCHES[(size_t)offs + 1]; // if (dist >= kNumFullDistances) - GetPosSlot2(dist, posSlot); + GetPosSlot2(dist, posSlot) } } } @@ -2019,7 +2130,7 @@ static void WriteEndMarker(CLzmaEnc *p, unsigned posState) { UInt32 ttt, newBound; RC_BIT_PRE(p, probs + m) - RC_BIT_1(&p->rc, probs + m); + RC_BIT_1(&p->rc, probs + m) m = (m << 1) + 1; } while (m < (1 << kNumPosSlotBits)); @@ -2044,7 +2155,7 @@ static void WriteEndMarker(CLzmaEnc *p, unsigned posState) { UInt32 ttt, newBound; RC_BIT_PRE(p, probs + m) - RC_BIT_1(&p->rc, probs + m); + RC_BIT_1(&p->rc, probs + m) m = (m << 1) + 1; } while (m < kAlignTableSize); @@ -2059,15 +2170,30 @@ static SRes CheckErrors(CLzmaEnc *p) return p->result; if (p->rc.res != SZ_OK) p->result = SZ_ERROR_WRITE; - if (p->matchFinderBase.result != SZ_OK) + + #ifndef Z7_ST + if ( + // p->mf_Failure || + (p->mtMode && + ( // p->matchFinderMt.failure_LZ_LZ || + 
p->matchFinderMt.failure_LZ_BT)) + ) + { + p->result = MY_HRES_ERROR_INTERNAL_ERROR; + // printf("\nCheckErrors p->matchFinderMt.failureLZ\n"); + } + #endif + + if (MFB.result != SZ_OK) p->result = SZ_ERROR_READ; + if (p->result != SZ_OK) p->finished = True; return p->result; } -MY_NO_INLINE static SRes Flush(CLzmaEnc *p, UInt32 nowPos) +Z7_NO_INLINE static SRes Flush(CLzmaEnc *p, UInt32 nowPos) { /* ReleaseMFStream(); */ p->finished = True; @@ -2079,7 +2205,7 @@ MY_NO_INLINE static SRes Flush(CLzmaEnc *p, UInt32 nowPos) } -MY_NO_INLINE static void FillAlignPrices(CLzmaEnc *p) +Z7_NO_INLINE static void FillAlignPrices(CLzmaEnc *p) { unsigned i; const CProbPrice *ProbPrices = p->ProbPrices; @@ -2103,7 +2229,7 @@ MY_NO_INLINE static void FillAlignPrices(CLzmaEnc *p) } -MY_NO_INLINE static void FillDistancesPrices(CLzmaEnc *p) +Z7_NO_INLINE static void FillDistancesPrices(CLzmaEnc *p) { // int y; for (y = 0; y < 100; y++) { @@ -2198,20 +2324,20 @@ MY_NO_INLINE static void FillDistancesPrices(CLzmaEnc *p) -void LzmaEnc_Construct(CLzmaEnc *p) +static void LzmaEnc_Construct(CLzmaEnc *p) { RangeEnc_Construct(&p->rc); - MatchFinder_Construct(&p->matchFinderBase); + MatchFinder_Construct(&MFB); - #ifndef _7ZIP_ST + #ifndef Z7_ST + p->matchFinderMt.MatchFinder = &MFB; MatchFinderMt_Construct(&p->matchFinderMt); - p->matchFinderMt.MatchFinder = &p->matchFinderBase; #endif { CLzmaEncProps props; LzmaEncProps_Init(&props); - LzmaEnc_SetProps(p, &props); + LzmaEnc_SetProps((CLzmaEncHandle)(void *)p, &props); } #ifndef LZMA_LOG_BSR @@ -2221,7 +2347,6 @@ void LzmaEnc_Construct(CLzmaEnc *p) LzmaEnc_InitPriceTables(p->ProbPrices); p->litProbs = NULL; p->saveState.litProbs = NULL; - } CLzmaEncHandle LzmaEnc_Create(ISzAllocPtr alloc) @@ -2233,7 +2358,7 @@ CLzmaEncHandle LzmaEnc_Create(ISzAllocPtr alloc) return p; } -void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAllocPtr alloc) +static void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->litProbs); ISzAlloc_Free(alloc, p->saveState.litProbs); @@ -2241,20 +2366,21 @@ void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAllocPtr alloc) p->saveState.litProbs = NULL; } -void LzmaEnc_Destruct(CLzmaEnc *p, ISzAllocPtr alloc, ISzAllocPtr allocBig) +static void LzmaEnc_Destruct(CLzmaEnc *p, ISzAllocPtr alloc, ISzAllocPtr allocBig) { - #ifndef _7ZIP_ST + #ifndef Z7_ST MatchFinderMt_Destruct(&p->matchFinderMt, allocBig); #endif - MatchFinder_Free(&p->matchFinderBase, allocBig); + MatchFinder_Free(&MFB, allocBig); LzmaEnc_FreeLits(p, alloc); RangeEnc_Free(&p->rc, alloc); } void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAllocPtr alloc, ISzAllocPtr allocBig) { - LzmaEnc_Destruct((CLzmaEnc *)p, alloc, allocBig); + // GET_CLzmaEnc_p + LzmaEnc_Destruct(p, alloc, allocBig); ISzAlloc_Free(alloc, p); } @@ -2265,13 +2391,19 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac UInt32 nowPos32, startPos32; if (p->needInit) { + #ifndef Z7_ST + if (p->mtMode) + { + RINOK(MatchFinderMt_InitMt(&p->matchFinderMt)) + } + #endif p->matchFinder.Init(p->matchFinderObj); p->needInit = 0; } if (p->finished) return p->result; - RINOK(CheckErrors(p)); + RINOK(CheckErrors(p)) nowPos32 = (UInt32)p->nowPos64; startPos32 = nowPos32; @@ -2334,7 +2466,7 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac const Byte *data; unsigned state; - RC_BIT_0(&p->rc, probs); + RC_BIT_0(&p->rc, probs) p->rc.range = range; data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - p->additionalOffset; probs = LIT_PROBS(nowPos32, *(data - 1)); @@ 
-2348,53 +2480,53 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac } else { - RC_BIT_1(&p->rc, probs); + RC_BIT_1(&p->rc, probs) probs = &p->isRep[p->state]; RC_BIT_PRE(&p->rc, probs) if (dist < LZMA_NUM_REPS) { - RC_BIT_1(&p->rc, probs); + RC_BIT_1(&p->rc, probs) probs = &p->isRepG0[p->state]; RC_BIT_PRE(&p->rc, probs) if (dist == 0) { - RC_BIT_0(&p->rc, probs); + RC_BIT_0(&p->rc, probs) probs = &p->isRep0Long[p->state][posState]; RC_BIT_PRE(&p->rc, probs) if (len != 1) { - RC_BIT_1_BASE(&p->rc, probs); + RC_BIT_1_BASE(&p->rc, probs) } else { - RC_BIT_0_BASE(&p->rc, probs); + RC_BIT_0_BASE(&p->rc, probs) p->state = kShortRepNextStates[p->state]; } } else { - RC_BIT_1(&p->rc, probs); + RC_BIT_1(&p->rc, probs) probs = &p->isRepG1[p->state]; RC_BIT_PRE(&p->rc, probs) if (dist == 1) { - RC_BIT_0_BASE(&p->rc, probs); + RC_BIT_0_BASE(&p->rc, probs) dist = p->reps[1]; } else { - RC_BIT_1(&p->rc, probs); + RC_BIT_1(&p->rc, probs) probs = &p->isRepG2[p->state]; RC_BIT_PRE(&p->rc, probs) if (dist == 2) { - RC_BIT_0_BASE(&p->rc, probs); + RC_BIT_0_BASE(&p->rc, probs) dist = p->reps[2]; } else { - RC_BIT_1_BASE(&p->rc, probs); + RC_BIT_1_BASE(&p->rc, probs) dist = p->reps[3]; p->reps[3] = p->reps[2]; } @@ -2418,7 +2550,7 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac else { unsigned posSlot; - RC_BIT_0(&p->rc, probs); + RC_BIT_0(&p->rc, probs) p->rc.range = range; p->state = kMatchNextStates[p->state]; @@ -2432,7 +2564,7 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac p->reps[0] = dist + 1; p->matchPriceCount++; - GetPosSlot(dist, posSlot); + GetPosSlot(dist, posSlot) // RcTree_Encode_PosSlot(&p->rc, p->posSlotEncoder[GetLenToPosState(len)], posSlot); { UInt32 sym = (UInt32)posSlot + (1 << kNumPosSlotBits); @@ -2443,7 +2575,7 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac CLzmaProb *prob = probs + (sym >> kNumPosSlotBits); UInt32 bit = (sym >> (kNumPosSlotBits - 1)) & 1; sym <<= 1; - RC_BIT(&p->rc, prob, bit); + RC_BIT(&p->rc, prob, bit) } while (sym < (1 << kNumPosSlotBits * 2)); p->rc.range = range; @@ -2487,10 +2619,10 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac { unsigned m = 1; unsigned bit; - bit = dist & 1; dist >>= 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit); m = (m << 1) + bit; - bit = dist & 1; dist >>= 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit); m = (m << 1) + bit; - bit = dist & 1; dist >>= 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit); m = (m << 1) + bit; - bit = dist & 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit); + bit = dist & 1; dist >>= 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit) m = (m << 1) + bit; + bit = dist & 1; dist >>= 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit) m = (m << 1) + bit; + bit = dist & 1; dist >>= 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit) m = (m << 1) + bit; + bit = dist & 1; RC_BIT(&p->rc, p->posAlignEncoder + m, bit) p->rc.range = range; // p->alignPriceCount++; } @@ -2522,12 +2654,12 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac // { int y; for (y = 0; y < 100; y++) { FillDistancesPrices(p); // }} - LenPriceEnc_UpdateTables(&p->lenEnc, 1 << p->pb, &p->lenProbs, p->ProbPrices); + LenPriceEnc_UpdateTables(&p->lenEnc, (unsigned)1 << p->pb, &p->lenProbs, p->ProbPrices); } if (p->repLenEncCounter <= 0) { p->repLenEncCounter = REP_LEN_COUNT; - LenPriceEnc_UpdateTables(&p->repLenEnc, 1 << p->pb, &p->repLenProbs, p->ProbPrices); + 
LenPriceEnc_UpdateTables(&p->repLenEnc, (unsigned)1 << p->pb, &p->repLenProbs, p->ProbPrices); } } @@ -2560,20 +2692,22 @@ SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpac static SRes LzmaEnc_Alloc(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig) { UInt32 beforeSize = kNumOpts; + UInt32 dictSize; + if (!RangeEnc_Alloc(&p->rc, alloc)) return SZ_ERROR_MEM; - #ifndef _7ZIP_ST - p->mtMode = (p->multiThread && !p->fastMode && (p->matchFinderBase.btMode != 0)); + #ifndef Z7_ST + p->mtMode = (p->multiThread && !p->fastMode && (MFB.btMode != 0)); #endif { - unsigned lclp = p->lc + p->lp; + const unsigned lclp = p->lc + p->lp; if (!p->litProbs || !p->saveState.litProbs || p->lclp != lclp) { LzmaEnc_FreeLits(p, alloc); - p->litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((UInt32)0x300 << lclp) * sizeof(CLzmaProb)); - p->saveState.litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((UInt32)0x300 << lclp) * sizeof(CLzmaProb)); + p->litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((size_t)0x300 * sizeof(CLzmaProb)) << lclp); + p->saveState.litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((size_t)0x300 * sizeof(CLzmaProb)) << lclp); if (!p->litProbs || !p->saveState.litProbs) { LzmaEnc_FreeLits(p, alloc); @@ -2583,36 +2717,55 @@ static SRes LzmaEnc_Alloc(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc, } } - p->matchFinderBase.bigHash = (Byte)(p->dictSize > kBigHashDicLimit ? 1 : 0); + MFB.bigHash = (Byte)(p->dictSize > kBigHashDicLimit ? 1 : 0); - if (beforeSize + p->dictSize < keepWindowSize) - beforeSize = keepWindowSize - p->dictSize; - #ifndef _7ZIP_ST + dictSize = p->dictSize; + if (dictSize == ((UInt32)2 << 30) || + dictSize == ((UInt32)3 << 30)) + { + /* 21.03 : here we reduce the dictionary for 2 reasons: + 1) we don't want 32-bit back_distance matches in decoder for 2 GB dictionary. + 2) we want to elimate useless last MatchFinder_Normalize3() for corner cases, + where data size is aligned for 1 GB: 5/6/8 GB. + That reducing must be >= 1 for such corner cases. */ + dictSize -= 1; + } + + if (beforeSize + dictSize < keepWindowSize) + beforeSize = keepWindowSize - dictSize; + + /* in worst case we can look ahead for + max(LZMA_MATCH_LEN_MAX, numFastBytes + 1 + numFastBytes) bytes. + we send larger value for (keepAfter) to MantchFinder_Create(): + (numFastBytes + LZMA_MATCH_LEN_MAX + 1) + */ + + #ifndef Z7_ST if (p->mtMode) { - RINOK(MatchFinderMt_Create(&p->matchFinderMt, p->dictSize, beforeSize, p->numFastBytes, - LZMA_MATCH_LEN_MAX - + 1 /* 18.04 */ - , allocBig)); + RINOK(MatchFinderMt_Create(&p->matchFinderMt, dictSize, beforeSize, + p->numFastBytes, LZMA_MATCH_LEN_MAX + 1 /* 18.04 */ + , allocBig)) p->matchFinderObj = &p->matchFinderMt; - p->matchFinderBase.bigHash = (Byte)( - (p->dictSize > kBigHashDicLimit && p->matchFinderBase.hashMask >= 0xFFFFFF) ? 1 : 0); + MFB.bigHash = (Byte)(MFB.hashMask >= 0xFFFFFF ? 
1 : 0); MatchFinderMt_CreateVTable(&p->matchFinderMt, &p->matchFinder); } else #endif { - if (!MatchFinder_Create(&p->matchFinderBase, p->dictSize, beforeSize, p->numFastBytes, LZMA_MATCH_LEN_MAX, allocBig)) + if (!MatchFinder_Create(&MFB, dictSize, beforeSize, + p->numFastBytes, LZMA_MATCH_LEN_MAX + 1 /* 21.03 */ + , allocBig)) return SZ_ERROR_MEM; - p->matchFinderObj = &p->matchFinderBase; - MatchFinder_CreateVTable(&p->matchFinderBase, &p->matchFinder); + p->matchFinderObj = &MFB; + MatchFinder_CreateVTable(&MFB, &p->matchFinder); } return SZ_OK; } -void LzmaEnc_Init(CLzmaEnc *p) +static void LzmaEnc_Init(CLzmaEnc *p) { unsigned i; p->state = 0; @@ -2655,8 +2808,8 @@ void LzmaEnc_Init(CLzmaEnc *p) } { - UInt32 num = (UInt32)0x300 << (p->lp + p->lc); - UInt32 k; + const size_t num = (size_t)0x300 << (p->lp + p->lc); + size_t k; CLzmaProb *probs = p->litProbs; for (k = 0; k < num; k++) probs[k] = kProbInitValue; @@ -2676,12 +2829,14 @@ void LzmaEnc_Init(CLzmaEnc *p) p->additionalOffset = 0; - p->pbMask = (1 << p->pb) - 1; + p->pbMask = ((unsigned)1 << p->pb) - 1; p->lpMask = ((UInt32)0x100 << p->lp) - ((unsigned)0x100 >> p->lc); + + // p->mf_Failure = False; } -void LzmaEnc_InitPrices(CLzmaEnc *p) +static void LzmaEnc_InitPrices(CLzmaEnc *p) { if (!p->fastMode) { @@ -2695,8 +2850,8 @@ void LzmaEnc_InitPrices(CLzmaEnc *p) p->repLenEncCounter = REP_LEN_COUNT; - LenPriceEnc_UpdateTables(&p->lenEnc, 1 << p->pb, &p->lenProbs, p->ProbPrices); - LenPriceEnc_UpdateTables(&p->repLenEnc, 1 << p->pb, &p->repLenProbs, p->ProbPrices); + LenPriceEnc_UpdateTables(&p->lenEnc, (unsigned)1 << p->pb, &p->lenProbs, p->ProbPrices); + LenPriceEnc_UpdateTables(&p->repLenEnc, (unsigned)1 << p->pb, &p->repLenProbs, p->ProbPrices); } static SRes LzmaEnc_AllocAndInit(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig) @@ -2709,59 +2864,59 @@ static SRes LzmaEnc_AllocAndInit(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr p->finished = False; p->result = SZ_OK; - RINOK(LzmaEnc_Alloc(p, keepWindowSize, alloc, allocBig)); + p->nowPos64 = 0; + p->needInit = 1; + RINOK(LzmaEnc_Alloc(p, keepWindowSize, alloc, allocBig)) LzmaEnc_Init(p); LzmaEnc_InitPrices(p); - p->nowPos64 = 0; return SZ_OK; } -SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, +SRes LzmaEnc_Prepare(CLzmaEncHandle p, + ISeqOutStreamPtr outStream, + ISeqInStreamPtr inStream, ISzAllocPtr alloc, ISzAllocPtr allocBig) { - CLzmaEnc *p = (CLzmaEnc *)pp; - p->matchFinderBase.stream = inStream; - p->needInit = 1; + // GET_CLzmaEnc_p + MatchFinder_SET_STREAM(&MFB, inStream) p->rc.outStream = outStream; return LzmaEnc_AllocAndInit(p, 0, alloc, allocBig); } -SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle pp, - ISeqInStream *inStream, UInt32 keepWindowSize, - ISzAllocPtr alloc, ISzAllocPtr allocBig) +BoolInt LzmaEnc_IsFinished(CLzmaEncHandle pp) { CLzmaEnc *p = (CLzmaEnc *)pp; - p->matchFinderBase.stream = inStream; - p->needInit = 1; - return LzmaEnc_AllocAndInit(p, keepWindowSize, alloc, allocBig); + return p->finished; } -static void LzmaEnc_SetInputBuf(CLzmaEnc *p, const Byte *src, SizeT srcLen) +SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle p, + ISeqInStreamPtr inStream, UInt32 keepWindowSize, + ISzAllocPtr alloc, ISzAllocPtr allocBig) { - p->matchFinderBase.directInput = 1; - p->matchFinderBase.bufferBase = (Byte *)src; - p->matchFinderBase.directInputRem = srcLen; + // GET_CLzmaEnc_p + MatchFinder_SET_STREAM(&MFB, inStream) + return LzmaEnc_AllocAndInit(p, keepWindowSize, alloc, allocBig); } -SRes 
LzmaEnc_MemPrepare(CLzmaEncHandle pp, const Byte *src, SizeT srcLen, - UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig) +SRes LzmaEnc_MemPrepare(CLzmaEncHandle p, + const Byte *src, SizeT srcLen, + UInt32 keepWindowSize, + ISzAllocPtr alloc, ISzAllocPtr allocBig) { - CLzmaEnc *p = (CLzmaEnc *)pp; - LzmaEnc_SetInputBuf(p, src, srcLen); - p->needInit = 1; - - LzmaEnc_SetDataSize(pp, srcLen); + // GET_CLzmaEnc_p + MatchFinder_SET_DIRECT_INPUT_BUF(&MFB, src, srcLen) + LzmaEnc_SetDataSize(p, srcLen); return LzmaEnc_AllocAndInit(p, keepWindowSize, alloc, allocBig); } -void LzmaEnc_Finish(CLzmaEncHandle pp) +void LzmaEnc_Finish(CLzmaEncHandle p) { - #ifndef _7ZIP_ST - CLzmaEnc *p = (CLzmaEnc *)pp; + #ifndef Z7_ST + // GET_CLzmaEnc_p if (p->mtMode) MatchFinderMt_ReleaseStream(&p->matchFinderMt); #else - UNUSED_VAR(pp); + UNUSED_VAR(p) #endif } @@ -2770,43 +2925,48 @@ typedef struct { ISeqOutStream vt; Byte *data; - SizeT rem; + size_t rem; BoolInt overflow; } CLzmaEnc_SeqOutStreamBuf; -static size_t SeqOutStreamBuf_Write(const ISeqOutStream *pp, const void *data, size_t size) +static size_t SeqOutStreamBuf_Write(ISeqOutStreamPtr pp, const void *data, size_t size) { - CLzmaEnc_SeqOutStreamBuf *p = CONTAINER_FROM_VTBL(pp, CLzmaEnc_SeqOutStreamBuf, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CLzmaEnc_SeqOutStreamBuf) if (p->rem < size) { size = p->rem; p->overflow = True; } - memcpy(p->data, data, size); - p->rem -= size; - p->data += size; + if (size != 0) + { + memcpy(p->data, data, size); + p->rem -= size; + p->data += size; + } return size; } -UInt32 LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle pp) +/* +UInt32 LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle p) { - const CLzmaEnc *p = (CLzmaEnc *)pp; + GET_const_CLzmaEnc_p return p->matchFinder.GetNumAvailableBytes(p->matchFinderObj); } +*/ - -const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle pp) +const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle p) { - const CLzmaEnc *p = (CLzmaEnc *)pp; + // GET_const_CLzmaEnc_p return p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - p->additionalOffset; } -SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, BoolInt reInit, +// (desiredPackSize == 0) is not allowed +SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle p, BoolInt reInit, Byte *dest, size_t *destLen, UInt32 desiredPackSize, UInt32 *unpackSize) { - CLzmaEnc *p = (CLzmaEnc *)pp; + // GET_CLzmaEnc_p UInt64 nowPos64; SRes res; CLzmaEnc_SeqOutStreamBuf outStream; @@ -2823,14 +2983,10 @@ SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, BoolInt reInit, if (reInit) LzmaEnc_Init(p); LzmaEnc_InitPrices(p); - - nowPos64 = p->nowPos64; RangeEnc_Init(&p->rc); p->rc.outStream = &outStream.vt; - - if (desiredPackSize == 0) - return SZ_ERROR_OUTPUT_EOF; - + nowPos64 = p->nowPos64; + res = LzmaEnc_CodeOneBlock(p, desiredPackSize, *unpackSize); *unpackSize = (UInt32)(p->nowPos64 - nowPos64); @@ -2842,11 +2998,12 @@ SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, BoolInt reInit, } -static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgress *progress) +Z7_NO_INLINE +static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgressPtr progress) { SRes res = SZ_OK; - #ifndef _7ZIP_ST + #ifndef Z7_ST Byte allocaDummy[0x300]; allocaDummy[0] = 0; allocaDummy[1] = allocaDummy[0]; @@ -2868,10 +3025,10 @@ static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgress *progress) } } - LzmaEnc_Finish(p); + LzmaEnc_Finish((CLzmaEncHandle)(void *)p); /* - if (res == SZ_OK && !Inline_MatchFinder_IsFinishedOK(&p->matchFinderBase)) + if (res == SZ_OK && 
!Inline_MatchFinder_IsFinishedOK(&MFB)) res = SZ_ERROR_FAIL; } */ @@ -2880,53 +3037,63 @@ static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgress *progress) } -SRes LzmaEnc_Encode(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, ICompressProgress *progress, +SRes LzmaEnc_Encode(CLzmaEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig) { - RINOK(LzmaEnc_Prepare(pp, outStream, inStream, alloc, allocBig)); - return LzmaEnc_Encode2((CLzmaEnc *)pp, progress); + // GET_CLzmaEnc_p + RINOK(LzmaEnc_Prepare(p, outStream, inStream, alloc, allocBig)) + return LzmaEnc_Encode2(p, progress); } -SRes LzmaEnc_WriteProperties(CLzmaEncHandle pp, Byte *props, SizeT *size) +SRes LzmaEnc_WriteProperties(CLzmaEncHandle p, Byte *props, SizeT *size) { - CLzmaEnc *p = (CLzmaEnc *)pp; - unsigned i; - UInt32 dictSize = p->dictSize; if (*size < LZMA_PROPS_SIZE) return SZ_ERROR_PARAM; *size = LZMA_PROPS_SIZE; - props[0] = (Byte)((p->pb * 5 + p->lp) * 9 + p->lc); - - if (dictSize >= ((UInt32)1 << 22)) { - UInt32 kDictMask = ((UInt32)1 << 20) - 1; - if (dictSize < (UInt32)0xFFFFFFFF - kDictMask) - dictSize = (dictSize + kDictMask) & ~kDictMask; - } - else for (i = 11; i <= 30; i++) - { - if (dictSize <= ((UInt32)2 << i)) { dictSize = (2 << i); break; } - if (dictSize <= ((UInt32)3 << i)) { dictSize = (3 << i); break; } - } + // GET_CLzmaEnc_p + const UInt32 dictSize = p->dictSize; + UInt32 v; + props[0] = (Byte)((p->pb * 5 + p->lp) * 9 + p->lc); + + // we write aligned dictionary value to properties for lzma decoder + if (dictSize >= ((UInt32)1 << 21)) + { + const UInt32 kDictMask = ((UInt32)1 << 20) - 1; + v = (dictSize + kDictMask) & ~kDictMask; + if (v < dictSize) + v = dictSize; + } + else + { + unsigned i = 11 * 2; + do + { + v = (UInt32)(2 + (i & 1)) << (i >> 1); + i++; + } + while (v < dictSize); + } - for (i = 0; i < 4; i++) - props[1 + i] = (Byte)(dictSize >> (8 * i)); - return SZ_OK; + SetUi32(props + 1, v) + return SZ_OK; + } } -unsigned LzmaEnc_IsWriteEndMark(CLzmaEncHandle pp) +unsigned LzmaEnc_IsWriteEndMark(CLzmaEncHandle p) { - return ((CLzmaEnc *)pp)->writeEndMark; + // GET_CLzmaEnc_p + return (unsigned)p->writeEndMark; } -SRes LzmaEnc_MemEncode(CLzmaEncHandle pp, Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, - int writeEndMark, ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig) +SRes LzmaEnc_MemEncode(CLzmaEncHandle p, Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, + int writeEndMark, ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig) { SRes res; - CLzmaEnc *p = (CLzmaEnc *)pp; + // GET_CLzmaEnc_p CLzmaEnc_SeqOutStreamBuf outStream; @@ -2938,7 +3105,7 @@ SRes LzmaEnc_MemEncode(CLzmaEncHandle pp, Byte *dest, SizeT *destLen, const Byte p->writeEndMark = writeEndMark; p->rc.outStream = &outStream.vt; - res = LzmaEnc_MemPrepare(pp, src, srcLen, 0, alloc, allocBig); + res = LzmaEnc_MemPrepare(p, src, srcLen, 0, alloc, allocBig); if (res == SZ_OK) { @@ -2947,7 +3114,7 @@ SRes LzmaEnc_MemEncode(CLzmaEncHandle pp, Byte *dest, SizeT *destLen, const Byte res = SZ_ERROR_FAIL; } - *destLen -= outStream.rem; + *destLen -= (SizeT)outStream.rem; if (outStream.overflow) return SZ_ERROR_OUTPUT_EOF; return res; @@ -2956,9 +3123,9 @@ SRes LzmaEnc_MemEncode(CLzmaEncHandle pp, Byte *dest, SizeT *destLen, const Byte SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, const CLzmaEncProps *props, Byte *propsEncoded, SizeT *propsSize, 
int writeEndMark, - ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig) + ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig) { - CLzmaEnc *p = (CLzmaEnc *)LzmaEnc_Create(alloc); + CLzmaEncHandle p = LzmaEnc_Create(alloc); SRes res; if (!p) return SZ_ERROR_MEM; @@ -2976,8 +3143,14 @@ SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, return res; } -BoolInt LzmaEnc_IsFinished(CLzmaEncHandle pp) + +/* +#ifndef Z7_ST +void LzmaEnc_GetLzThreads(CLzmaEncHandle p, HANDLE lz_threads[2]) { - CLzmaEnc *p = (CLzmaEnc *)pp; - return p->finished; + GET_const_CLzmaEnc_p + lz_threads[0] = p->matchFinderMt.hashSync.thread; + lz_threads[1] = p->matchFinderMt.btSync.thread; } +#endif +*/ diff --git a/src/sdk/C/LzmaEnc.h b/src/sdk/C/LzmaEnc.h index 37a0906..2071f42 100644 --- a/src/sdk/C/LzmaEnc.h +++ b/src/sdk/C/LzmaEnc.h @@ -1,8 +1,8 @@ /* LzmaEnc.h -- LZMA Encoder -2017-07-27 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ -#ifndef __LZMA_ENC_H -#define __LZMA_ENC_H +#ifndef ZIP7_INC_LZMA_ENC_H +#define ZIP7_INC_LZMA_ENC_H #include "7zTypes.h" @@ -10,7 +10,7 @@ EXTERN_C_BEGIN #define LZMA_PROPS_SIZE 5 -typedef struct _CLzmaEncProps +typedef struct { int level; /* 0 <= level <= 9 */ UInt32 dictSize; /* (1 << 12) <= dictSize <= (1 << 27) for 32-bit version @@ -23,12 +23,19 @@ typedef struct _CLzmaEncProps int fb; /* 5 <= fb <= 273, default = 32 */ int btMode; /* 0 - hashChain Mode, 1 - binTree mode - normal, default = 1 */ int numHashBytes; /* 2, 3 or 4, default = 4 */ + unsigned numHashOutBits; /* default = ? */ UInt32 mc; /* 1 <= mc <= (1 << 30), default = 32 */ unsigned writeEndMark; /* 0 - do not write EOPM, 1 - write EOPM, default = 0 */ int numThreads; /* 1 or 2, default = 2 */ + // int _pad; + Int32 affinityGroup; + UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1. 
Encoder uses this value to reduce dictionary size */ + + UInt64 affinity; + UInt64 affinityInGroup; } CLzmaEncProps; void LzmaEncProps_Init(CLzmaEncProps *p); @@ -49,7 +56,9 @@ UInt32 LzmaEncProps_GetDictSize(const CLzmaEncProps *props2); SZ_ERROR_THREAD - error in multithreading functions (only for Mt version) */ -typedef void * CLzmaEncHandle; +typedef struct CLzmaEnc CLzmaEnc; +typedef CLzmaEnc * CLzmaEncHandle; +// Z7_DECLARE_HANDLE(CLzmaEncHandle) CLzmaEncHandle LzmaEnc_Create(ISzAllocPtr alloc); void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAllocPtr alloc, ISzAllocPtr allocBig); @@ -59,23 +68,23 @@ void LzmaEnc_SetDataSize(CLzmaEncHandle p, UInt64 expectedDataSiize); SRes LzmaEnc_WriteProperties(CLzmaEncHandle p, Byte *properties, SizeT *size); unsigned LzmaEnc_IsWriteEndMark(CLzmaEncHandle p); -SRes LzmaEnc_Encode(CLzmaEncHandle p, ISeqOutStream *outStream, ISeqInStream *inStream, - ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); +SRes LzmaEnc_Encode(CLzmaEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, + ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); SRes LzmaEnc_MemEncode(CLzmaEncHandle p, Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, - int writeEndMark, ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); + int writeEndMark, ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); /* ---------- One Call Interface ---------- */ SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, const CLzmaEncProps *props, Byte *propsEncoded, SizeT *propsSize, int writeEndMark, - ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); + ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); EXTERN_C_END /* ---------- Streaming Interface ---------- */ -SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, ISzAllocPtr alloc, ISzAllocPtr allocBig); +SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, ISzAllocPtr alloc, ISzAllocPtr allocBig); SRes LzmaEnc_CodeOneBlock(CLzmaEncHandle pp, UInt32 maxPackSize, UInt32 maxUnpackSize); BoolInt LzmaEnc_IsFinished(CLzmaEncHandle pp); void LzmaEnc_Finish(CLzmaEncHandle pp); diff --git a/src/sdk/C/LzmaLib.c b/src/sdk/C/LzmaLib.c index 706e9e5..785e884 100644 --- a/src/sdk/C/LzmaLib.c +++ b/src/sdk/C/LzmaLib.c @@ -1,12 +1,14 @@ /* LzmaLib.c -- LZMA library wrapper -2015-06-13 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ + +#include "Precomp.h" #include "Alloc.h" #include "LzmaDec.h" #include "LzmaEnc.h" #include "LzmaLib.h" -MY_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen, +Z7_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen, unsigned char *outProps, size_t *outPropsSize, int level, /* 0 <= level <= 9, default = 5 */ unsigned dictSize, /* use (1 << N) or (3 << N). 
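A rough driver-loop sketch for the streaming interface declared at the end of the LzmaEnc.h hunk above, assuming the caller supplies its own ISeqInStream/ISeqOutStream objects and uses the SDK's default allocators g_Alloc/g_BigAlloc from Alloc.h; writing the 5-byte props header via LzmaEnc_WriteProperties is left out:

#include "Alloc.h"
#include "LzmaEnc.h"

static SRes StreamEncodeSketch(ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream)
{
    CLzmaEncProps props;
    SRes res;
    CLzmaEncHandle enc = LzmaEnc_Create(&g_Alloc);
    if (!enc)
        return SZ_ERROR_MEM;
    LzmaEncProps_Init(&props);
    props.level = 5;                  /* keep defaults for everything else */
    res = LzmaEnc_SetProps(enc, &props);
    if (res == SZ_OK)
        res = LzmaEnc_Prepare(enc, outStream, inStream, &g_Alloc, &g_BigAlloc);
    /* (0, 0): no per-block pack/unpack size limit */
    while (res == SZ_OK && !LzmaEnc_IsFinished(enc))
        res = LzmaEnc_CodeOneBlock(enc, 0, 0);
    LzmaEnc_Finish(enc);
    LzmaEnc_Destroy(enc, &g_Alloc, &g_BigAlloc);
    return res;
}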
4 KB < dictSize <= 128 MB */ @@ -32,7 +34,7 @@ MY_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char } -MY_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t *srcLen, +Z7_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t *srcLen, const unsigned char *props, size_t propsSize) { ELzmaStatus status; diff --git a/src/sdk/C/LzmaLib.h b/src/sdk/C/LzmaLib.h index 88fa87d..d7c0724 100644 --- a/src/sdk/C/LzmaLib.h +++ b/src/sdk/C/LzmaLib.h @@ -1,14 +1,14 @@ /* LzmaLib.h -- LZMA library interface -2013-01-18 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ -#ifndef __LZMA_LIB_H -#define __LZMA_LIB_H +#ifndef ZIP7_INC_LZMA_LIB_H +#define ZIP7_INC_LZMA_LIB_H #include "7zTypes.h" EXTERN_C_BEGIN -#define MY_STDAPI int MY_STD_CALL +#define Z7_STDAPI int Z7_STDCALL #define LZMA_PROPS_SIZE 5 @@ -40,14 +40,16 @@ outPropsSize - level - compression level: 0 <= level <= 9; level dictSize algo fb - 0: 16 KB 0 32 - 1: 64 KB 0 32 - 2: 256 KB 0 32 - 3: 1 MB 0 32 - 4: 4 MB 0 32 + 0: 64 KB 0 32 + 1: 256 KB 0 32 + 2: 1 MB 0 32 + 3: 4 MB 0 32 + 4: 16 MB 0 32 5: 16 MB 1 32 6: 32 MB 1 32 - 7+: 64 MB 1 64 + 7: 32 MB 1 64 + 8: 64 MB 1 64 + 9: 64 MB 1 64 The default value for "level" is 5. @@ -83,6 +85,11 @@ fb - Word size (the number of fast bytes). numThreads - The number of thereads. 1 or 2. The default value is 2. Fast mode (algo = 0) can use only 1 thread. +In: + dest - output data buffer + destLen - output data buffer size + src - input data + srcLen - input data size Out: destLen - processed output size Returns: @@ -93,7 +100,7 @@ numThreads - The number of thereads. 1 or 2. The default value is 2. SZ_ERROR_THREAD - errors in multithreading functions (only for Mt version) */ -MY_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen, +Z7_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen, unsigned char *outProps, size_t *outPropsSize, /* *outPropsSize must be = 5 */ int level, /* 0 <= level <= 9, default = 5 */ unsigned dictSize, /* default = (1 << 24) */ @@ -108,8 +115,8 @@ MY_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char LzmaUncompress -------------- In: - dest - output data - destLen - output data size + dest - output data buffer + destLen - output data buffer size src - input data srcLen - input data size Out: @@ -123,7 +130,7 @@ LzmaUncompress SZ_ERROR_INPUT_EOF - it needs more bytes in input buffer (src) */ -MY_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, SizeT *srcLen, +Z7_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, SizeT *srcLen, const unsigned char *props, size_t propsSize); EXTERN_C_END diff --git a/src/sdk/C/MtCoder.c b/src/sdk/C/MtCoder.c index 9535985..923b19a 100644 --- a/src/sdk/C/MtCoder.c +++ b/src/sdk/C/MtCoder.c @@ -1,28 +1,28 @@ /* MtCoder.c -- Multi-thread Coder -2018-07-04 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include "MtCoder.h" -#ifndef _7ZIP_ST +#ifndef Z7_ST -SRes MtProgressThunk_Progress(const ICompressProgress *pp, UInt64 inSize, UInt64 outSize) +static SRes MtProgressThunk_Progress(ICompressProgressPtr pp, UInt64 inSize, UInt64 outSize) { - CMtProgressThunk *thunk = CONTAINER_FROM_VTBL(pp, CMtProgressThunk, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CMtProgressThunk) UInt64 inSize2 = 0; 
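A hedged round-trip sketch for the one-call LzmaCompress/LzmaUncompress wrappers documented in the LzmaLib.h hunk above; the output-buffer sizing and the parameter values (level 5, 16 MB dictionary, default lc/lp/pb/fb, 2 threads) are illustrative assumptions:

#include <stdlib.h>
#include <string.h>
#include "LzmaLib.h"

static int RoundTripSketch(const unsigned char *src, size_t srcLen)
{
    unsigned char props[LZMA_PROPS_SIZE];
    size_t propsSize = LZMA_PROPS_SIZE;
    size_t packedLen = srcLen + srcLen / 3 + 128;      /* rough upper bound */
    unsigned char *packed = (unsigned char *)malloc(packedLen);
    unsigned char *unpacked = (unsigned char *)malloc(srcLen);
    int res = SZ_ERROR_MEM;
    if (packed && unpacked)
    {
        /* level 5, 16 MB dictionary, default lc/lp/pb/fb, 2 threads */
        res = LzmaCompress(packed, &packedLen, src, srcLen,
                           props, &propsSize,
                           5, 1 << 24, 3, 0, 2, 32, 2);
        if (res == SZ_OK)
        {
            size_t unpackedLen = srcLen;
            SizeT srcRead = (SizeT)packedLen;
            res = LzmaUncompress(unpacked, &unpackedLen, packed, &srcRead,
                                 props, propsSize);
            if (res == SZ_OK && (unpackedLen != srcLen || memcmp(unpacked, src, srcLen) != 0))
                res = SZ_ERROR_FAIL;
        }
    }
    free(packed);
    free(unpacked);
    return res;
}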
UInt64 outSize2 = 0; if (inSize != (UInt64)(Int64)-1) { - inSize2 = inSize - thunk->inSize; - thunk->inSize = inSize; + inSize2 = inSize - p->inSize; + p->inSize = inSize; } if (outSize != (UInt64)(Int64)-1) { - outSize2 = outSize - thunk->outSize; - thunk->outSize = outSize; + outSize2 = outSize - p->outSize; + p->outSize = outSize; } - return MtProgress_ProgressAdd(thunk->mtProgress, inSize2, outSize2); + return MtProgress_ProgressAdd(p->mtProgress, inSize2, outSize2); } @@ -36,25 +36,31 @@ void MtProgressThunk_CreateVTable(CMtProgressThunk *p) #define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; } -static WRes ArEvent_OptCreate_And_Reset(CEvent *p) -{ - if (Event_IsCreated(p)) - return Event_Reset(p); - return AutoResetEvent_CreateNotSignaled(p); -} - +static THREAD_FUNC_DECL ThreadFunc(void *pp); -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp); - -static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) +static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t +#ifdef _WIN32 + , CMtCoder * const mtc +#endif + ) { - WRes wres = ArEvent_OptCreate_And_Reset(&t->startEvent); + WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent); + // printf("\n====== MtCoderThread_CreateAndStart : \n"); if (wres == 0) { t->stop = False; if (!Thread_WasCreated(&t->thread)) - wres = Thread_Create(&t->thread, ThreadFunc, t); + { +#ifdef _WIN32 + if (mtc->numThreadGroups) + wres = Thread_Create_With_Group(&t->thread, ThreadFunc, t, + ThreadNextGroup_GetNext(&mtc->nextGroup), // group + 0); // affinityMask + else +#endif + wres = Thread_Create(&t->thread, ThreadFunc, t); + } if (wres == 0) wres = Event_Set(&t->startEvent); } @@ -64,14 +70,14 @@ static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) } +Z7_FORCE_INLINE static void MtCoderThread_Destruct(CMtCoderThread *t) { if (Thread_WasCreated(&t->thread)) { t->stop = 1; Event_Set(&t->startEvent); - Thread_Wait(&t->thread); - Thread_Close(&t->thread); + Thread_Wait_Close(&t->thread); } Event_Close(&t->startEvent); @@ -85,24 +91,6 @@ static void MtCoderThread_Destruct(CMtCoderThread *t) -static SRes FullRead(ISeqInStream *stream, Byte *data, size_t *processedSize) -{ - size_t size = *processedSize; - *processedSize = 0; - while (size != 0) - { - size_t cur = size; - SRes res = ISeqInStream_Read(stream, data, &cur); - *processedSize += cur; - data += cur; - size -= cur; - RINOK(res); - if (cur == 0) - return SZ_OK; - } - return SZ_OK; -} - /* ThreadFunc2() returns: @@ -112,7 +100,7 @@ static SRes FullRead(ISeqInStream *stream, Byte *data, size_t *processedSize) static SRes ThreadFunc2(CMtCoderThread *t) { - CMtCoder *mtc = t->mtCoder; + CMtCoder * const mtc = t->mtCoder; for (;;) { @@ -153,7 +141,7 @@ static SRes ThreadFunc2(CMtCoderThread *t) } if (res == SZ_OK) { - res = FullRead(mtc->inStream, t->inBuf, &size); + res = SeqInStream_ReadMax(mtc->inStream, t->inBuf, &size); readProcessed = mtc->readProcessed + size; mtc->readProcessed = readProcessed; } @@ -212,7 +200,11 @@ static SRes ThreadFunc2(CMtCoderThread *t) if (mtc->numStartedThreads < mtc->numStartedThreadsLimit && mtc->expectedDataSize != readProcessed) { - res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]); + res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads] +#ifdef _WIN32 + , mtc +#endif + ); if (res == SZ_OK) mtc->numStartedThreads++; else @@ -248,13 +240,13 @@ static SRes ThreadFunc2(CMtCoderThread *t) } { - CMtCoderBlock *block = &mtc->blocks[bi]; + CMtCoderBlock * const block = &mtc->blocks[bi]; 
block->res = res; block->bufIndex = bufIndex; block->finished = finished; } - #ifdef MTCODER__USE_WRITE_THREAD + #ifdef MTCODER_USE_WRITE_THREAD RINOK_THREAD(Event_Set(&mtc->writeEvents[bi])) #else { @@ -336,29 +328,29 @@ static SRes ThreadFunc2(CMtCoderThread *t) } -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp) +static THREAD_FUNC_DECL ThreadFunc(void *pp) { - CMtCoderThread *t = (CMtCoderThread *)pp; + CMtCoderThread * const t = (CMtCoderThread *)pp; for (;;) { if (Event_Wait(&t->startEvent) != 0) - return SZ_ERROR_THREAD; + return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; if (t->stop) return 0; { - SRes res = ThreadFunc2(t); + const SRes res = ThreadFunc2(t); CMtCoder *mtc = t->mtCoder; if (res != SZ_OK) { MtProgress_SetError(&mtc->mtProgress, res); } - #ifndef MTCODER__USE_WRITE_THREAD + #ifndef MTCODER_USE_WRITE_THREAD { - unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); + const unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); if (numFinished == mtc->numStartedThreads) if (Event_Set(&mtc->finishedEvent) != 0) - return SZ_ERROR_THREAD; + return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; } #endif } @@ -373,6 +365,7 @@ void MtCoder_Construct(CMtCoder *p) p->blockSize = 0; p->numThreadsMax = 0; + p->numThreadGroups = 0; p->expectedDataSize = (UInt64)(Int64)-1; p->inStream = NULL; @@ -390,7 +383,7 @@ void MtCoder_Construct(CMtCoder *p) Event_Construct(&p->readEvent); Semaphore_Construct(&p->blocksSemaphore); - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) { CMtCoderThread *t = &p->threads[i]; t->mtCoder = p; @@ -398,11 +391,11 @@ void MtCoder_Construct(CMtCoder *p) t->inBuf = NULL; t->stop = False; Event_Construct(&t->startEvent); - Thread_Construct(&t->thread); + Thread_CONSTRUCT(&t->thread) } - #ifdef MTCODER__USE_WRITE_THREAD - for (i = 0; i < MTCODER__BLOCKS_MAX; i++) + #ifdef MTCODER_USE_WRITE_THREAD + for (i = 0; i < MTCODER_BLOCKS_MAX; i++) Event_Construct(&p->writeEvents[i]); #else Event_Construct(&p->finishedEvent); @@ -425,14 +418,14 @@ static void MtCoder_Free(CMtCoder *p) Event_Set(&p->readEvent); */ - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) MtCoderThread_Destruct(&p->threads[i]); Event_Close(&p->readEvent); Semaphore_Close(&p->blocksSemaphore); - #ifdef MTCODER__USE_WRITE_THREAD - for (i = 0; i < MTCODER__BLOCKS_MAX; i++) + #ifdef MTCODER_USE_WRITE_THREAD + for (i = 0; i < MTCODER_BLOCKS_MAX; i++) Event_Close(&p->writeEvents[i]); #else Event_Close(&p->finishedEvent); @@ -456,20 +449,22 @@ SRes MtCoder_Code(CMtCoder *p) unsigned i; SRes res = SZ_OK; - if (numThreads > MTCODER__THREADS_MAX) - numThreads = MTCODER__THREADS_MAX; - numBlocksMax = MTCODER__GET_NUM_BLOCKS_FROM_THREADS(numThreads); + // printf("\n====== MtCoder_Code : \n"); + + if (numThreads > MTCODER_THREADS_MAX) + numThreads = MTCODER_THREADS_MAX; + numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); if (p->blockSize < ((UInt32)1 << 26)) numBlocksMax++; if (p->blockSize < ((UInt32)1 << 24)) numBlocksMax++; if (p->blockSize < ((UInt32)1 << 22)) numBlocksMax++; - if (numBlocksMax > MTCODER__BLOCKS_MAX) - numBlocksMax = MTCODER__BLOCKS_MAX; + if (numBlocksMax > MTCODER_BLOCKS_MAX) + numBlocksMax = MTCODER_BLOCKS_MAX; if (p->blockSize != p->allocatedBufsSize) { - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) { CMtCoderThread *t = &p->threads[i]; if (t->inBuf) @@ -485,28 +480,23 @@ SRes 
MtCoder_Code(CMtCoder *p) MtProgress_Init(&p->mtProgress, p->progress); - #ifdef MTCODER__USE_WRITE_THREAD + #ifdef MTCODER_USE_WRITE_THREAD for (i = 0; i < numBlocksMax; i++) { - RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->writeEvents[i])); + RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->writeEvents[i])) } #else - RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->finishedEvent)); + RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->finishedEvent)) #endif { - RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->readEvent)); - - if (Semaphore_IsCreated(&p->blocksSemaphore)) - { - RINOK_THREAD(Semaphore_Close(&p->blocksSemaphore)); - } - RINOK_THREAD(Semaphore_Create(&p->blocksSemaphore, numBlocksMax, numBlocksMax)); + RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->readEvent)) + RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, (UInt32)numBlocksMax, (UInt32)numBlocksMax)) } - for (i = 0; i < MTCODER__BLOCKS_MAX - 1; i++) + for (i = 0; i < MTCODER_BLOCKS_MAX - 1; i++) p->freeBlockList[i] = i + 1; - p->freeBlockList[MTCODER__BLOCKS_MAX - 1] = (unsigned)(int)-1; + p->freeBlockList[MTCODER_BLOCKS_MAX - 1] = (unsigned)(int)-1; p->freeBlockHead = 0; p->readProcessed = 0; @@ -514,26 +504,37 @@ SRes MtCoder_Code(CMtCoder *p) p->numBlocksMax = numBlocksMax; p->stopReading = False; - #ifndef MTCODER__USE_WRITE_THREAD + #ifndef MTCODER_USE_WRITE_THREAD p->writeIndex = 0; p->writeRes = SZ_OK; - for (i = 0; i < MTCODER__BLOCKS_MAX; i++) + for (i = 0; i < MTCODER_BLOCKS_MAX; i++) p->ReadyBlocks[i] = False; p->numFinishedThreads = 0; #endif p->numStartedThreadsLimit = numThreads; p->numStartedThreads = 0; + ThreadNextGroup_Init(&p->nextGroup, p->numThreadGroups, 0); // startGroup // for (i = 0; i < numThreads; i++) { + // here we create new thread for first block. + // And each new thread will create another new thread after block reading + // until numStartedThreadsLimit is reached. 
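/*
 * Sketch (not part of the upstream patch) of the index-based free list that
 * MtCoder_Code() initializes above: freeBlockList[i] holds the index of the
 * next free block, (unsigned)-1 terminates the chain, and freeBlockHead
 * points at the first free entry.  The pop/push helpers are an assumption
 * about how such a list is typically consumed; the real consumers live in
 * hunks not shown here.
 */
#include <assert.h>

#define BLOCKS 8
#define NIL ((unsigned)(int)-1)

static unsigned freeList[BLOCKS];
static unsigned freeHead;

static void init_free_list(void)
{
  unsigned i;
  for (i = 0; i < BLOCKS - 1; i++)
    freeList[i] = i + 1;
  freeList[BLOCKS - 1] = NIL;
  freeHead = 0;
}

static unsigned pop_block(void)        /* take a free block index, or NIL */
{
  unsigned bi = freeHead;
  if (bi != NIL)
    freeHead = freeList[bi];
  return bi;
}

static void push_block(unsigned bi)    /* return a block to the list */
{
  freeList[bi] = freeHead;
  freeHead = bi;
}

int main(void)
{
  init_free_list();
  assert(pop_block() == 0);
  assert(pop_block() == 1);
  push_block(0);
  assert(pop_block() == 0);            /* LIFO reuse of released blocks */
  return 0;
}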
CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++]; - RINOK(MtCoderThread_CreateAndStart(nextThread)); + { + const SRes res2 = MtCoderThread_CreateAndStart(nextThread +#ifdef _WIN32 + , p +#endif + ); + RINOK(res2) + } } RINOK_THREAD(Event_Set(&p->readEvent)) - #ifdef MTCODER__USE_WRITE_THREAD + #ifdef MTCODER_USE_WRITE_THREAD { unsigned bi = 0; @@ -545,9 +546,9 @@ SRes MtCoder_Code(CMtCoder *p) RINOK_THREAD(Event_Wait(&p->writeEvents[bi])) { - const CMtCoderBlock *block = &p->blocks[bi]; - unsigned bufIndex = block->bufIndex; - BoolInt finished = block->finished; + const CMtCoderBlock * const block = &p->blocks[bi]; + const unsigned bufIndex = block->bufIndex; + const BoolInt finished = block->finished; if (res == SZ_OK && block->res != SZ_OK) res = block->res; @@ -577,7 +578,7 @@ SRes MtCoder_Code(CMtCoder *p) } #else { - WRes wres = Event_Wait(&p->finishedEvent); + const WRes wres = Event_Wait(&p->finishedEvent); res = MY_SRes_HRESULT_FROM_WRes(wres); } #endif @@ -588,7 +589,7 @@ SRes MtCoder_Code(CMtCoder *p) if (res == SZ_OK) res = p->mtProgress.res; - #ifndef MTCODER__USE_WRITE_THREAD + #ifndef MTCODER_USE_WRITE_THREAD if (res == SZ_OK) res = p->writeRes; #endif @@ -599,3 +600,5 @@ SRes MtCoder_Code(CMtCoder *p) } #endif + +#undef RINOK_THREAD diff --git a/src/sdk/C/MtCoder.h b/src/sdk/C/MtCoder.h index 5a5f4d1..8166cca 100644 --- a/src/sdk/C/MtCoder.h +++ b/src/sdk/C/MtCoder.h @@ -1,30 +1,30 @@ /* MtCoder.h -- Multi-thread Coder -2018-07-04 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ -#ifndef __MT_CODER_H -#define __MT_CODER_H +#ifndef ZIP7_INC_MT_CODER_H +#define ZIP7_INC_MT_CODER_H #include "MtDec.h" EXTERN_C_BEGIN /* - if ( defined MTCODER__USE_WRITE_THREAD) : main thread writes all data blocks to output stream - if (not defined MTCODER__USE_WRITE_THREAD) : any coder thread can write data blocks to output stream + if ( defined MTCODER_USE_WRITE_THREAD) : main thread writes all data blocks to output stream + if (not defined MTCODER_USE_WRITE_THREAD) : any coder thread can write data blocks to output stream */ -/* #define MTCODER__USE_WRITE_THREAD */ +/* #define MTCODER_USE_WRITE_THREAD */ -#ifndef _7ZIP_ST - #define MTCODER__GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1) - #define MTCODER__THREADS_MAX 64 - #define MTCODER__BLOCKS_MAX (MTCODER__GET_NUM_BLOCKS_FROM_THREADS(MTCODER__THREADS_MAX) + 3) +#ifndef Z7_ST + #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1) + #define MTCODER_THREADS_MAX 256 + #define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3) #else - #define MTCODER__THREADS_MAX 1 - #define MTCODER__BLOCKS_MAX 1 + #define MTCODER_THREADS_MAX 1 + #define MTCODER_BLOCKS_MAX 1 #endif -#ifndef _7ZIP_ST +#ifndef Z7_ST typedef struct @@ -37,15 +37,15 @@ typedef struct void MtProgressThunk_CreateVTable(CMtProgressThunk *p); -#define MtProgressThunk_Init(p) { (p)->inSize = 0; (p)->outSize = 0; } +#define MtProgressThunk_INIT(p) { (p)->inSize = 0; (p)->outSize = 0; } -struct _CMtCoder; +struct CMtCoder_; typedef struct { - struct _CMtCoder *mtCoder; + struct CMtCoder_ *mtCoder; unsigned index; int stop; Byte *inBuf; @@ -71,19 +71,20 @@ typedef struct } CMtCoderBlock; -typedef struct _CMtCoder +typedef struct CMtCoder_ { /* input variables */ size_t blockSize; /* size of input block */ unsigned numThreadsMax; + unsigned numThreadGroups; UInt64 expectedDataSize; - ISeqInStream *inStream; + ISeqInStreamPtr inStream; const Byte *inData; 
size_t inDataSize; - ICompressProgress *progress; + ICompressProgressPtr progress; ISzAllocPtr allocBig; IMtCoderCallback2 *mtCallback; @@ -100,13 +101,13 @@ typedef struct _CMtCoder BoolInt stopReading; SRes readRes; - #ifdef MTCODER__USE_WRITE_THREAD - CAutoResetEvent writeEvents[MTCODER__BLOCKS_MAX]; + #ifdef MTCODER_USE_WRITE_THREAD + CAutoResetEvent writeEvents[MTCODER_BLOCKS_MAX]; #else CAutoResetEvent finishedEvent; SRes writeRes; unsigned writeIndex; - Byte ReadyBlocks[MTCODER__BLOCKS_MAX]; + Byte ReadyBlocks[MTCODER_BLOCKS_MAX]; LONG numFinishedThreads; #endif @@ -120,11 +121,13 @@ typedef struct _CMtCoder CCriticalSection cs; unsigned freeBlockHead; - unsigned freeBlockList[MTCODER__BLOCKS_MAX]; + unsigned freeBlockList[MTCODER_BLOCKS_MAX]; CMtProgress mtProgress; - CMtCoderBlock blocks[MTCODER__BLOCKS_MAX]; - CMtCoderThread threads[MTCODER__THREADS_MAX]; + CMtCoderBlock blocks[MTCODER_BLOCKS_MAX]; + CMtCoderThread threads[MTCODER_THREADS_MAX]; + + CThreadNextGroup nextGroup; } CMtCoder; diff --git a/src/sdk/C/MtDec.c b/src/sdk/C/MtDec.c index 7803bf2..96274b6 100644 --- a/src/sdk/C/MtDec.c +++ b/src/sdk/C/MtDec.c @@ -1,16 +1,21 @@ /* MtDec.c -- Multi-thread Decoder -2019-02-02 : Igor Pavlov : Public domain */ +2024-02-20 : Igor Pavlov : Public domain */ #include "Precomp.h" // #define SHOW_DEBUG_INFO // #include +#include #ifdef SHOW_DEBUG_INFO #include #endif +#include "MtDec.h" + +#ifndef Z7_ST + #ifdef SHOW_DEBUG_INFO #define PRF(x) x #else @@ -19,11 +24,7 @@ #define PRF_STR_INT(s, d) PRF(printf("\n" s " %d\n", (unsigned)d)) -#include "MtDec.h" - -#ifndef _7ZIP_ST - -void MtProgress_Init(CMtProgress *p, ICompressProgress *progress) +void MtProgress_Init(CMtProgress *p, ICompressProgressPtr progress) { p->progress = progress; p->res = SZ_OK; @@ -77,39 +78,31 @@ void MtProgress_SetError(CMtProgress *p, SRes res) } -#define RINOK_THREAD(x) RINOK(x) - - -static WRes ArEvent_OptCreate_And_Reset(CEvent *p) -{ - if (Event_IsCreated(p)) - return Event_Reset(p); - return AutoResetEvent_CreateNotSignaled(p); -} +#define RINOK_THREAD(x) RINOK_WRes(x) -struct __CMtDecBufLink +struct CMtDecBufLink_ { - struct __CMtDecBufLink *next; + struct CMtDecBufLink_ *next; void *pad[3]; }; -typedef struct __CMtDecBufLink CMtDecBufLink; +typedef struct CMtDecBufLink_ CMtDecBufLink; #define MTDEC__LINK_DATA_OFFSET sizeof(CMtDecBufLink) #define MTDEC__DATA_PTR_FROM_LINK(link) ((Byte *)(link) + MTDEC__LINK_DATA_OFFSET) -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp); +static THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp); static WRes MtDecThread_CreateEvents(CMtDecThread *t) { - WRes wres = ArEvent_OptCreate_And_Reset(&t->canWrite); + WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->canWrite); if (wres == 0) { - wres = ArEvent_OptCreate_And_Reset(&t->canRead); + wres = AutoResetEvent_OptCreate_And_Reset(&t->canRead); if (wres == 0) return SZ_OK; } @@ -125,7 +118,7 @@ static SRes MtDecThread_CreateAndStart(CMtDecThread *t) { if (Thread_WasCreated(&t->thread)) return SZ_OK; - wres = Thread_Create(&t->thread, ThreadFunc, t); + wres = Thread_Create(&t->thread, MtDec_ThreadFunc, t); if (wres == 0) return SZ_OK; } @@ -156,8 +149,7 @@ static void MtDecThread_CloseThread(CMtDecThread *t) { Event_Set(&t->canWrite); /* we can disable it. 
There are no threads waiting canWrite in normal cases */ Event_Set(&t->canRead); - Thread_Wait(&t->thread); - Thread_Close(&t->thread); + Thread_Wait_Close(&t->thread); } Event_Close(&t->canRead); @@ -167,7 +159,7 @@ static void MtDecThread_CloseThread(CMtDecThread *t) static void MtDec_CloseThreads(CMtDec *p) { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) MtDecThread_CloseThread(&p->threads[i]); } @@ -179,25 +171,6 @@ static void MtDecThread_Destruct(CMtDecThread *t) -static SRes FullRead(ISeqInStream *stream, Byte *data, size_t *processedSize) -{ - size_t size = *processedSize; - *processedSize = 0; - while (size != 0) - { - size_t cur = size; - SRes res = ISeqInStream_Read(stream, data, &cur); - *processedSize += cur; - data += cur; - size -= cur; - RINOK(res); - if (cur == 0) - return SZ_OK; - } - return SZ_OK; -} - - static SRes MtDec_GetError_Spec(CMtDec *p, UInt64 interruptIndex, BoolInt *wasInterrupted) { SRes res; @@ -253,7 +226,7 @@ Byte *MtDec_GetCrossBuff(CMtDec *p) /* - ThreadFunc2() returns: + MtDec_ThreadFunc2() returns: 0 - in all normal cases (even for stream error or memory allocation error) (!= 0) - WRes error return by system threading function */ @@ -261,11 +234,11 @@ Byte *MtDec_GetCrossBuff(CMtDec *p) // #define MTDEC_ProgessStep (1 << 22) #define MTDEC_ProgessStep (1 << 0) -static WRes ThreadFunc2(CMtDecThread *t) +static WRes MtDec_ThreadFunc2(CMtDecThread *t) { CMtDec *p = t->mtDec; - PRF_STR_INT("ThreadFunc2", t->index); + PRF_STR_INT("MtDec_ThreadFunc2", t->index) // SetThreadAffinityMask(GetCurrentThread(), 1 << t->index); @@ -289,18 +262,19 @@ static WRes ThreadFunc2(CMtDecThread *t) Byte *afterEndData = NULL; size_t afterEndData_Size = 0; + BoolInt afterEndData_IsCross = False; BoolInt canCreateNewThread = False; // CMtDecCallbackInfo parse; CMtDecThread *nextThread; - PRF_STR_INT("Event_Wait(&t->canRead)", t->index); + PRF_STR_INT("=============== Event_Wait(&t->canRead)", t->index) - RINOK_THREAD(Event_Wait(&t->canRead)); + RINOK_THREAD(Event_Wait(&t->canRead)) if (p->exitThread) return 0; - PRF_STR_INT("after Event_Wait(&t->canRead)", t->index); + PRF_STR_INT("after Event_Wait(&t->canRead)", t->index) // if (t->index == 3) return 19; // for test @@ -372,7 +346,7 @@ static WRes ThreadFunc2(CMtDecThread *t) { size = p->inBufSize; - res = FullRead(p->inStream, data, &size); + res = SeqInStream_ReadMax(p->inStream, data, &size); // size = 10; // test @@ -418,10 +392,12 @@ static WRes ThreadFunc2(CMtDecThread *t) parse.srcFinished = finish; parse.canCreateNewThread = True; - // PRF(printf("\nParse size = %d\n", (unsigned)size)) + PRF(printf("\nParse size = %d\n", (unsigned)size)); p->mtCallback->Parse(p->mtCallbackObject, t->index, &parse); + PRF(printf(" Parse processed = %d, state = %d \n", (unsigned)parse.srcSize, (unsigned)parse.state)); + needWrite = True; canCreateNewThread = parse.canCreateNewThread; @@ -478,16 +454,12 @@ static WRes ThreadFunc2(CMtDecThread *t) if (parse.state == MTDEC_PARSE_END) { - p->crossStart = 0; - p->crossEnd = 0; - - if (crossSize != 0) - memcpy(data + parse.srcSize, parseData + parse.srcSize, size - parse.srcSize); // we need all data - afterEndData_Size = size - parse.srcSize; afterEndData = parseData + parse.srcSize; - + afterEndData_Size = size - parse.srcSize; + if (crossSize != 0) + afterEndData_IsCross = True; // we reduce data size to required bytes (parsed only) - inDataSize -= (size - parse.srcSize); + inDataSize -= afterEndData_Size; if (!prev) inDataSize_Start = 
parse.srcSize; break; @@ -616,7 +588,7 @@ static WRes ThreadFunc2(CMtDecThread *t) // if ( !finish ) we must call Event_Set(&nextThread->canWrite) in any case // if ( finish ) we switch to single-thread mode and there are 2 ways at the end of current iteration (current block): // - if (needContinue) after Write(&needContinue), we restore decoding with new iteration - // - otherwise we stop decoding and exit from ThreadFunc2() + // - otherwise we stop decoding and exit from MtDec_ThreadFunc2() // Don't change (finish) variable in the further code @@ -689,7 +661,7 @@ static WRes ThreadFunc2(CMtDecThread *t) // ---------- WRITE ---------- - RINOK_THREAD(Event_Wait(&t->canWrite)); + RINOK_THREAD(Event_Wait(&t->canWrite)) { BoolInt isErrorMode = False; @@ -752,13 +724,15 @@ static WRes ThreadFunc2(CMtDecThread *t) { // p->inProcessed += inCodePos; + PRF(printf("\n--Write afterSize = %d\n", (unsigned)afterEndData_Size)); + res = p->mtCallback->Write(p->mtCallbackObject, t->index, res == SZ_OK && needWriteToStream && !wasInterrupted, // needWrite - afterEndData, afterEndData_Size, + afterEndData, afterEndData_Size, afterEndData_IsCross, &needContinue, &canRecode); - - // res= E_INVALIDARG; // for test + + // res = SZ_ERROR_FAIL; // for test PRF(printf("\nAfter Write needContinue = %d\n", (unsigned)needContinue)); PRF(printf("\nprocessed = %d\n", (unsigned)p->inProcessed)); @@ -800,14 +774,14 @@ static WRes ThreadFunc2(CMtDecThread *t) if (!finish) { - RINOK_THREAD(Event_Set(&nextThread->canWrite)); + RINOK_THREAD(Event_Set(&nextThread->canWrite)) } else { if (needContinue) { // we restore decoding with new iteration - RINOK_THREAD(Event_Set(&p->threads[0].canWrite)); + RINOK_THREAD(Event_Set(&p->threads[0].canWrite)) } else { @@ -816,7 +790,7 @@ static WRes ThreadFunc2(CMtDecThread *t) return SZ_OK; p->exitThread = True; } - RINOK_THREAD(Event_Set(&p->threads[0].canRead)); + RINOK_THREAD(Event_Set(&p->threads[0].canRead)) } } } @@ -835,7 +809,17 @@ static WRes ThreadFunc2(CMtDecThread *t) #endif -static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc1(void *pp) +typedef + #ifdef _WIN32 + UINT_PTR + #elif 1 + uintptr_t + #else + ptrdiff_t + #endif + MY_uintptr_t; + +static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp) { WRes res; @@ -844,10 +828,10 @@ static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc1(void *pp) // fprintf(stdout, "\n%d = %p\n", t->index, &t); - res = ThreadFunc2(t); + res = MtDec_ThreadFunc2(t); p = t->mtDec; if (res == 0) - return p->exitThreadWRes; + return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)p->exitThreadWRes; { // it's unexpected situation for some threading function error if (p->exitThreadWRes == 0) @@ -858,18 +842,17 @@ static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc1(void *pp) Event_Set(&p->threads[0].canWrite); MtProgress_SetError(&p->mtProgress, MY_SRes_HRESULT_FROM_WRes(res)); } - return res; + return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)res; } -static MY_NO_INLINE THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE ThreadFunc(void *pp) +static Z7_NO_INLINE THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp) { + #ifdef USE_ALLOCA CMtDecThread *t = (CMtDecThread *)pp; - // fprintf(stderr, "\n%d = %p - before", t->index, &t); - #ifdef USE_ALLOCA t->allocaPtr = alloca(t->index * 128); #endif - return ThreadFunc1(pp); + return MtDec_ThreadFunc1(pp); } @@ -883,7 +866,7 @@ int MtDec_PrepareRead(CMtDec *p) { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) if (i > p->numStartedThreads || p->numFilledThreads <= (i >= 
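/*
 * Sketch (not part of the upstream patch) of why the MtDec code above routes
 * thread results through an intermediate MY_uintptr_t cast: a small error
 * code has to travel through a pointer-sized thread return value, and casting
 * an int straight to or from a pointer type draws truncation/size warnings.
 * A pthread-style void* return is used here purely as an illustration; the
 * SDK's THREAD_FUNC_RET_TYPE differs per platform.
 */
#include <assert.h>
#include <stdint.h>

typedef void *thread_ret_t;            /* stand-in for THREAD_FUNC_RET_TYPE */

static thread_ret_t thread_func(int wres)
{
  /* widen to uintptr_t first, then to the pointer-sized return type */
  return (thread_ret_t)(uintptr_t)wres;
}

int main(void)
{
  thread_ret_t r = thread_func(12);    /* e.g. a small WRes/SRes error code */
  int wres = (int)(uintptr_t)r;        /* narrow back through the same type */
  assert(wres == 12);
  return 0;
}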
p->filledThreadStart ? @@ -987,7 +970,7 @@ void MtDec_Construct(CMtDec *p) p->allocatedBufsSize = 0; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CMtDecThread *t = &p->threads[i]; t->mtDec = p; @@ -995,7 +978,7 @@ void MtDec_Construct(CMtDec *p) t->inBuf = NULL; Event_Construct(&t->canRead); Event_Construct(&t->canWrite); - Thread_Construct(&t->thread); + Thread_CONSTRUCT(&t->thread) } // Event_Construct(&p->finishedEvent); @@ -1010,7 +993,7 @@ static void MtDec_Free(CMtDec *p) p->exitThread = True; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) MtDecThread_Destruct(&p->threads[i]); // Event_Close(&p->finishedEvent); @@ -1061,15 +1044,15 @@ SRes MtDec_Code(CMtDec *p) { unsigned numThreads = p->numThreadsMax; - if (numThreads > MTDEC__THREADS_MAX) - numThreads = MTDEC__THREADS_MAX; + if (numThreads > MTDEC_THREADS_MAX) + numThreads = MTDEC_THREADS_MAX; p->numStartedThreads_Limit = numThreads; p->numStartedThreads = 0; } if (p->inBufSize != p->allocatedBufsSize) { - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CMtDecThread *t = &p->threads[i]; if (t->inBuf) @@ -1086,19 +1069,20 @@ SRes MtDec_Code(CMtDec *p) MtProgress_Init(&p->mtProgress, p->progress); - // RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->finishedEvent)); + // RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->finishedEvent)) p->exitThread = False; p->exitThreadWRes = 0; { WRes wres; - WRes sres; + SRes sres; CMtDecThread *nextThread = &p->threads[p->numStartedThreads++]; // wres = MtDecThread_CreateAndStart(nextThread); wres = MtDecThread_CreateEvents(nextThread); if (wres == 0) { wres = Event_Set(&nextThread->canWrite); if (wres == 0) { wres = Event_Set(&nextThread->canRead); - if (wres == 0) { wres = ThreadFunc(nextThread); + if (wres == 0) { THREAD_FUNC_RET_TYPE res = MtDec_ThreadFunc(nextThread); + wres = (WRes)(MY_uintptr_t)res; if (wres != 0) { p->needContinue = False; @@ -1130,9 +1114,11 @@ SRes MtDec_Code(CMtDec *p) return SZ_OK; // if (sres != SZ_OK) - return sres; - // return E_FAIL; + return sres; + // return SZ_ERROR_FAIL; } } #endif + +#undef PRF diff --git a/src/sdk/C/MtDec.h b/src/sdk/C/MtDec.h index 9b57766..c28e8d9 100644 --- a/src/sdk/C/MtDec.h +++ b/src/sdk/C/MtDec.h @@ -1,46 +1,46 @@ /* MtDec.h -- Multi-thread Decoder -2018-07-04 : Igor Pavlov : Public domain */ +2023-04-02 : Igor Pavlov : Public domain */ -#ifndef __MT_DEC_H -#define __MT_DEC_H +#ifndef ZIP7_INC_MT_DEC_H +#define ZIP7_INC_MT_DEC_H #include "7zTypes.h" -#ifndef _7ZIP_ST +#ifndef Z7_ST #include "Threads.h" #endif EXTERN_C_BEGIN -#ifndef _7ZIP_ST +#ifndef Z7_ST -#ifndef _7ZIP_ST - #define MTDEC__THREADS_MAX 32 +#ifndef Z7_ST + #define MTDEC_THREADS_MAX 32 #else - #define MTDEC__THREADS_MAX 1 + #define MTDEC_THREADS_MAX 1 #endif typedef struct { - ICompressProgress *progress; + ICompressProgressPtr progress; SRes res; UInt64 totalInSize; UInt64 totalOutSize; CCriticalSection cs; } CMtProgress; -void MtProgress_Init(CMtProgress *p, ICompressProgress *progress); +void MtProgress_Init(CMtProgress *p, ICompressProgressPtr progress); SRes MtProgress_Progress_ST(CMtProgress *p); SRes MtProgress_ProgressAdd(CMtProgress *p, UInt64 inSize, UInt64 outSize); SRes MtProgress_GetError(CMtProgress *p); void MtProgress_SetError(CMtProgress *p, SRes res); -struct _CMtDec; +struct CMtDec; typedef struct { - struct _CMtDec *mtDec; + struct CMtDec_ *mtDec; unsigned index; void *inBuf; @@ -108,15 +108,16 @@ typedef struct */ SRes (*Write)(void 
*p, unsigned coderIndex, BoolInt needWriteToStream, - const Byte *src, size_t srcSize, + const Byte *src, size_t srcSize, BoolInt isCross, // int srcFinished, BoolInt *needContinue, BoolInt *canRecode); -} IMtDecCallback; +} IMtDecCallback2; -typedef struct _CMtDec + +typedef struct CMtDec_ { /* input variables */ @@ -125,14 +126,14 @@ typedef struct _CMtDec // size_t inBlockMax; unsigned numThreadsMax_2; - ISeqInStream *inStream; + ISeqInStreamPtr inStream; // const Byte *inData; // size_t inDataSize; - ICompressProgress *progress; + ICompressProgressPtr progress; ISzAllocPtr alloc; - IMtDecCallback *mtCallback; + IMtDecCallback2 *mtCallback; void *mtCallbackObject; @@ -170,11 +171,11 @@ typedef struct _CMtDec unsigned filledThreadStart; unsigned numFilledThreads; - #ifndef _7ZIP_ST + #ifndef Z7_ST BoolInt needInterrupt; UInt64 interruptIndex; CMtProgress mtProgress; - CMtDecThread threads[MTDEC__THREADS_MAX]; + CMtDecThread threads[MTDEC_THREADS_MAX]; #endif } CMtDec; diff --git a/src/sdk/C/Ppmd.h b/src/sdk/C/Ppmd.h index a5c1e3e..66b2626 100644 --- a/src/sdk/C/Ppmd.h +++ b/src/sdk/C/Ppmd.h @@ -1,15 +1,24 @@ /* Ppmd.h -- PPMD codec common code -2017-04-03 : Igor Pavlov : Public domain +2023-03-05 : Igor Pavlov : Public domain This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ -#ifndef __PPMD_H -#define __PPMD_H +#ifndef ZIP7_INC_PPMD_H +#define ZIP7_INC_PPMD_H #include "CpuArch.h" EXTERN_C_BEGIN -#ifdef MY_CPU_32BIT +#if defined(MY_CPU_SIZEOF_POINTER) && (MY_CPU_SIZEOF_POINTER == 4) +/* + PPMD code always uses 32-bit internal fields in PPMD structures to store internal references in main block. + if (PPMD_32BIT is defined), the PPMD code stores internal pointers to 32-bit reference fields. + if (PPMD_32BIT is NOT defined), the PPMD code stores internal UInt32 offsets to reference fields. + if (pointer size is 64-bit), then (PPMD_32BIT) mode is not allowed, + if (pointer size is 32-bit), then (PPMD_32BIT) mode is optional, + and it's allowed to disable PPMD_32BIT mode even if pointer is 32-bit. + PPMD code works slightly faster in (PPMD_32BIT) mode. +*/ #define PPMD_32BIT #endif @@ -28,7 +37,7 @@ EXTERN_C_BEGIN #define PPMD_N4 ((128 + 3 - 1 * PPMD_N1 - 2 * PPMD_N2 - 3 * PPMD_N3) / 4) #define PPMD_NUM_INDEXES (PPMD_N1 + PPMD_N2 + PPMD_N3 + PPMD_N4) -#pragma pack(push, 1) +MY_CPU_pragma_pack_push_1 /* Most compilers works OK here even without #pragma pack(push, 1), but some GCC compilers need it. 
*/ /* SEE-contexts for PPM-contexts with masked symbols */ @@ -39,42 +48,117 @@ typedef struct Byte Count; /* Count to next change of Shift */ } CPpmd_See; -#define Ppmd_See_Update(p) if ((p)->Shift < PPMD_PERIOD_BITS && --(p)->Count == 0) \ - { (p)->Summ <<= 1; (p)->Count = (Byte)(3 << (p)->Shift++); } +#define Ppmd_See_UPDATE(p) \ + { if ((p)->Shift < PPMD_PERIOD_BITS && --(p)->Count == 0) \ + { (p)->Summ = (UInt16)((p)->Summ << 1); \ + (p)->Count = (Byte)(3 << (p)->Shift++); }} + typedef struct { Byte Symbol; Byte Freq; - UInt16 SuccessorLow; - UInt16 SuccessorHigh; + UInt16 Successor_0; + UInt16 Successor_1; } CPpmd_State; -#pragma pack(pop) - -typedef - #ifdef PPMD_32BIT - CPpmd_State * - #else - UInt32 - #endif - CPpmd_State_Ref; - -typedef - #ifdef PPMD_32BIT - void * - #else - UInt32 - #endif - CPpmd_Void_Ref; - -typedef - #ifdef PPMD_32BIT - Byte * - #else - UInt32 - #endif - CPpmd_Byte_Ref; +typedef struct CPpmd_State2_ +{ + Byte Symbol; + Byte Freq; +} CPpmd_State2; + +typedef struct CPpmd_State4_ +{ + UInt16 Successor_0; + UInt16 Successor_1; +} CPpmd_State4; + +MY_CPU_pragma_pop + +/* + PPMD code can write full CPpmd_State structure data to CPpmd*_Context + at (byte offset = 2) instead of some fields of original CPpmd*_Context structure. + + If we use pointers to different types, but that point to shared + memory space, we can have aliasing problem (strict aliasing). + + XLC compiler in -O2 mode can change the order of memory write instructions + in relation to read instructions, if we have use pointers to different types. + + To solve that aliasing problem we use combined CPpmd*_Context structure + with unions that contain the fields from both structures: + the original CPpmd*_Context and CPpmd_State. + So we can access the fields from both structures via one pointer, + and the compiler doesn't change the order of write instructions + in relation to read instructions. + + If we don't use memory write instructions to shared memory in + some local code, and we use only reading instructions (read only), + then probably it's safe to use pointers to different types for reading. +*/ + + + +#ifdef PPMD_32BIT + + #define Ppmd_Ref_Type(type) type * + #define Ppmd_GetRef(p, ptr) (ptr) + #define Ppmd_GetPtr(p, ptr) (ptr) + #define Ppmd_GetPtr_Type(p, ptr, note_type) (ptr) + +#else + + #define Ppmd_Ref_Type(type) UInt32 + #define Ppmd_GetRef(p, ptr) ((UInt32)((Byte *)(ptr) - (p)->Base)) + #define Ppmd_GetPtr(p, offs) ((void *)((p)->Base + (offs))) + #define Ppmd_GetPtr_Type(p, offs, type) ((type *)Ppmd_GetPtr(p, offs)) + +#endif // PPMD_32BIT + + +typedef Ppmd_Ref_Type(CPpmd_State) CPpmd_State_Ref; +typedef Ppmd_Ref_Type(void) CPpmd_Void_Ref; +typedef Ppmd_Ref_Type(Byte) CPpmd_Byte_Ref; + + +/* +#ifdef MY_CPU_LE_UNALIGN +// the unaligned 32-bit access latency can be too large, if the data is not in L1 cache. +#define Ppmd_GET_SUCCESSOR(p) ((CPpmd_Void_Ref)*(const UInt32 *)(const void *)&(p)->Successor_0) +#define Ppmd_SET_SUCCESSOR(p, v) *(UInt32 *)(void *)(void *)&(p)->Successor_0 = (UInt32)(v) + +#else +*/ + +/* + We can write 16-bit halves to 32-bit (Successor) field in any selected order. + But the native order is more consistent way. + So we use the native order, if LE/BE order can be detected here at compile time. 
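/*
 * Sketch (not part of the upstream patch) of the reference scheme behind the
 * Ppmd_GetRef()/Ppmd_GetPtr() macros above: when PPMD_32BIT is not defined,
 * PPMd stores 32-bit byte offsets from a single Base pointer instead of raw
 * pointers, so every internal reference fits in 4 bytes even on 64-bit
 * targets.  Names here are illustrative only.
 */
#include <assert.h>
#include <stdlib.h>

typedef unsigned char Byte;
typedef unsigned int  UInt32;

typedef struct { Byte *Base; } Pool;

static UInt32 pool_ref(const Pool *p, const void *ptr)   /* pointer -> offset */
{
  return (UInt32)((const Byte *)ptr - p->Base);
}

static void *pool_ptr(const Pool *p, UInt32 offs)        /* offset -> pointer */
{
  return p->Base + offs;
}

int main(void)
{
  Pool p;
  p.Base = (Byte *)malloc(1024);
  if (!p.Base)
    return 1;
  {
    void *obj = p.Base + 120;          /* some object inside the pool */
    UInt32 r = pool_ref(&p, obj);      /* 4-byte reference, r == 120 */
    assert(pool_ptr(&p, r) == obj);    /* round-trips back to the pointer */
  }
  free(p.Base);
  return 0;
}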
+*/ + +#ifdef MY_CPU_BE + + #define Ppmd_GET_SUCCESSOR(p) \ + ( (CPpmd_Void_Ref) (((UInt32)(p)->Successor_0 << 16) | (p)->Successor_1) ) + + #define Ppmd_SET_SUCCESSOR(p, v) { \ + (p)->Successor_0 = (UInt16)(((UInt32)(v) >> 16) /* & 0xFFFF */); \ + (p)->Successor_1 = (UInt16)((UInt32)(v) /* & 0xFFFF */); } + +#else + + #define Ppmd_GET_SUCCESSOR(p) \ + ( (CPpmd_Void_Ref) ((p)->Successor_0 | ((UInt32)(p)->Successor_1 << 16)) ) + + #define Ppmd_SET_SUCCESSOR(p, v) { \ + (p)->Successor_0 = (UInt16)((UInt32)(v) /* & 0xFFFF */); \ + (p)->Successor_1 = (UInt16)(((UInt32)(v) >> 16) /* & 0xFFFF */); } + +#endif + +// #endif + #define PPMD_SetAllBitsIn256Bytes(p) \ { size_t z; for (z = 0; z < 256 / sizeof(p[0]); z += 8) { \ diff --git a/src/sdk/C/Ppmd7.c b/src/sdk/C/Ppmd7.c index 470aadc..efcc5d8 100644 --- a/src/sdk/C/Ppmd7.c +++ b/src/sdk/C/Ppmd7.c @@ -1,5 +1,5 @@ /* Ppmd7.c -- PPMdH codec -2018-07-04 : Igor Pavlov : Public domain +2023-09-07 : Igor Pavlov : Public domain This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ #include "Precomp.h" @@ -8,21 +8,23 @@ This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ #include "Ppmd7.h" -const Byte PPMD7_kExpEscape[16] = { 25, 14, 9, 7, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2 }; -static const UInt16 kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x5ABC, 0x6632, 0x6051}; +/* define PPMD7_ORDER_0_SUPPPORT to suport order-0 mode, unsupported by orignal PPMd var.H. code */ +// #define PPMD7_ORDER_0_SUPPPORT + +MY_ALIGN(16) +static const Byte PPMD7_kExpEscape[16] = { 25, 14, 9, 7, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2 }; +MY_ALIGN(16) +static const UInt16 PPMD7_kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x5ABC, 0x6632, 0x6051}; #define MAX_FREQ 124 #define UNIT_SIZE 12 #define U2B(nu) ((UInt32)(nu) * UNIT_SIZE) #define U2I(nu) (p->Units2Indx[(size_t)(nu) - 1]) -#define I2U(indx) (p->Indx2Units[indx]) +#define I2U(indx) ((unsigned)p->Indx2Units[indx]) +#define I2U_UInt16(indx) ((UInt16)p->Indx2Units[indx]) -#ifdef PPMD_32BIT - #define REF(ptr) (ptr) -#else - #define REF(ptr) ((UInt32)((Byte *)(ptr) - (p)->Base)) -#endif +#define REF(ptr) Ppmd_GetRef(p, ptr) #define STATS_REF(ptr) ((CPpmd_State_Ref)REF(ptr)) @@ -31,17 +33,11 @@ static const UInt16 kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x #define ONE_STATE(ctx) Ppmd7Context_OneState(ctx) #define SUFFIX(ctx) CTX((ctx)->Suffix) -typedef CPpmd7_Context * CTX_PTR; +typedef CPpmd7_Context * PPMD7_CTX_PTR; struct CPpmd7_Node_; -typedef - #ifdef PPMD_32BIT - struct CPpmd7_Node_ * - #else - UInt32 - #endif - CPpmd7_Node_Ref; +typedef Ppmd_Ref_Type(struct CPpmd7_Node_) CPpmd7_Node_Ref; typedef struct CPpmd7_Node_ { @@ -51,17 +47,13 @@ typedef struct CPpmd7_Node_ CPpmd7_Node_Ref Prev; } CPpmd7_Node; -#ifdef PPMD_32BIT - #define NODE(ptr) (ptr) -#else - #define NODE(offs) ((CPpmd7_Node *)(p->Base + (offs))) -#endif +#define NODE(r) Ppmd_GetPtr_Type(p, r, CPpmd7_Node) void Ppmd7_Construct(CPpmd7 *p) { unsigned i, k, m; - p->Base = 0; + p->Base = NULL; for (i = 0, k = 0; i < PPMD_NUM_INDEXES; i++) { @@ -77,6 +69,7 @@ void Ppmd7_Construct(CPpmd7 *p) for (i = 0; i < 3; i++) p->NS2Indx[i] = (Byte)i; + for (m = i, k = 1; i < 256; i++) { p->NS2Indx[i] = (Byte)m; @@ -84,183 +77,245 @@ void Ppmd7_Construct(CPpmd7 *p) k = (++m) - 2; } - memset(p->HB2Flag, 0, 0x40); - memset(p->HB2Flag + 0x40, 8, 0x100 - 0x40); + memcpy(p->ExpEscape, PPMD7_kExpEscape, 16); } + void Ppmd7_Free(CPpmd7 *p, ISzAllocPtr alloc) { ISzAlloc_Free(alloc, p->Base); p->Size = 
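/*
 * Sketch (not part of the upstream patch) of the little-endian variant of the
 * Ppmd_GET_SUCCESSOR / Ppmd_SET_SUCCESSOR macros above: the 32-bit successor
 * reference is stored as two UInt16 halves so the packed CPpmd_State layout
 * needs no unaligned 32-bit access.  Struct and field names mirror the header
 * for readability only.
 */
#include <assert.h>

typedef unsigned short UInt16;
typedef unsigned int   UInt32;

typedef struct { UInt16 Successor_0, Successor_1; } State;

static void set_successor(State *s, UInt32 v)
{
  s->Successor_0 = (UInt16)(v & 0xFFFF);          /* low half  */
  s->Successor_1 = (UInt16)(v >> 16);             /* high half */
}

static UInt32 get_successor(const State *s)
{
  return (UInt32)s->Successor_0 | ((UInt32)s->Successor_1 << 16);
}

int main(void)
{
  State s;
  set_successor(&s, 0x12345678u);
  assert(get_successor(&s) == 0x12345678u);       /* lossless round trip */
  return 0;
}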
0; - p->Base = 0; + p->Base = NULL; } + BoolInt Ppmd7_Alloc(CPpmd7 *p, UInt32 size, ISzAllocPtr alloc) { if (!p->Base || p->Size != size) { - size_t size2; Ppmd7_Free(p, alloc); - size2 = 0 - #ifndef PPMD_32BIT - + UNIT_SIZE - #endif - ; - p->AlignOffset = - #ifdef PPMD_32BIT - (4 - size) & 3; - #else - 4 - (size & 3); - #endif - if ((p->Base = (Byte *)ISzAlloc_Alloc(alloc, p->AlignOffset + size + size2)) == 0) + p->AlignOffset = (4 - size) & 3; + if ((p->Base = (Byte *)ISzAlloc_Alloc(alloc, p->AlignOffset + size)) == NULL) return False; p->Size = size; } return True; } -static void InsertNode(CPpmd7 *p, void *node, unsigned indx) + + +// ---------- Internal Memory Allocator ---------- + +/* We can use CPpmd7_Node in list of free units (as in Ppmd8) + But we still need one additional list walk pass in Ppmd7_GlueFreeBlocks(). + So we use simple CPpmd_Void_Ref instead of CPpmd7_Node in Ppmd7_InsertNode() / Ppmd7_RemoveNode() +*/ + +#define EMPTY_NODE 0 + + +static void Ppmd7_InsertNode(CPpmd7 *p, void *node, unsigned indx) { *((CPpmd_Void_Ref *)node) = p->FreeList[indx]; + // ((CPpmd7_Node *)node)->Next = (CPpmd7_Node_Ref)p->FreeList[indx]; + p->FreeList[indx] = REF(node); + } -static void *RemoveNode(CPpmd7 *p, unsigned indx) + +static void *Ppmd7_RemoveNode(CPpmd7 *p, unsigned indx) { CPpmd_Void_Ref *node = (CPpmd_Void_Ref *)Ppmd7_GetPtr(p, p->FreeList[indx]); p->FreeList[indx] = *node; + // CPpmd7_Node *node = NODE((CPpmd7_Node_Ref)p->FreeList[indx]); + // p->FreeList[indx] = node->Next; return node; } -static void SplitBlock(CPpmd7 *p, void *ptr, unsigned oldIndx, unsigned newIndx) + +static void Ppmd7_SplitBlock(CPpmd7 *p, void *ptr, unsigned oldIndx, unsigned newIndx) { unsigned i, nu = I2U(oldIndx) - I2U(newIndx); ptr = (Byte *)ptr + U2B(I2U(newIndx)); if (I2U(i = U2I(nu)) != nu) { unsigned k = I2U(--i); - InsertNode(p, ((Byte *)ptr) + U2B(k), nu - k - 1); + Ppmd7_InsertNode(p, ((Byte *)ptr) + U2B(k), nu - k - 1); } - InsertNode(p, ptr, i); + Ppmd7_InsertNode(p, ptr, i); } -static void GlueFreeBlocks(CPpmd7 *p) + +/* we use CPpmd7_Node_Union union to solve XLC -O2 strict pointer aliasing problem */ + +typedef union { - #ifdef PPMD_32BIT - CPpmd7_Node headItem; - CPpmd7_Node_Ref head = &headItem; - #else - CPpmd7_Node_Ref head = p->AlignOffset + p->Size; - #endif - - CPpmd7_Node_Ref n = head; - unsigned i; + CPpmd7_Node Node; + CPpmd7_Node_Ref NextRef; +} CPpmd7_Node_Union; + +/* Original PPmdH (Ppmd7) code uses doubly linked list in Ppmd7_GlueFreeBlocks() + we use single linked list similar to Ppmd8 code */ + +static void Ppmd7_GlueFreeBlocks(CPpmd7 *p) +{ + /* + we use first UInt16 field of 12-bytes UNITs as record type stamp + CPpmd_State { Byte Symbol; Byte Freq; : Freq != 0 + CPpmd7_Context { UInt16 NumStats; : NumStats != 0 + CPpmd7_Node { UInt16 Stamp : Stamp == 0 for free record + : Stamp == 1 for head record and guard + Last 12-bytes UNIT in array is always contains 12-bytes order-0 CPpmd7_Context record. + */ + CPpmd7_Node_Ref head, n = 0; + p->GlueCount = 255; - /* create doubly-linked list of free blocks */ - for (i = 0; i < PPMD_NUM_INDEXES; i++) + + /* we set guard NODE at LoUnit */ + if (p->LoUnit != p->HiUnit) + ((CPpmd7_Node *)(void *)p->LoUnit)->Stamp = 1; + { - UInt16 nu = I2U(i); - CPpmd7_Node_Ref next = (CPpmd7_Node_Ref)p->FreeList[i]; - p->FreeList[i] = 0; - while (next != 0) + /* Create list of free blocks. + We still need one additional list walk pass before Glue. 
*/ + unsigned i; + for (i = 0; i < PPMD_NUM_INDEXES; i++) { - CPpmd7_Node *node = NODE(next); - node->Next = n; - n = NODE(n)->Prev = next; - next = *(const CPpmd7_Node_Ref *)node; - node->Stamp = 0; - node->NU = (UInt16)nu; + const UInt16 nu = I2U_UInt16(i); + CPpmd7_Node_Ref next = (CPpmd7_Node_Ref)p->FreeList[i]; + p->FreeList[i] = 0; + while (next != 0) + { + /* Don't change the order of the following commands: */ + CPpmd7_Node_Union *un = (CPpmd7_Node_Union *)NODE(next); + const CPpmd7_Node_Ref tmp = next; + next = un->NextRef; + un->Node.Stamp = EMPTY_NODE; + un->Node.NU = nu; + un->Node.Next = n; + n = tmp; + } } } - NODE(head)->Stamp = 1; - NODE(head)->Next = n; - NODE(n)->Prev = head; - if (p->LoUnit != p->HiUnit) - ((CPpmd7_Node *)p->LoUnit)->Stamp = 1; - - /* Glue free blocks */ - while (n != head) + + head = n; + /* Glue and Fill must walk the list in same direction */ { - CPpmd7_Node *node = NODE(n); - UInt32 nu = (UInt32)node->NU; - for (;;) + /* Glue free blocks */ + CPpmd7_Node_Ref *prev = &head; + while (n) { - CPpmd7_Node *node2 = NODE(n) + nu; - nu += node2->NU; - if (node2->Stamp != 0 || nu >= 0x10000) - break; - NODE(node2->Prev)->Next = node2->Next; - NODE(node2->Next)->Prev = node2->Prev; - node->NU = (UInt16)nu; + CPpmd7_Node *node = NODE(n); + UInt32 nu = node->NU; + n = node->Next; + if (nu == 0) + { + *prev = n; + continue; + } + prev = &node->Next; + for (;;) + { + CPpmd7_Node *node2 = node + nu; + nu += node2->NU; + if (node2->Stamp != EMPTY_NODE || nu >= 0x10000) + break; + node->NU = (UInt16)nu; + node2->NU = 0; + } } - n = node->Next; } - + /* Fill lists of free blocks */ - for (n = NODE(head)->Next; n != head;) + for (n = head; n != 0;) { CPpmd7_Node *node = NODE(n); - unsigned nu; - CPpmd7_Node_Ref next = node->Next; - for (nu = node->NU; nu > 128; nu -= 128, node += 128) - InsertNode(p, node, PPMD_NUM_INDEXES - 1); + UInt32 nu = node->NU; + unsigned i; + n = node->Next; + if (nu == 0) + continue; + for (; nu > 128; nu -= 128, node += 128) + Ppmd7_InsertNode(p, node, PPMD_NUM_INDEXES - 1); if (I2U(i = U2I(nu)) != nu) { unsigned k = I2U(--i); - InsertNode(p, node + k, nu - k - 1); + Ppmd7_InsertNode(p, node + k, (unsigned)nu - k - 1); } - InsertNode(p, node, i); - n = next; + Ppmd7_InsertNode(p, node, i); } } -static void *AllocUnitsRare(CPpmd7 *p, unsigned indx) + +Z7_NO_INLINE +static void *Ppmd7_AllocUnitsRare(CPpmd7 *p, unsigned indx) { unsigned i; - void *retVal; + if (p->GlueCount == 0) { - GlueFreeBlocks(p); + Ppmd7_GlueFreeBlocks(p); if (p->FreeList[indx] != 0) - return RemoveNode(p, indx); + return Ppmd7_RemoveNode(p, indx); } + i = indx; + do { if (++i == PPMD_NUM_INDEXES) { UInt32 numBytes = U2B(I2U(indx)); + Byte *us = p->UnitsStart; p->GlueCount--; - return ((UInt32)(p->UnitsStart - p->Text) > numBytes) ? (p->UnitsStart -= numBytes) : (NULL); + return ((UInt32)(us - p->Text) > numBytes) ? 
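/*
 * Sketch (not part of the upstream patch) showing what the MEM_12_CPY macro
 * above does: PPMd allocates memory in 12-byte UNITs, and the macro copies
 * "num" such units as three 32-bit words per unit.  The comparison against a
 * plain memcpy of num * 12 bytes only illustrates the equivalence.
 */
#include <assert.h>
#include <string.h>

typedef unsigned int UInt32;
#define UNIT_SIZE 12

static void mem_12_cpy(void *dest, const void *src, unsigned num)
{
  UInt32 *d = (UInt32 *)dest;
  const UInt32 *z = (const UInt32 *)src;
  do
  {
    d[0] = z[0];
    d[1] = z[1];
    d[2] = z[2];                       /* one 12-byte unit = three 32-bit words */
    z += 3;
    d += 3;
  }
  while (--num);
}

int main(void)
{
  /* buffers sized and aligned for whole units */
  static UInt32 src[3 * 4], a[3 * 4], b[3 * 4];
  unsigned i;
  for (i = 0; i < 3 * 4; i++)
    src[i] = i * 0x01010101u;
  mem_12_cpy(a, src, 4);
  memcpy(b, src, 4 * UNIT_SIZE);
  assert(memcmp(a, b, sizeof(a)) == 0);
  return 0;
}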
(p->UnitsStart = us - numBytes) : NULL; } } while (p->FreeList[i] == 0); - retVal = RemoveNode(p, i); - SplitBlock(p, retVal, i, indx); - return retVal; + + { + void *block = Ppmd7_RemoveNode(p, i); + Ppmd7_SplitBlock(p, block, i, indx); + return block; + } } -static void *AllocUnits(CPpmd7 *p, unsigned indx) + +static void *Ppmd7_AllocUnits(CPpmd7 *p, unsigned indx) { - UInt32 numBytes; if (p->FreeList[indx] != 0) - return RemoveNode(p, indx); - numBytes = U2B(I2U(indx)); - if (numBytes <= (UInt32)(p->HiUnit - p->LoUnit)) + return Ppmd7_RemoveNode(p, indx); { - void *retVal = p->LoUnit; - p->LoUnit += numBytes; - return retVal; + UInt32 numBytes = U2B(I2U(indx)); + Byte *lo = p->LoUnit; + if ((UInt32)(p->HiUnit - lo) >= numBytes) + { + p->LoUnit = lo + numBytes; + return lo; + } } - return AllocUnitsRare(p, indx); + return Ppmd7_AllocUnitsRare(p, indx); } -#define MyMem12Cpy(dest, src, num) \ - { UInt32 *d = (UInt32 *)dest; const UInt32 *s = (const UInt32 *)src; UInt32 n = num; \ - do { d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; s += 3; d += 3; } while (--n); } +#define MEM_12_CPY(dest, src, num) \ + { UInt32 *d = (UInt32 *)(dest); \ + const UInt32 *z = (const UInt32 *)(src); \ + unsigned n = (num); \ + do { \ + d[0] = z[0]; \ + d[1] = z[1]; \ + d[2] = z[2]; \ + z += 3; \ + d += 3; \ + } while (--n); \ + } + + +/* static void *ShrinkUnits(CPpmd7 *p, void *oldPtr, unsigned oldNU, unsigned newNU) { unsigned i0 = U2I(oldNU); @@ -269,28 +324,33 @@ static void *ShrinkUnits(CPpmd7 *p, void *oldPtr, unsigned oldNU, unsigned newNU return oldPtr; if (p->FreeList[i1] != 0) { - void *ptr = RemoveNode(p, i1); - MyMem12Cpy(ptr, oldPtr, newNU); - InsertNode(p, oldPtr, i0); + void *ptr = Ppmd7_RemoveNode(p, i1); + MEM_12_CPY(ptr, oldPtr, newNU) + Ppmd7_InsertNode(p, oldPtr, i0); return ptr; } - SplitBlock(p, oldPtr, i0, i1); + Ppmd7_SplitBlock(p, oldPtr, i0, i1); return oldPtr; } +*/ -#define SUCCESSOR(p) ((CPpmd_Void_Ref)((p)->SuccessorLow | ((UInt32)(p)->SuccessorHigh << 16))) +#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) static void SetSuccessor(CPpmd_State *p, CPpmd_Void_Ref v) { - (p)->SuccessorLow = (UInt16)((UInt32)(v) & 0xFFFF); - (p)->SuccessorHigh = (UInt16)(((UInt32)(v) >> 16) & 0xFFFF); + Ppmd_SET_SUCCESSOR(p, v) } -static void RestartModel(CPpmd7 *p) + + +Z7_NO_INLINE +static +void Ppmd7_RestartModel(CPpmd7 *p) { - unsigned i, k, m; + unsigned i, k; memset(p->FreeList, 0, sizeof(p->FreeList)); + p->Text = p->Base + p->AlignOffset; p->HiUnit = p->Text + p->Size; p->LoUnit = p->UnitsStart = p->HiUnit - p->Size / 8 / UNIT_SIZE * 7 * UNIT_SIZE; @@ -300,57 +360,110 @@ static void RestartModel(CPpmd7 *p) p->RunLength = p->InitRL = -(Int32)((p->MaxOrder < 12) ? 
p->MaxOrder : 12) - 1; p->PrevSuccess = 0; - p->MinContext = p->MaxContext = (CTX_PTR)(p->HiUnit -= UNIT_SIZE); /* AllocContext(p); */ - p->MinContext->Suffix = 0; - p->MinContext->NumStats = 256; - p->MinContext->SummFreq = 256 + 1; - p->FoundState = (CPpmd_State *)p->LoUnit; /* AllocUnits(p, PPMD_NUM_INDEXES - 1); */ - p->LoUnit += U2B(256 / 2); - p->MinContext->Stats = REF(p->FoundState); - for (i = 0; i < 256; i++) { - CPpmd_State *s = &p->FoundState[i]; - s->Symbol = (Byte)i; - s->Freq = 1; - SetSuccessor(s, 0); + CPpmd7_Context *mc = (PPMD7_CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); /* AllocContext(p); */ + CPpmd_State *s = (CPpmd_State *)p->LoUnit; /* Ppmd7_AllocUnits(p, PPMD_NUM_INDEXES - 1); */ + + p->LoUnit += U2B(256 / 2); + p->MaxContext = p->MinContext = mc; + p->FoundState = s; + + mc->NumStats = 256; + mc->Union2.SummFreq = 256 + 1; + mc->Union4.Stats = REF(s); + mc->Suffix = 0; + + for (i = 0; i < 256; i++, s++) + { + s->Symbol = (Byte)i; + s->Freq = 1; + SetSuccessor(s, 0); + } + + #ifdef PPMD7_ORDER_0_SUPPPORT + if (p->MaxOrder == 0) + { + CPpmd_Void_Ref r = REF(mc); + s = p->FoundState; + for (i = 0; i < 256; i++, s++) + SetSuccessor(s, r); + return; + } + #endif } for (i = 0; i < 128; i++) + + + for (k = 0; k < 8; k++) { + unsigned m; UInt16 *dest = p->BinSumm[i] + k; - UInt16 val = (UInt16)(PPMD_BIN_SCALE - kInitBinEsc[k] / (i + 2)); + const UInt16 val = (UInt16)(PPMD_BIN_SCALE - PPMD7_kInitBinEsc[k] / (i + 2)); for (m = 0; m < 64; m += 8) dest[m] = val; } - + + for (i = 0; i < 25; i++) - for (k = 0; k < 16; k++) + { + + CPpmd_See *s = p->See[i]; + + + + unsigned summ = ((5 * i + 10) << (PPMD_PERIOD_BITS - 4)); + for (k = 0; k < 16; k++, s++) { - CPpmd_See *s = &p->See[i][k]; - s->Summ = (UInt16)((5 * i + 10) << (s->Shift = PPMD_PERIOD_BITS - 4)); + s->Summ = (UInt16)summ; + s->Shift = (PPMD_PERIOD_BITS - 4); s->Count = 4; } + } + + p->DummySee.Summ = 0; /* unused */ + p->DummySee.Shift = PPMD_PERIOD_BITS; + p->DummySee.Count = 64; /* unused */ } + void Ppmd7_Init(CPpmd7 *p, unsigned maxOrder) { p->MaxOrder = maxOrder; - RestartModel(p); - p->DummySee.Shift = PPMD_PERIOD_BITS; - p->DummySee.Summ = 0; /* unused */ - p->DummySee.Count = 64; /* unused */ + + Ppmd7_RestartModel(p); } -static CTX_PTR CreateSuccessors(CPpmd7 *p, BoolInt skip) + + +/* + Ppmd7_CreateSuccessors() + It's called when (FoundState->Successor) is RAW-Successor, + that is the link to position in Raw text. + So we create Context records and write the links to + FoundState->Successor and to identical RAW-Successors in suffix + contexts of MinContex. 
+ + The function returns: + if (OrderFall == 0) then MinContext is already at MAX order, + { return pointer to new or existing context of same MAX order } + else + { return pointer to new real context that will be (Order+1) in comparison with MinContext + + also it can return pointer to real context of same order, +*/ + +Z7_NO_INLINE +static PPMD7_CTX_PTR Ppmd7_CreateSuccessors(CPpmd7 *p) { - CPpmd_State upState; - CTX_PTR c = p->MinContext; + PPMD7_CTX_PTR c = p->MinContext; CPpmd_Byte_Ref upBranch = (CPpmd_Byte_Ref)SUCCESSOR(p->FoundState); - CPpmd_State *ps[PPMD7_MAX_ORDER]; + Byte newSym, newFreq; unsigned numPs = 0; - - if (!skip) + CPpmd_State *ps[PPMD7_MAX_ORDER]; + + if (p->OrderFall != 0) ps[numPs++] = p->FoundState; while (c->Suffix) @@ -358,54 +471,83 @@ static CTX_PTR CreateSuccessors(CPpmd7 *p, BoolInt skip) CPpmd_Void_Ref successor; CPpmd_State *s; c = SUFFIX(c); + + if (c->NumStats != 1) { - for (s = STATS(c); s->Symbol != p->FoundState->Symbol; s++); + Byte sym = p->FoundState->Symbol; + for (s = STATS(c); s->Symbol != sym; s++); + } else + { s = ONE_STATE(c); + + } successor = SUCCESSOR(s); if (successor != upBranch) { + // (c) is real record Context here, c = CTX(successor); if (numPs == 0) + { + // (c) is real record MAX Order Context here, + // So we don't need to create any new contexts. return c; + } break; } ps[numPs++] = s; } - upState.Symbol = *(const Byte *)Ppmd7_GetPtr(p, upBranch); - SetSuccessor(&upState, upBranch + 1); + // All created contexts will have single-symbol with new RAW-Successor + // All new RAW-Successors will point to next position in RAW text + // after FoundState->Successor + + newSym = *(const Byte *)Ppmd7_GetPtr(p, upBranch); + upBranch++; + if (c->NumStats == 1) - upState.Freq = ONE_STATE(c)->Freq; + newFreq = ONE_STATE(c)->Freq; else { UInt32 cf, s0; CPpmd_State *s; - for (s = STATS(c); s->Symbol != upState.Symbol; s++); - cf = s->Freq - 1; - s0 = c->SummFreq - c->NumStats - cf; - upState.Freq = (Byte)(1 + ((2 * cf <= s0) ? (5 * cf > s0) : ((2 * cf + 3 * s0 - 1) / (2 * s0)))); + for (s = STATS(c); s->Symbol != newSym; s++); + cf = (UInt32)s->Freq - 1; + s0 = (UInt32)c->Union2.SummFreq - c->NumStats - cf; + /* + cf - is frequency of symbol that will be Successor in new context records. + s0 - is commulative frequency sum of another symbols from parent context. + max(newFreq)= (s->Freq + 1), when (s0 == 1) + we have requirement (Ppmd7Context_OneState()->Freq <= 128) in BinSumm[] + so (s->Freq < 128) - is requirement for multi-symbol contexts + */ + newFreq = (Byte)(1 + ((2 * cf <= s0) ? 
(5 * cf > s0) : (2 * cf + s0 - 1) / (2 * s0) + 1)); } + // Create new single-symbol contexts from low order to high order in loop + do { - /* Create Child */ - CTX_PTR c1; /* = AllocContext(p); */ + PPMD7_CTX_PTR c1; + /* = AllocContext(p); */ if (p->HiUnit != p->LoUnit) - c1 = (CTX_PTR)(p->HiUnit -= UNIT_SIZE); + c1 = (PPMD7_CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); else if (p->FreeList[0] != 0) - c1 = (CTX_PTR)RemoveNode(p, 0); + c1 = (PPMD7_CTX_PTR)Ppmd7_RemoveNode(p, 0); else { - c1 = (CTX_PTR)AllocUnitsRare(p, 0); + c1 = (PPMD7_CTX_PTR)Ppmd7_AllocUnitsRare(p, 0); if (!c1) return NULL; } + c1->NumStats = 1; - *ONE_STATE(c1) = upState; + ONE_STATE(c1)->Symbol = newSym; + ONE_STATE(c1)->Freq = newFreq; + SetSuccessor(ONE_STATE(c1), upBranch); c1->Suffix = REF(c); SetSuccessor(ps[--numPs], REF(c1)); c = c1; @@ -415,21 +557,26 @@ static CTX_PTR CreateSuccessors(CPpmd7 *p, BoolInt skip) return c; } -static void SwapStates(CPpmd_State *t1, CPpmd_State *t2) -{ - CPpmd_State tmp = *t1; - *t1 = *t2; - *t2 = tmp; -} -static void UpdateModel(CPpmd7 *p) + +#define SWAP_STATES(s) \ + { CPpmd_State tmp = s[0]; s[0] = s[-1]; s[-1] = tmp; } + + +void Ppmd7_UpdateModel(CPpmd7 *p); +Z7_NO_INLINE +void Ppmd7_UpdateModel(CPpmd7 *p) { - CPpmd_Void_Ref successor, fSuccessor = SUCCESSOR(p->FoundState); - CTX_PTR c; + CPpmd_Void_Ref maxSuccessor, minSuccessor; + PPMD7_CTX_PTR c, mc; unsigned s0, ns; - + + + if (p->FoundState->Freq < MAX_FREQ / 4 && p->MinContext->Suffix != 0) { + /* Update Freqs in Suffix Context */ + c = SUFFIX(p->MinContext); if (c->NumStats == 1) @@ -441,166 +588,273 @@ static void UpdateModel(CPpmd7 *p) else { CPpmd_State *s = STATS(c); - if (s->Symbol != p->FoundState->Symbol) + Byte sym = p->FoundState->Symbol; + + if (s->Symbol != sym) { - do { s++; } while (s->Symbol != p->FoundState->Symbol); + do + { + // s++; if (s->Symbol == sym) break; + s++; + } + while (s->Symbol != sym); + if (s[0].Freq >= s[-1].Freq) { - SwapStates(&s[0], &s[-1]); + SWAP_STATES(s) s--; } } + if (s->Freq < MAX_FREQ - 9) { - s->Freq += 2; - c->SummFreq += 2; + s->Freq = (Byte)(s->Freq + 2); + c->Union2.SummFreq = (UInt16)(c->Union2.SummFreq + 2); } } } + if (p->OrderFall == 0) { - p->MinContext = p->MaxContext = CreateSuccessors(p, True); - if (p->MinContext == 0) + /* MAX ORDER context */ + /* (FoundState->Successor) is RAW-Successor. */ + p->MaxContext = p->MinContext = Ppmd7_CreateSuccessors(p); + if (!p->MinContext) { - RestartModel(p); + Ppmd7_RestartModel(p); return; } SetSuccessor(p->FoundState, REF(p->MinContext)); return; } + + + /* NON-MAX ORDER context */ - *p->Text++ = p->FoundState->Symbol; - successor = REF(p->Text); - if (p->Text >= p->UnitsStart) { - RestartModel(p); - return; + Byte *text = p->Text; + *text++ = p->FoundState->Symbol; + p->Text = text; + if (text >= p->UnitsStart) + { + Ppmd7_RestartModel(p); + return; + } + maxSuccessor = REF(text); } - if (fSuccessor) + minSuccessor = SUCCESSOR(p->FoundState); + + if (minSuccessor) { - if (fSuccessor <= successor) + // there is Successor for FoundState in MinContext. + // So the next context will be one order higher than MinContext. + + if (minSuccessor <= maxSuccessor) { - CTX_PTR cs = CreateSuccessors(p, False); - if (cs == NULL) + // minSuccessor is RAW-Successor. 
So we will create real contexts records: + PPMD7_CTX_PTR cs = Ppmd7_CreateSuccessors(p); + if (!cs) { - RestartModel(p); + Ppmd7_RestartModel(p); return; } - fSuccessor = REF(cs); + minSuccessor = REF(cs); } + + // minSuccessor now is real Context pointer that points to existing (Order+1) context + if (--p->OrderFall == 0) { - successor = fSuccessor; + /* + if we move to MaxOrder context, then minSuccessor will be common Succesor for both: + MinContext that is (MaxOrder - 1) + MaxContext that is (MaxOrder) + so we don't need new RAW-Successor, and we can use real minSuccessor + as succssors for both MinContext and MaxContext. + */ + maxSuccessor = minSuccessor; + + /* + if (MaxContext != MinContext) + { + there was order fall from MaxOrder and we don't need current symbol + to transfer some RAW-Succesors to real contexts. + So we roll back pointer in raw data for one position. + } + */ p->Text -= (p->MaxContext != p->MinContext); } } else { - SetSuccessor(p->FoundState, successor); - fSuccessor = REF(p->MinContext); + /* + FoundState has NULL-Successor here. + And only root 0-order context can contain NULL-Successors. + We change Successor in FoundState to RAW-Successor, + And next context will be same 0-order root Context. + */ + SetSuccessor(p->FoundState, maxSuccessor); + minSuccessor = REF(p->MinContext); } - - s0 = p->MinContext->SummFreq - (ns = p->MinContext->NumStats) - (p->FoundState->Freq - 1); - - for (c = p->MaxContext; c != p->MinContext; c = SUFFIX(c)) + + mc = p->MinContext; + c = p->MaxContext; + + p->MaxContext = p->MinContext = CTX(minSuccessor); + + if (c == mc) + return; + + // s0 : is pure Escape Freq + s0 = mc->Union2.SummFreq - (ns = mc->NumStats) - ((unsigned)p->FoundState->Freq - 1); + + do { unsigned ns1; - UInt32 cf, sf; + UInt32 sum; + if ((ns1 = c->NumStats) != 1) { if ((ns1 & 1) == 0) { /* Expand for one UNIT */ - unsigned oldNU = ns1 >> 1; - unsigned i = U2I(oldNU); + const unsigned oldNU = ns1 >> 1; + const unsigned i = U2I(oldNU); if (i != U2I((size_t)oldNU + 1)) { - void *ptr = AllocUnits(p, i + 1); + void *ptr = Ppmd7_AllocUnits(p, i + 1); void *oldPtr; if (!ptr) { - RestartModel(p); + Ppmd7_RestartModel(p); return; } oldPtr = STATS(c); - MyMem12Cpy(ptr, oldPtr, oldNU); - InsertNode(p, oldPtr, i); - c->Stats = STATS_REF(ptr); + MEM_12_CPY(ptr, oldPtr, oldNU) + Ppmd7_InsertNode(p, oldPtr, i); + c->Union4.Stats = STATS_REF(ptr); } } - c->SummFreq = (UInt16)(c->SummFreq + (2 * ns1 < ns) + 2 * ((4 * ns1 <= ns) & (c->SummFreq <= 8 * ns1))); + sum = c->Union2.SummFreq; + /* max increase of Escape_Freq is 3 here. + total increase of Union2.SummFreq for all symbols is less than 256 here */ + sum += (UInt32)(unsigned)((2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1))); + /* original PPMdH uses 16-bit variable for (sum) here. + But (sum < 0x9000). 
So we don't truncate (sum) to 16-bit */ + // sum = (UInt16)sum; } else { - CPpmd_State *s = (CPpmd_State*)AllocUnits(p, 0); + // instead of One-symbol context we create 2-symbol context + CPpmd_State *s = (CPpmd_State*)Ppmd7_AllocUnits(p, 0); if (!s) { - RestartModel(p); + Ppmd7_RestartModel(p); return; } - *s = *ONE_STATE(c); - c->Stats = REF(s); - if (s->Freq < MAX_FREQ / 4 - 1) - s->Freq <<= 1; - else - s->Freq = MAX_FREQ - 4; - c->SummFreq = (UInt16)(s->Freq + p->InitEsc + (ns > 3)); - } - cf = 2 * (UInt32)p->FoundState->Freq * (c->SummFreq + 6); - sf = (UInt32)s0 + c->SummFreq; - if (cf < 6 * sf) - { - cf = 1 + (cf > sf) + (cf >= 4 * sf); - c->SummFreq += 3; - } - else - { - cf = 4 + (cf >= 9 * sf) + (cf >= 12 * sf) + (cf >= 15 * sf); - c->SummFreq = (UInt16)(c->SummFreq + cf); + { + unsigned freq = c->Union2.State2.Freq; + // s = *ONE_STATE(c); + s->Symbol = c->Union2.State2.Symbol; + s->Successor_0 = c->Union4.State4.Successor_0; + s->Successor_1 = c->Union4.State4.Successor_1; + // SetSuccessor(s, c->Union4.Stats); // call it only for debug purposes to check the order of + // (Successor_0 and Successor_1) in LE/BE. + c->Union4.Stats = REF(s); + if (freq < MAX_FREQ / 4 - 1) + freq <<= 1; + else + freq = MAX_FREQ - 4; + // (max(s->freq) == 120), when we convert from 1-symbol into 2-symbol context + s->Freq = (Byte)freq; + // max(InitEsc = PPMD7_kExpEscape[*]) is 25. So the max(escapeFreq) is 26 here + sum = (UInt32)(freq + p->InitEsc + (ns > 3)); + } } + { CPpmd_State *s = STATS(c) + ns1; - SetSuccessor(s, successor); + UInt32 cf = 2 * (sum + 6) * (UInt32)p->FoundState->Freq; + UInt32 sf = (UInt32)s0 + sum; s->Symbol = p->FoundState->Symbol; - s->Freq = (Byte)cf; c->NumStats = (UInt16)(ns1 + 1); + SetSuccessor(s, maxSuccessor); + + if (cf < 6 * sf) + { + cf = (UInt32)1 + (cf > sf) + (cf >= 4 * sf); + sum += 3; + /* It can add (0, 1, 2) to Escape_Freq */ + } + else + { + cf = (UInt32)4 + (cf >= 9 * sf) + (cf >= 12 * sf) + (cf >= 15 * sf); + sum += cf; + } + + c->Union2.SummFreq = (UInt16)sum; + s->Freq = (Byte)cf; } + c = SUFFIX(c); } - p->MaxContext = p->MinContext = CTX(fSuccessor); + while (c != mc); } -static void Rescale(CPpmd7 *p) + + +Z7_NO_INLINE +static void Ppmd7_Rescale(CPpmd7 *p) { unsigned i, adder, sumFreq, escFreq; CPpmd_State *stats = STATS(p->MinContext); CPpmd_State *s = p->FoundState; + + /* Sort the list by Freq */ + if (s != stats) { CPpmd_State tmp = *s; - for (; s != stats; s--) + do s[0] = s[-1]; + while (--s != stats); *s = tmp; } - escFreq = p->MinContext->SummFreq - s->Freq; - s->Freq += 4; - adder = (p->OrderFall != 0); - s->Freq = (Byte)((s->Freq + adder) >> 1); + sumFreq = s->Freq; + escFreq = p->MinContext->Union2.SummFreq - sumFreq; + + /* + if (p->OrderFall == 0), adder = 0 : it's allowed to remove symbol from MAX Order context + if (p->OrderFall != 0), adder = 1 : it's NOT allowed to remove symbol from NON-MAX Order context + */ + + adder = (p->OrderFall != 0); + + #ifdef PPMD7_ORDER_0_SUPPPORT + adder |= (p->MaxOrder == 0); // we don't remove symbols from order-0 context + #endif + + sumFreq = (sumFreq + 4 + adder) >> 1; + i = (unsigned)p->MinContext->NumStats - 1; + s->Freq = (Byte)sumFreq; - i = p->MinContext->NumStats - 1; do { - escFreq -= (++s)->Freq; - s->Freq = (Byte)((s->Freq + adder) >> 1); - sumFreq += s->Freq; - if (s[0].Freq > s[-1].Freq) + unsigned freq = (++s)->Freq; + escFreq -= freq; + freq = (freq + adder) >> 1; + sumFreq += freq; + s->Freq = (Byte)freq; + if (freq > s[-1].Freq) { + CPpmd_State tmp = *s; CPpmd_State *s1 = s; - 
CPpmd_State tmp = *s1; do + { s1[0] = s1[-1]; - while (--s1 != stats && tmp.Freq > s1[-1].Freq); + } + while (--s1 != stats && freq > s1[-1].Freq); *s1 = tmp; } } @@ -608,48 +862,90 @@ static void Rescale(CPpmd7 *p) if (s->Freq == 0) { - unsigned numStats = p->MinContext->NumStats; - unsigned n0, n1; - do { i++; } while ((--s)->Freq == 0); + /* Remove all items with Freq == 0 */ + CPpmd7_Context *mc; + unsigned numStats, numStatsNew, n0, n1; + + i = 0; do { i++; } while ((--s)->Freq == 0); + + /* We increase (escFreq) for the number of removed symbols. + So we will have (0.5) increase for Escape_Freq in avarage per + removed symbol after Escape_Freq halving */ escFreq += i; - p->MinContext->NumStats = (UInt16)(p->MinContext->NumStats - i); - if (p->MinContext->NumStats == 1) + mc = p->MinContext; + numStats = mc->NumStats; + numStatsNew = numStats - i; + mc->NumStats = (UInt16)(numStatsNew); + n0 = (numStats + 1) >> 1; + + if (numStatsNew == 1) { - CPpmd_State tmp = *stats; + /* Create Single-Symbol context */ + unsigned freq = stats->Freq; + do { - tmp.Freq = (Byte)(tmp.Freq - (tmp.Freq >> 1)); escFreq >>= 1; + freq = (freq + 1) >> 1; } while (escFreq > 1); - InsertNode(p, stats, U2I(((numStats + 1) >> 1))); - *(p->FoundState = ONE_STATE(p->MinContext)) = tmp; + + s = ONE_STATE(mc); + *s = *stats; + s->Freq = (Byte)freq; // (freq <= 260 / 4) + p->FoundState = s; + Ppmd7_InsertNode(p, stats, U2I(n0)); return; } - n0 = (numStats + 1) >> 1; - n1 = (p->MinContext->NumStats + 1) >> 1; + + n1 = (numStatsNew + 1) >> 1; if (n0 != n1) - p->MinContext->Stats = STATS_REF(ShrinkUnits(p, stats, n0, n1)); + { + // p->MinContext->Union4.Stats = STATS_REF(ShrinkUnits(p, stats, n0, n1)); + unsigned i0 = U2I(n0); + unsigned i1 = U2I(n1); + if (i0 != i1) + { + if (p->FreeList[i1] != 0) + { + void *ptr = Ppmd7_RemoveNode(p, i1); + p->MinContext->Union4.Stats = STATS_REF(ptr); + MEM_12_CPY(ptr, (const void *)stats, n1) + Ppmd7_InsertNode(p, stats, i0); + } + else + Ppmd7_SplitBlock(p, stats, i0, i1); + } + } + } + { + CPpmd7_Context *mc = p->MinContext; + mc->Union2.SummFreq = (UInt16)(sumFreq + escFreq - (escFreq >> 1)); + // Escape_Freq halving here + p->FoundState = STATS(mc); } - p->MinContext->SummFreq = (UInt16)(sumFreq + escFreq - (escFreq >> 1)); - p->FoundState = STATS(p->MinContext); } + CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *escFreq) { CPpmd_See *see; - unsigned nonMasked = p->MinContext->NumStats - numMasked; - if (p->MinContext->NumStats != 256) + const CPpmd7_Context *mc = p->MinContext; + unsigned numStats = mc->NumStats; + if (numStats != 256) { - see = p->See[(unsigned)p->NS2Indx[(size_t)nonMasked - 1]] + - (nonMasked < (unsigned)SUFFIX(p->MinContext)->NumStats - p->MinContext->NumStats) + - 2 * (unsigned)(p->MinContext->SummFreq < 11 * p->MinContext->NumStats) + - 4 * (unsigned)(numMasked > nonMasked) + + unsigned nonMasked = numStats - numMasked; + see = p->See[(unsigned)p->NS2Indx[(size_t)nonMasked - 1]] + + (nonMasked < (unsigned)SUFFIX(mc)->NumStats - numStats) + + 2 * (unsigned)(mc->Union2.SummFreq < 11 * numStats) + + 4 * (unsigned)(numMasked > nonMasked) + p->HiBitsFlag; { - unsigned r = (see->Summ >> see->Shift); - see->Summ = (UInt16)(see->Summ - r); - *escFreq = r + (r == 0); + // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ + const unsigned summ = (UInt16)see->Summ; // & 0xFFFF + const unsigned r = (summ >> see->Shift); + see->Summ = (UInt16)(summ - r); + *escFreq = (UInt32)(r + (r == 0)); } } else @@ -660,53 +956,176 @@ 
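/*
 * Sketch (not part of the upstream patch) of the SEE escape estimator used in
 * Ppmd7_MakeEscFreq() above: the counter keeps a scaled sum (Summ) and a
 * shift; each query returns roughly the mean value (Summ >> Shift), subtracts
 * that estimate back out of the sum, and never reports zero.  Initial values
 * below are illustrative only.
 */
#include <stdio.h>

typedef struct
{
  unsigned short Summ;   /* scaled running sum of recent escape frequencies */
  unsigned char  Shift;  /* log2 of the approximate number of samples */
} See;

static unsigned see_query(See *s)
{
  unsigned summ = s->Summ;
  unsigned r = summ >> s->Shift;       /* current mean estimate */
  s->Summ = (unsigned short)(summ - r);
  return r + (r == 0);                 /* escape frequency is at least 1 */
}

int main(void)
{
  See s = { 10 << 3, 3 };              /* sum of 80 with Shift == 3 */
  printf("%u %u %u\n", see_query(&s), see_query(&s), see_query(&s));
  return 0;
}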
CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *escFreq) return see; } -static void NextContext(CPpmd7 *p) + +static void Ppmd7_NextContext(CPpmd7 *p) { - CTX_PTR c = CTX(SUCCESSOR(p->FoundState)); - if (p->OrderFall == 0 && (Byte *)c > p->Text) - p->MinContext = p->MaxContext = c; + PPMD7_CTX_PTR c = CTX(SUCCESSOR(p->FoundState)); + if (p->OrderFall == 0 && (const Byte *)c > p->Text) + p->MaxContext = p->MinContext = c; else - UpdateModel(p); + Ppmd7_UpdateModel(p); } + void Ppmd7_Update1(CPpmd7 *p) { CPpmd_State *s = p->FoundState; - s->Freq += 4; - p->MinContext->SummFreq += 4; - if (s[0].Freq > s[-1].Freq) + unsigned freq = s->Freq; + freq += 4; + p->MinContext->Union2.SummFreq = (UInt16)(p->MinContext->Union2.SummFreq + 4); + s->Freq = (Byte)freq; + if (freq > s[-1].Freq) { - SwapStates(&s[0], &s[-1]); + SWAP_STATES(s) p->FoundState = --s; - if (s->Freq > MAX_FREQ) - Rescale(p); + if (freq > MAX_FREQ) + Ppmd7_Rescale(p); } - NextContext(p); + Ppmd7_NextContext(p); } + void Ppmd7_Update1_0(CPpmd7 *p) { - p->PrevSuccess = (2 * p->FoundState->Freq > p->MinContext->SummFreq); - p->RunLength += p->PrevSuccess; - p->MinContext->SummFreq += 4; - if ((p->FoundState->Freq += 4) > MAX_FREQ) - Rescale(p); - NextContext(p); + CPpmd_State *s = p->FoundState; + CPpmd7_Context *mc = p->MinContext; + unsigned freq = s->Freq; + const unsigned summFreq = mc->Union2.SummFreq; + p->PrevSuccess = (2 * freq > summFreq); + p->RunLength += (Int32)p->PrevSuccess; + mc->Union2.SummFreq = (UInt16)(summFreq + 4); + freq += 4; + s->Freq = (Byte)freq; + if (freq > MAX_FREQ) + Ppmd7_Rescale(p); + Ppmd7_NextContext(p); } + +/* void Ppmd7_UpdateBin(CPpmd7 *p) { - p->FoundState->Freq = (Byte)(p->FoundState->Freq + (p->FoundState->Freq < 128 ? 1: 0)); + unsigned freq = p->FoundState->Freq; + p->FoundState->Freq = (Byte)(freq + (freq < 128)); p->PrevSuccess = 1; p->RunLength++; - NextContext(p); + Ppmd7_NextContext(p); } +*/ void Ppmd7_Update2(CPpmd7 *p) { - p->MinContext->SummFreq += 4; - if ((p->FoundState->Freq += 4) > MAX_FREQ) - Rescale(p); + CPpmd_State *s = p->FoundState; + unsigned freq = s->Freq; + freq += 4; p->RunLength = p->InitRL; - UpdateModel(p); + p->MinContext->Union2.SummFreq = (UInt16)(p->MinContext->Union2.SummFreq + 4); + s->Freq = (Byte)freq; + if (freq > MAX_FREQ) + Ppmd7_Rescale(p); + Ppmd7_UpdateModel(p); +} + + + +/* +PPMd Memory Map: +{ + [ 0 ] contains subset of original raw text, that is required to create context + records, Some symbols are not written, when max order context was reached + [ Text ] free area + [ UnitsStart ] CPpmd_State vectors and CPpmd7_Context records + [ LoUnit ] free area for CPpmd_State and CPpmd7_Context items +[ HiUnit ] CPpmd7_Context records + [ Size ] end of array } + +These addresses don't cross at any time. +And the following condtions is true for addresses: + (0 <= Text < UnitsStart <= LoUnit <= HiUnit <= Size) + +Raw text is BYTE--aligned. +the data in block [ UnitsStart ... Size ] contains 12-bytes aligned UNITs. + +Last UNIT of array at offset (Size - 12) is root order-0 CPpmd7_Context record. +The code can free UNITs memory blocks that were allocated to store CPpmd_State vectors. +The code doesn't free UNITs allocated for CPpmd7_Context records. + +The code calls Ppmd7_RestartModel(), when there is no free memory for allocation. +And Ppmd7_RestartModel() changes the state to orignal start state, with full free block. 
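/*
 * Sketch (not part of the upstream patch) of the self-organizing frequency
 * update in Ppmd7_Update1() above: the found symbol's frequency grows by 4
 * and its entry bubbles one slot toward the front of the stats array whenever
 * it overtakes its left neighbour, keeping the array roughly sorted by Freq.
 * The SummFreq bookkeeping and the rescale at MAX_FREQ are omitted here.
 */
#include <assert.h>

typedef struct { unsigned char Symbol; unsigned char Freq; } State;

/* Update the entry at index i; returns its (possibly new) index. */
static unsigned update1(State *stats, unsigned i)
{
  stats[i].Freq = (unsigned char)(stats[i].Freq + 4);
  if (i > 0 && stats[i].Freq > stats[i - 1].Freq)
  {
    State tmp = stats[i];              /* swap with the left neighbour */
    stats[i] = stats[i - 1];
    stats[i - 1] = tmp;
    return i - 1;
  }
  return i;
}

int main(void)
{
  State stats[3] = { { 'a', 10 }, { 'b', 8 }, { 'c', 2 } };
  unsigned i = update1(stats, 1);      /* 'b' becomes 12 > 10, moves forward */
  assert(i == 0 && stats[0].Symbol == 'b' && stats[0].Freq == 12);
  return 0;
}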
+ + +The code allocates UNITs with the following order: + +Allocation of 1 UNIT for Context record + - from free space (HiUnit) down to (LoUnit) + - from FreeList[0] + - Ppmd7_AllocUnitsRare() + +Ppmd7_AllocUnits() for CPpmd_State vectors: + - from FreeList[i] + - from free space (LoUnit) up to (HiUnit) + - Ppmd7_AllocUnitsRare() + +Ppmd7_AllocUnitsRare() + - if (GlueCount == 0) + { Glue lists, GlueCount = 255, allocate from FreeList[i]] } + - loop for all higher sized FreeList[...] lists + - from (UnitsStart - Text), GlueCount-- + - ERROR + + +Each Record with Context contains the CPpmd_State vector, where each +CPpmd_State contains the link to Successor. +There are 3 types of Successor: + 1) NULL-Successor - NULL pointer. NULL-Successor links can be stored + only in 0-order Root Context Record. + We use 0 value as NULL-Successor + 2) RAW-Successor - the link to position in raw text, + that "RAW-Successor" is being created after first + occurrence of new symbol for some existing context record. + (RAW-Successor > 0). + 3) RECORD-Successor - the link to CPpmd7_Context record of (Order+1), + that record is being created when we go via RAW-Successor again. + +For any successors at any time: the following condtions are true for Successor links: +(NULL-Successor < RAW-Successor < UnitsStart <= RECORD-Successor) + + +---------- Symbol Frequency, SummFreq and Range in Range_Coder ---------- + +CPpmd7_Context::SummFreq = Sum(Stats[].Freq) + Escape_Freq + +The PPMd code tries to fulfill the condition: + (SummFreq <= (256 * 128 = RC::kBot)) + +We have (Sum(Stats[].Freq) <= 256 * 124), because of (MAX_FREQ = 124) +So (4 = 128 - 124) is average reserve for Escape_Freq for each symbol. +If (CPpmd_State::Freq) is not aligned for 4, the reserve can be 5, 6 or 7. +SummFreq and Escape_Freq can be changed in Ppmd7_Rescale() and *Update*() functions. +Ppmd7_Rescale() can remove symbols only from max-order contexts. So Escape_Freq can increase after multiple calls of Ppmd7_Rescale() for +max-order context. + +When the PPMd code still break (Total <= RC::Range) condition in range coder, +we have two ways to resolve that problem: + 1) we can report error, if we want to keep compatibility with original PPMd code that has no fix for such cases. + 2) we can reduce (Total) value to (RC::Range) by reducing (Escape_Freq) part of (Total) value. +*/ + +#undef MAX_FREQ +#undef UNIT_SIZE +#undef U2B +#undef U2I +#undef I2U +#undef I2U_UInt16 +#undef REF +#undef STATS_REF +#undef CTX +#undef STATS +#undef ONE_STATE +#undef SUFFIX +#undef NODE +#undef EMPTY_NODE +#undef MEM_12_CPY +#undef SUCCESSOR +#undef SWAP_STATES diff --git a/src/sdk/C/Ppmd7.h b/src/sdk/C/Ppmd7.h index 610539a..d9eb326 100644 --- a/src/sdk/C/Ppmd7.h +++ b/src/sdk/C/Ppmd7.h @@ -1,13 +1,11 @@ -/* Ppmd7.h -- PPMdH compression codec -2018-07-04 : Igor Pavlov : Public domain -This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ - -/* This code supports virtual RangeDecoder and includes the implementation -of RangeCoder from 7z, instead of RangeCoder from original PPMd var.H. 
-If you need the compatibility with original PPMd var.H, you can use external RangeDecoder */ +/* Ppmd7.h -- Ppmd7 (PPMdH) compression codec +2023-04-02 : Igor Pavlov : Public domain +This code is based on: + PPMd var.H (2001): Dmitry Shkarin : Public domain */ + -#ifndef __PPMD7_H -#define __PPMD7_H +#ifndef ZIP7_INC_PPMD7_H +#define ZIP7_INC_PPMD7_H #include "Ppmd.h" @@ -21,23 +19,56 @@ EXTERN_C_BEGIN struct CPpmd7_Context_; -typedef - #ifdef PPMD_32BIT - struct CPpmd7_Context_ * - #else - UInt32 - #endif - CPpmd7_Context_Ref; +typedef Ppmd_Ref_Type(struct CPpmd7_Context_) CPpmd7_Context_Ref; + +// MY_CPU_pragma_pack_push_1 typedef struct CPpmd7_Context_ { UInt16 NumStats; - UInt16 SummFreq; - CPpmd_State_Ref Stats; + + + union + { + UInt16 SummFreq; + CPpmd_State2 State2; + } Union2; + + union + { + CPpmd_State_Ref Stats; + CPpmd_State4 State4; + } Union4; + CPpmd7_Context_Ref Suffix; } CPpmd7_Context; -#define Ppmd7Context_OneState(p) ((CPpmd_State *)&(p)->SummFreq) +// MY_CPU_pragma_pop + +#define Ppmd7Context_OneState(p) ((CPpmd_State *)&(p)->Union2) + + + + +typedef struct +{ + UInt32 Range; + UInt32 Code; + UInt32 Low; + IByteInPtr Stream; +} CPpmd7_RangeDec; + + +typedef struct +{ + UInt32 Range; + Byte Cache; + // Byte _dummy_[3]; + UInt64 Low; + UInt64 CacheSize; + IByteOutPtr Stream; +} CPpmd7z_RangeEnc; + typedef struct { @@ -48,17 +79,30 @@ typedef struct UInt32 Size; UInt32 GlueCount; - Byte *Base, *LoUnit, *HiUnit, *Text, *UnitsStart; UInt32 AlignOffset; + Byte *Base, *LoUnit, *HiUnit, *Text, *UnitsStart; - Byte Indx2Units[PPMD_NUM_INDEXES]; + + + + union + { + CPpmd7_RangeDec dec; + CPpmd7z_RangeEnc enc; + } rc; + + Byte Indx2Units[PPMD_NUM_INDEXES + 2]; // +2 for alignment Byte Units2Indx[128]; CPpmd_Void_Ref FreeList[PPMD_NUM_INDEXES]; - Byte NS2Indx[256], NS2BSIndx[256], HB2Flag[256]; + + Byte NS2BSIndx[256], NS2Indx[256]; + Byte ExpEscape[16]; CPpmd_See DummySee, See[25][16]; UInt16 BinSumm[128][64]; + // int LastSymbol; } CPpmd7; + void Ppmd7_Construct(CPpmd7 *p); BoolInt Ppmd7_Alloc(CPpmd7 *p, UInt32 size, ISzAllocPtr alloc); void Ppmd7_Free(CPpmd7 *p, ISzAllocPtr alloc); @@ -68,74 +112,69 @@ void Ppmd7_Init(CPpmd7 *p, unsigned maxOrder); /* ---------- Internal Functions ---------- */ -extern const Byte PPMD7_kExpEscape[16]; - -#ifdef PPMD_32BIT - #define Ppmd7_GetPtr(p, ptr) (ptr) - #define Ppmd7_GetContext(p, ptr) (ptr) - #define Ppmd7_GetStats(p, ctx) ((ctx)->Stats) -#else - #define Ppmd7_GetPtr(p, offs) ((void *)((p)->Base + (offs))) - #define Ppmd7_GetContext(p, offs) ((CPpmd7_Context *)Ppmd7_GetPtr((p), (offs))) - #define Ppmd7_GetStats(p, ctx) ((CPpmd_State *)Ppmd7_GetPtr((p), ((ctx)->Stats))) -#endif +#define Ppmd7_GetPtr(p, ptr) Ppmd_GetPtr(p, ptr) +#define Ppmd7_GetContext(p, ptr) Ppmd_GetPtr_Type(p, ptr, CPpmd7_Context) +#define Ppmd7_GetStats(p, ctx) Ppmd_GetPtr_Type(p, (ctx)->Union4.Stats, CPpmd_State) void Ppmd7_Update1(CPpmd7 *p); void Ppmd7_Update1_0(CPpmd7 *p); void Ppmd7_Update2(CPpmd7 *p); -void Ppmd7_UpdateBin(CPpmd7 *p); + +#define PPMD7_HiBitsFlag_3(sym) ((((unsigned)sym + 0xC0) >> (8 - 3)) & (1 << 3)) +#define PPMD7_HiBitsFlag_4(sym) ((((unsigned)sym + 0xC0) >> (8 - 4)) & (1 << 4)) +// #define PPMD7_HiBitsFlag_3(sym) ((sym) < 0x40 ? 0 : (1 << 3)) +// #define PPMD7_HiBitsFlag_4(sym) ((sym) < 0x40 ? 
0 : (1 << 4)) #define Ppmd7_GetBinSumm(p) \ - &p->BinSumm[(size_t)(unsigned)Ppmd7Context_OneState(p->MinContext)->Freq - 1][p->PrevSuccess + \ - p->NS2BSIndx[(size_t)Ppmd7_GetContext(p, p->MinContext->Suffix)->NumStats - 1] + \ - (p->HiBitsFlag = p->HB2Flag[p->FoundState->Symbol]) + \ - 2 * p->HB2Flag[(unsigned)Ppmd7Context_OneState(p->MinContext)->Symbol] + \ - ((p->RunLength >> 26) & 0x20)] + &p->BinSumm[(size_t)(unsigned)Ppmd7Context_OneState(p->MinContext)->Freq - 1] \ + [ p->PrevSuccess + ((p->RunLength >> 26) & 0x20) \ + + p->NS2BSIndx[(size_t)Ppmd7_GetContext(p, p->MinContext->Suffix)->NumStats - 1] \ + + PPMD7_HiBitsFlag_4(Ppmd7Context_OneState(p->MinContext)->Symbol) \ + + (p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol)) ] CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *scale); +/* +We support two versions of Ppmd7 (PPMdH) methods that use same CPpmd7 structure: + 1) Ppmd7a_*: original PPMdH + 2) Ppmd7z_*: modified PPMdH with 7z Range Coder +Ppmd7_*: the structures and functions that are common for both versions of PPMd7 (PPMdH) +*/ + /* ---------- Decode ---------- */ -typedef struct IPpmd7_RangeDec IPpmd7_RangeDec; +#define PPMD7_SYM_END (-1) +#define PPMD7_SYM_ERROR (-2) -struct IPpmd7_RangeDec -{ - UInt32 (*GetThreshold)(const IPpmd7_RangeDec *p, UInt32 total); - void (*Decode)(const IPpmd7_RangeDec *p, UInt32 start, UInt32 size); - UInt32 (*DecodeBit)(const IPpmd7_RangeDec *p, UInt32 size0); -}; +/* +You must set (CPpmd7::rc.dec.Stream) before Ppmd7*_RangeDec_Init() -typedef struct -{ - IPpmd7_RangeDec vt; - UInt32 Range; - UInt32 Code; - IByteIn *Stream; -} CPpmd7z_RangeDec; +Ppmd7*_DecodeSymbol() +out: + >= 0 : decoded byte + -1 : PPMD7_SYM_END : End of payload marker + -2 : PPMD7_SYM_ERROR : Data error +*/ -void Ppmd7z_RangeDec_CreateVTable(CPpmd7z_RangeDec *p); -BoolInt Ppmd7z_RangeDec_Init(CPpmd7z_RangeDec *p); -#define Ppmd7z_RangeDec_IsFinishedOK(p) ((p)->Code == 0) +/* Ppmd7a_* : original PPMdH */ +BoolInt Ppmd7a_RangeDec_Init(CPpmd7_RangeDec *p); +#define Ppmd7a_RangeDec_IsFinishedOK(p) ((p)->Code == 0) +int Ppmd7a_DecodeSymbol(CPpmd7 *p); -int Ppmd7_DecodeSymbol(CPpmd7 *p, const IPpmd7_RangeDec *rc); +/* Ppmd7z_* : modified PPMdH with 7z Range Coder */ +BoolInt Ppmd7z_RangeDec_Init(CPpmd7_RangeDec *p); +#define Ppmd7z_RangeDec_IsFinishedOK(p) ((p)->Code == 0) +int Ppmd7z_DecodeSymbol(CPpmd7 *p); +// Byte *Ppmd7z_DecodeSymbols(CPpmd7 *p, Byte *buf, const Byte *lim); /* ---------- Encode ---------- */ -typedef struct -{ - UInt64 Low; - UInt32 Range; - Byte Cache; - UInt64 CacheSize; - IByteOut *Stream; -} CPpmd7z_RangeEnc; - -void Ppmd7z_RangeEnc_Init(CPpmd7z_RangeEnc *p); -void Ppmd7z_RangeEnc_FlushData(CPpmd7z_RangeEnc *p); - -void Ppmd7_EncodeSymbol(CPpmd7 *p, CPpmd7z_RangeEnc *rc, int symbol); +void Ppmd7z_Init_RangeEnc(CPpmd7 *p); +void Ppmd7z_Flush_RangeEnc(CPpmd7 *p); +// void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol); +void Ppmd7z_EncodeSymbols(CPpmd7 *p, const Byte *buf, const Byte *lim); EXTERN_C_END diff --git a/src/sdk/C/Ppmd7Dec.c b/src/sdk/C/Ppmd7Dec.c index 311e9f9..081ab89 100644 --- a/src/sdk/C/Ppmd7Dec.c +++ b/src/sdk/C/Ppmd7Dec.c @@ -1,191 +1,312 @@ -/* Ppmd7Dec.c -- PPMdH Decoder -2018-07-04 : Igor Pavlov : Public domain -This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ +/* Ppmd7Dec.c -- Ppmd7z (PPMdH with 7z Range Coder) Decoder +2023-09-07 : Igor Pavlov : Public domain +This code is based on: + PPMd var.H (2001): Dmitry Shkarin : Public domain */ + #include "Precomp.h" #include "Ppmd7.h" 
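For orientation, a decode loop over the reworked Ppmd7.h interface above looks roughly like the sketch below. It only illustrates the declarations shown here: byteIn is assumed to be a caller-supplied IByteInPtr positioned at the start of the PPMd stream, memSize and maxOrder are placeholder parameters, and g_Alloc from Alloc.h is assumed to be available in the build.

#include "Ppmd7.h"
#include "Alloc.h"   /* for g_Alloc (assumed to be part of the build) */

static SRes DecodePpmd7z(IByteInPtr byteIn, Byte *out, size_t outSize,
    UInt32 memSize, unsigned maxOrder)
{
  SRes res = SZ_OK;
  size_t i;
  CPpmd7 ppmd;
  Ppmd7_Construct(&ppmd);
  if (!Ppmd7_Alloc(&ppmd, memSize, &g_Alloc))
    return SZ_ERROR_MEM;
  Ppmd7_Init(&ppmd, maxOrder);
  ppmd.rc.dec.Stream = byteIn;   /* must be set before the range decoder is initialized */
  if (!Ppmd7z_RangeDec_Init(&ppmd.rc.dec))
    res = SZ_ERROR_DATA;
  else
  {
    for (i = 0; i < outSize; i++)
    {
      const int sym = Ppmd7z_DecodeSymbol(&ppmd);
      if (sym < 0)
      {
        /* -1 : PPMD7_SYM_END, -2 : PPMD7_SYM_ERROR */
        if (sym == PPMD7_SYM_ERROR)
          res = SZ_ERROR_DATA;
        break;
      }
      out[i] = (Byte)sym;
    }
    /* after an end marker, Ppmd7z_RangeDec_IsFinishedOK(&ppmd.rc.dec) can be checked as well */
  }
  Ppmd7_Free(&ppmd, &g_Alloc);
  return res;
}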
-#define kTopValue (1 << 24) +#define kTopValue ((UInt32)1 << 24) + + +#define READ_BYTE(p) IByteIn_Read((p)->Stream) -BoolInt Ppmd7z_RangeDec_Init(CPpmd7z_RangeDec *p) +BoolInt Ppmd7z_RangeDec_Init(CPpmd7_RangeDec *p) { unsigned i; p->Code = 0; p->Range = 0xFFFFFFFF; - if (IByteIn_Read(p->Stream) != 0) + if (READ_BYTE(p) != 0) return False; for (i = 0; i < 4; i++) - p->Code = (p->Code << 8) | IByteIn_Read(p->Stream); + p->Code = (p->Code << 8) | READ_BYTE(p); return (p->Code < 0xFFFFFFFF); } -#define GET_Ppmd7z_RangeDec CPpmd7z_RangeDec *p = CONTAINER_FROM_VTBL(pp, CPpmd7z_RangeDec, vt); - -static UInt32 Range_GetThreshold(const IPpmd7_RangeDec *pp, UInt32 total) -{ - GET_Ppmd7z_RangeDec - return p->Code / (p->Range /= total); -} +#define RC_NORM_BASE(p) if ((p)->Range < kTopValue) \ + { (p)->Code = ((p)->Code << 8) | READ_BYTE(p); (p)->Range <<= 8; -static void Range_Normalize(CPpmd7z_RangeDec *p) -{ - if (p->Range < kTopValue) - { - p->Code = (p->Code << 8) | IByteIn_Read(p->Stream); - p->Range <<= 8; - if (p->Range < kTopValue) - { - p->Code = (p->Code << 8) | IByteIn_Read(p->Stream); - p->Range <<= 8; - } - } -} +#define RC_NORM_1(p) RC_NORM_BASE(p) } +#define RC_NORM(p) RC_NORM_BASE(p) RC_NORM_BASE(p) }} -static void Range_Decode(const IPpmd7_RangeDec *pp, UInt32 start, UInt32 size) -{ - GET_Ppmd7z_RangeDec - p->Code -= start * p->Range; - p->Range *= size; - Range_Normalize(p); -} +// we must use only one type of Normalization from two: LOCAL or REMOTE +#define RC_NORM_LOCAL(p) // RC_NORM(p) +#define RC_NORM_REMOTE(p) RC_NORM(p) -static UInt32 Range_DecodeBit(const IPpmd7_RangeDec *pp, UInt32 size0) -{ - GET_Ppmd7z_RangeDec - UInt32 newBound = (p->Range >> 14) * size0; - UInt32 symbol; - if (p->Code < newBound) - { - symbol = 0; - p->Range = newBound; - } - else - { - symbol = 1; - p->Code -= newBound; - p->Range -= newBound; - } - Range_Normalize(p); - return symbol; -} +#define R (&p->rc.dec) -void Ppmd7z_RangeDec_CreateVTable(CPpmd7z_RangeDec *p) +Z7_FORCE_INLINE +// Z7_NO_INLINE +static void Ppmd7z_RD_Decode(CPpmd7 *p, UInt32 start, UInt32 size) { - p->vt.GetThreshold = Range_GetThreshold; - p->vt.Decode = Range_Decode; - p->vt.DecodeBit = Range_DecodeBit; + + + R->Code -= start * R->Range; + R->Range *= size; + RC_NORM_LOCAL(R) } +#define RC_Decode(start, size) Ppmd7z_RD_Decode(p, start, size); +#define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R) +#define RC_GetThreshold(total) (R->Code / (R->Range /= (total))) + -#define MASK(sym) ((signed char *)charMask)[sym] +#define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref)) +// typedef CPpmd7_Context * CTX_PTR; +#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) +void Ppmd7_UpdateModel(CPpmd7 *p); -int Ppmd7_DecodeSymbol(CPpmd7 *p, const IPpmd7_RangeDec *rc) +#define MASK(sym) ((Byte *)charMask)[sym] +// Z7_FORCE_INLINE +// static +int Ppmd7z_DecodeSymbol(CPpmd7 *p) { size_t charMask[256 / sizeof(size_t)]; + if (p->MinContext->NumStats != 1) { CPpmd_State *s = Ppmd7_GetStats(p, p->MinContext); unsigned i; UInt32 count, hiCnt; - if ((count = rc->GetThreshold(rc, p->MinContext->SummFreq)) < (hiCnt = s->Freq)) + const UInt32 summFreq = p->MinContext->Union2.SummFreq; + + + + + count = RC_GetThreshold(summFreq); + hiCnt = count; + + if ((Int32)(count -= s->Freq) < 0) { - Byte symbol; - rc->Decode(rc, 0, s->Freq); + Byte sym; + RC_DecodeFinal(0, s->Freq) p->FoundState = s; - symbol = s->Symbol; + sym = s->Symbol; Ppmd7_Update1_0(p); - return symbol; + return sym; } + p->PrevSuccess = 0; - i = p->MinContext->NumStats 
- 1; + i = (unsigned)p->MinContext->NumStats - 1; + do { - if ((hiCnt += (++s)->Freq) > count) + if ((Int32)(count -= (++s)->Freq) < 0) { - Byte symbol; - rc->Decode(rc, hiCnt - s->Freq, s->Freq); + Byte sym; + RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq) p->FoundState = s; - symbol = s->Symbol; + sym = s->Symbol; Ppmd7_Update1(p); - return symbol; + return sym; } } while (--i); - if (count >= p->MinContext->SummFreq) - return -2; - p->HiBitsFlag = p->HB2Flag[p->FoundState->Symbol]; - rc->Decode(rc, hiCnt, p->MinContext->SummFreq - hiCnt); - PPMD_SetAllBitsIn256Bytes(charMask); - MASK(s->Symbol) = 0; - i = p->MinContext->NumStats - 1; - do { MASK((--s)->Symbol) = 0; } while (--i); + + if (hiCnt >= summFreq) + return PPMD7_SYM_ERROR; + + hiCnt -= count; + RC_Decode(hiCnt, summFreq - hiCnt) + + p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol); + PPMD_SetAllBitsIn256Bytes(charMask) + // i = p->MinContext->NumStats - 1; + // do { MASK((--s)->Symbol) = 0; } while (--i); + { + CPpmd_State *s2 = Ppmd7_GetStats(p, p->MinContext); + MASK(s->Symbol) = 0; + do + { + const unsigned sym0 = s2[0].Symbol; + const unsigned sym1 = s2[1].Symbol; + s2 += 2; + MASK(sym0) = 0; + MASK(sym1) = 0; + } + while (s2 < s); + } } else { + CPpmd_State *s = Ppmd7Context_OneState(p->MinContext); UInt16 *prob = Ppmd7_GetBinSumm(p); - if (rc->DecodeBit(rc, *prob) == 0) + UInt32 pr = *prob; + UInt32 size0 = (R->Range >> 14) * pr; + pr = PPMD_UPDATE_PROB_1(pr); + + if (R->Code < size0) { - Byte symbol; - *prob = (UInt16)PPMD_UPDATE_PROB_0(*prob); - symbol = (p->FoundState = Ppmd7Context_OneState(p->MinContext))->Symbol; - Ppmd7_UpdateBin(p); - return symbol; + Byte sym; + *prob = (UInt16)(pr + (1 << PPMD_INT_BITS)); + + // RangeDec_DecodeBit0(size0); + R->Range = size0; + RC_NORM_1(R) + /* we can use single byte normalization here because of + (min(BinSumm[][]) = 95) > (1 << (14 - 8)) */ + + // sym = (p->FoundState = Ppmd7Context_OneState(p->MinContext))->Symbol; + // Ppmd7_UpdateBin(p); + { + unsigned freq = s->Freq; + CPpmd7_Context *c = CTX(SUCCESSOR(s)); + sym = s->Symbol; + p->FoundState = s; + p->PrevSuccess = 1; + p->RunLength++; + s->Freq = (Byte)(freq + (freq < 128)); + // NextContext(p); + if (p->OrderFall == 0 && (const Byte *)c > p->Text) + p->MaxContext = p->MinContext = c; + else + Ppmd7_UpdateModel(p); + } + return sym; } - *prob = (UInt16)PPMD_UPDATE_PROB_1(*prob); - p->InitEsc = PPMD7_kExpEscape[*prob >> 10]; - PPMD_SetAllBitsIn256Bytes(charMask); + + *prob = (UInt16)pr; + p->InitEsc = p->ExpEscape[pr >> 10]; + + // RangeDec_DecodeBit1(size0); + + R->Code -= size0; + R->Range -= size0; + RC_NORM_LOCAL(R) + + PPMD_SetAllBitsIn256Bytes(charMask) MASK(Ppmd7Context_OneState(p->MinContext)->Symbol) = 0; p->PrevSuccess = 0; } + for (;;) { - CPpmd_State *ps[256], *s; + CPpmd_State *s, *s2; UInt32 freqSum, count, hiCnt; + CPpmd_See *see; - unsigned i, num, numMasked = p->MinContext->NumStats; + CPpmd7_Context *mc; + unsigned numMasked; + RC_NORM_REMOTE(R) + mc = p->MinContext; + numMasked = mc->NumStats; + do { p->OrderFall++; - if (!p->MinContext->Suffix) - return -1; - p->MinContext = Ppmd7_GetContext(p, p->MinContext->Suffix); + if (!mc->Suffix) + return PPMD7_SYM_END; + mc = Ppmd7_GetContext(p, mc->Suffix); } - while (p->MinContext->NumStats == numMasked); - hiCnt = 0; - s = Ppmd7_GetStats(p, p->MinContext); - i = 0; - num = p->MinContext->NumStats - numMasked; - do + while (mc->NumStats == numMasked); + + s = Ppmd7_GetStats(p, mc); + { - int k = (int)(MASK(s->Symbol)); - hiCnt += (s->Freq & k); - 
ps[i] = s++; - i -= k; + unsigned num = mc->NumStats; + unsigned num2 = num / 2; + + num &= 1; + hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num); + s += num; + p->MinContext = mc; + + do + { + const unsigned sym0 = s[0].Symbol; + const unsigned sym1 = s[1].Symbol; + s += 2; + hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0))); + hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1))); + } + while (--num2); } - while (i != num); - + see = Ppmd7_MakeEscFreq(p, numMasked, &freqSum); freqSum += hiCnt; - count = rc->GetThreshold(rc, freqSum); + + + + + count = RC_GetThreshold(freqSum); if (count < hiCnt) { - Byte symbol; - CPpmd_State **pps = ps; - for (hiCnt = 0; (hiCnt += (*pps)->Freq) <= count; pps++); - s = *pps; - rc->Decode(rc, hiCnt - s->Freq, s->Freq); - Ppmd_See_Update(see); + Byte sym; + + s = Ppmd7_GetStats(p, p->MinContext); + hiCnt = count; + // count -= s->Freq & (UInt32)(MASK(s->Symbol)); + // if ((Int32)count >= 0) + { + for (;;) + { + count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; + // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; + } + } + s--; + RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq) + + // new (see->Summ) value can overflow over 16-bits in some rare cases + Ppmd_See_UPDATE(see) p->FoundState = s; - symbol = s->Symbol; + sym = s->Symbol; Ppmd7_Update2(p); - return symbol; + return sym; } + if (count >= freqSum) - return -2; - rc->Decode(rc, hiCnt, freqSum - hiCnt); + return PPMD7_SYM_ERROR; + + RC_Decode(hiCnt, freqSum - hiCnt) + + // We increase (see->Summ) for sum of Freqs of all non_Masked symbols. + // new (see->Summ) value can overflow over 16-bits in some rare cases see->Summ = (UInt16)(see->Summ + freqSum); - do { MASK(ps[--i]->Symbol) = 0; } while (i != 0); + + s = Ppmd7_GetStats(p, p->MinContext); + s2 = s + p->MinContext->NumStats; + do + { + MASK(s->Symbol) = 0; + s++; + } + while (s != s2); } } + +/* +Byte *Ppmd7z_DecodeSymbols(CPpmd7 *p, Byte *buf, const Byte *lim) +{ + int sym = 0; + if (buf != lim) + do + { + sym = Ppmd7z_DecodeSymbol(p); + if (sym < 0) + break; + *buf = (Byte)sym; + } + while (++buf < lim); + p->LastSymbol = sym; + return buf; +} +*/ + +#undef kTopValue +#undef READ_BYTE +#undef RC_NORM_BASE +#undef RC_NORM_1 +#undef RC_NORM +#undef RC_NORM_LOCAL +#undef RC_NORM_REMOTE +#undef R +#undef RC_Decode +#undef RC_DecodeFinal +#undef RC_GetThreshold +#undef CTX +#undef SUCCESSOR +#undef MASK diff --git a/src/sdk/C/Ppmd7Enc.c b/src/sdk/C/Ppmd7Enc.c index 286b871..49cbbe6 100644 --- a/src/sdk/C/Ppmd7Enc.c +++ b/src/sdk/C/Ppmd7Enc.c @@ -1,104 +1,123 @@ -/* Ppmd7Enc.c -- PPMdH Encoder -2017-04-03 : Igor Pavlov : Public domain -This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ +/* Ppmd7Enc.c -- Ppmd7z (PPMdH with 7z Range Coder) Encoder +2023-09-07 : Igor Pavlov : Public domain +This code is based on: + PPMd var.H (2001): Dmitry Shkarin : Public domain */ + #include "Precomp.h" #include "Ppmd7.h" -#define kTopValue (1 << 24) +#define kTopValue ((UInt32)1 << 24) + +#define R (&p->rc.enc) -void Ppmd7z_RangeEnc_Init(CPpmd7z_RangeEnc *p) +void Ppmd7z_Init_RangeEnc(CPpmd7 *p) { - p->Low = 0; - p->Range = 0xFFFFFFFF; - p->Cache = 0; - p->CacheSize = 1; + R->Low = 0; + R->Range = 0xFFFFFFFF; + R->Cache = 0; + R->CacheSize = 1; } -static void RangeEnc_ShiftLow(CPpmd7z_RangeEnc *p) +Z7_NO_INLINE +static void Ppmd7z_RangeEnc_ShiftLow(CPpmd7 *p) { - if ((UInt32)p->Low < (UInt32)0xFF000000 || (unsigned)(p->Low >> 32) != 0) + if ((UInt32)R->Low < 
(UInt32)0xFF000000 || (unsigned)(R->Low >> 32) != 0) { - Byte temp = p->Cache; + Byte temp = R->Cache; do { - IByteOut_Write(p->Stream, (Byte)(temp + (Byte)(p->Low >> 32))); + IByteOut_Write(R->Stream, (Byte)(temp + (Byte)(R->Low >> 32))); temp = 0xFF; } - while (--p->CacheSize != 0); - p->Cache = (Byte)((UInt32)p->Low >> 24); + while (--R->CacheSize != 0); + R->Cache = (Byte)((UInt32)R->Low >> 24); } - p->CacheSize++; - p->Low = (UInt32)p->Low << 8; + R->CacheSize++; + R->Low = (UInt32)((UInt32)R->Low << 8); } -static void RangeEnc_Encode(CPpmd7z_RangeEnc *p, UInt32 start, UInt32 size, UInt32 total) -{ - p->Low += start * (p->Range /= total); - p->Range *= size; - while (p->Range < kTopValue) - { - p->Range <<= 8; - RangeEnc_ShiftLow(p); - } -} +#define RC_NORM_BASE(p) if (R->Range < kTopValue) { R->Range <<= 8; Ppmd7z_RangeEnc_ShiftLow(p); +#define RC_NORM_1(p) RC_NORM_BASE(p) } +#define RC_NORM(p) RC_NORM_BASE(p) RC_NORM_BASE(p) }} -static void RangeEnc_EncodeBit_0(CPpmd7z_RangeEnc *p, UInt32 size0) -{ - p->Range = (p->Range >> 14) * size0; - while (p->Range < kTopValue) - { - p->Range <<= 8; - RangeEnc_ShiftLow(p); - } -} +// we must use only one type of Normalization from two: LOCAL or REMOTE +#define RC_NORM_LOCAL(p) // RC_NORM(p) +#define RC_NORM_REMOTE(p) RC_NORM(p) -static void RangeEnc_EncodeBit_1(CPpmd7z_RangeEnc *p, UInt32 size0) +/* +#define Ppmd7z_RangeEnc_Encode(p, start, _size_) \ + { UInt32 size = _size_; \ + R->Low += start * R->Range; \ + R->Range *= size; \ + RC_NORM_LOCAL(p); } +*/ + +Z7_FORCE_INLINE +// Z7_NO_INLINE +static void Ppmd7z_RangeEnc_Encode(CPpmd7 *p, UInt32 start, UInt32 size) { - UInt32 newBound = (p->Range >> 14) * size0; - p->Low += newBound; - p->Range -= newBound; - while (p->Range < kTopValue) - { - p->Range <<= 8; - RangeEnc_ShiftLow(p); - } + R->Low += start * R->Range; + R->Range *= size; + RC_NORM_LOCAL(p) } -void Ppmd7z_RangeEnc_FlushData(CPpmd7z_RangeEnc *p) +void Ppmd7z_Flush_RangeEnc(CPpmd7 *p) { unsigned i; for (i = 0; i < 5; i++) - RangeEnc_ShiftLow(p); + Ppmd7z_RangeEnc_ShiftLow(p); } -#define MASK(sym) ((signed char *)charMask)[sym] -void Ppmd7_EncodeSymbol(CPpmd7 *p, CPpmd7z_RangeEnc *rc, int symbol) +#define RC_Encode(start, size) Ppmd7z_RangeEnc_Encode(p, start, size); +#define RC_EncodeFinal(start, size) RC_Encode(start, size) RC_NORM_REMOTE(p) + +#define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref)) +#define SUFFIX(ctx) CTX((ctx)->Suffix) +// typedef CPpmd7_Context * CTX_PTR; +#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) + +void Ppmd7_UpdateModel(CPpmd7 *p); + +#define MASK(sym) ((Byte *)charMask)[sym] + +Z7_FORCE_INLINE +static +void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol) { size_t charMask[256 / sizeof(size_t)]; + if (p->MinContext->NumStats != 1) { CPpmd_State *s = Ppmd7_GetStats(p, p->MinContext); UInt32 sum; unsigned i; + + + + + R->Range /= p->MinContext->Union2.SummFreq; + if (s->Symbol == symbol) { - RangeEnc_Encode(rc, 0, s->Freq, p->MinContext->SummFreq); + // R->Range /= p->MinContext->Union2.SummFreq; + RC_EncodeFinal(0, s->Freq) p->FoundState = s; Ppmd7_Update1_0(p); return; } p->PrevSuccess = 0; sum = s->Freq; - i = p->MinContext->NumStats - 1; + i = (unsigned)p->MinContext->NumStats - 1; do { if ((++s)->Symbol == symbol) { - RangeEnc_Encode(rc, sum, s->Freq, p->MinContext->SummFreq); + // R->Range /= p->MinContext->Union2.SummFreq; + RC_EncodeFinal(sum, s->Freq) p->FoundState = s; Ppmd7_Update1(p); return; @@ -106,82 +125,213 @@ void Ppmd7_EncodeSymbol(CPpmd7 *p, CPpmd7z_RangeEnc *rc, int symbol) sum += 
s->Freq; } while (--i); + + // R->Range /= p->MinContext->Union2.SummFreq; + RC_Encode(sum, p->MinContext->Union2.SummFreq - sum) - p->HiBitsFlag = p->HB2Flag[p->FoundState->Symbol]; - PPMD_SetAllBitsIn256Bytes(charMask); - MASK(s->Symbol) = 0; - i = p->MinContext->NumStats - 1; - do { MASK((--s)->Symbol) = 0; } while (--i); - RangeEnc_Encode(rc, sum, p->MinContext->SummFreq - sum, p->MinContext->SummFreq); + p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol); + PPMD_SetAllBitsIn256Bytes(charMask) + // MASK(s->Symbol) = 0; + // i = p->MinContext->NumStats - 1; + // do { MASK((--s)->Symbol) = 0; } while (--i); + { + CPpmd_State *s2 = Ppmd7_GetStats(p, p->MinContext); + MASK(s->Symbol) = 0; + do + { + const unsigned sym0 = s2[0].Symbol; + const unsigned sym1 = s2[1].Symbol; + s2 += 2; + MASK(sym0) = 0; + MASK(sym1) = 0; + } + while (s2 < s); + } } else { UInt16 *prob = Ppmd7_GetBinSumm(p); CPpmd_State *s = Ppmd7Context_OneState(p->MinContext); + UInt32 pr = *prob; + const UInt32 bound = (R->Range >> 14) * pr; + pr = PPMD_UPDATE_PROB_1(pr); if (s->Symbol == symbol) { - RangeEnc_EncodeBit_0(rc, *prob); - *prob = (UInt16)PPMD_UPDATE_PROB_0(*prob); - p->FoundState = s; - Ppmd7_UpdateBin(p); + *prob = (UInt16)(pr + (1 << PPMD_INT_BITS)); + // RangeEnc_EncodeBit_0(p, bound); + R->Range = bound; + RC_NORM_1(p) + + // p->FoundState = s; + // Ppmd7_UpdateBin(p); + { + const unsigned freq = s->Freq; + CPpmd7_Context *c = CTX(SUCCESSOR(s)); + p->FoundState = s; + p->PrevSuccess = 1; + p->RunLength++; + s->Freq = (Byte)(freq + (freq < 128)); + // NextContext(p); + if (p->OrderFall == 0 && (const Byte *)c > p->Text) + p->MaxContext = p->MinContext = c; + else + Ppmd7_UpdateModel(p); + } return; } - else - { - RangeEnc_EncodeBit_1(rc, *prob); - *prob = (UInt16)PPMD_UPDATE_PROB_1(*prob); - p->InitEsc = PPMD7_kExpEscape[*prob >> 10]; - PPMD_SetAllBitsIn256Bytes(charMask); - MASK(s->Symbol) = 0; - p->PrevSuccess = 0; - } + + *prob = (UInt16)pr; + p->InitEsc = p->ExpEscape[pr >> 10]; + // RangeEnc_EncodeBit_1(p, bound); + R->Low += bound; + R->Range -= bound; + RC_NORM_LOCAL(p) + + PPMD_SetAllBitsIn256Bytes(charMask) + MASK(s->Symbol) = 0; + p->PrevSuccess = 0; } + for (;;) { - UInt32 escFreq; CPpmd_See *see; CPpmd_State *s; - UInt32 sum; - unsigned i, numMasked = p->MinContext->NumStats; + UInt32 sum, escFreq; + CPpmd7_Context *mc; + unsigned i, numMasked; + + RC_NORM_REMOTE(p) + + mc = p->MinContext; + numMasked = mc->NumStats; + do { p->OrderFall++; - if (!p->MinContext->Suffix) + if (!mc->Suffix) return; /* EndMarker (symbol = -1) */ - p->MinContext = Ppmd7_GetContext(p, p->MinContext->Suffix); + mc = Ppmd7_GetContext(p, mc->Suffix); + i = mc->NumStats; } - while (p->MinContext->NumStats == numMasked); + while (i == numMasked); + + p->MinContext = mc; - see = Ppmd7_MakeEscFreq(p, numMasked, &escFreq); - s = Ppmd7_GetStats(p, p->MinContext); + // see = Ppmd7_MakeEscFreq(p, numMasked, &escFreq); + { + if (i != 256) + { + unsigned nonMasked = i - numMasked; + see = p->See[(unsigned)p->NS2Indx[(size_t)nonMasked - 1]] + + p->HiBitsFlag + + (nonMasked < (unsigned)SUFFIX(mc)->NumStats - i) + + 2 * (unsigned)(mc->Union2.SummFreq < 11 * i) + + 4 * (unsigned)(numMasked > nonMasked); + { + // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ + unsigned summ = (UInt16)see->Summ; // & 0xFFFF + unsigned r = (summ >> see->Shift); + see->Summ = (UInt16)(summ - r); + escFreq = r + (r == 0); + } + } + else + { + see = &p->DummySee; + escFreq = 1; + } + } + + s = Ppmd7_GetStats(p, mc); sum 
= 0; - i = p->MinContext->NumStats; + // i = mc->NumStats; + do { - int cur = s->Symbol; - if (cur == symbol) + const unsigned cur = s->Symbol; + if ((int)cur == symbol) { - UInt32 low = sum; - CPpmd_State *s1 = s; - do + const UInt32 low = sum; + const UInt32 freq = s->Freq; + unsigned num2; + + Ppmd_See_UPDATE(see) + p->FoundState = s; + sum += escFreq; + + num2 = i / 2; + i &= 1; + sum += freq & (0 - (UInt32)i); + if (num2 != 0) { - sum += (s->Freq & (int)(MASK(s->Symbol))); - s++; + s += i; + do + { + const unsigned sym0 = s[0].Symbol; + const unsigned sym1 = s[1].Symbol; + s += 2; + sum += (s[-2].Freq & (unsigned)(MASK(sym0))); + sum += (s[-1].Freq & (unsigned)(MASK(sym1))); + } + while (--num2); } - while (--i); - RangeEnc_Encode(rc, low, s1->Freq, sum + escFreq); - Ppmd_See_Update(see); - p->FoundState = s1; + + + R->Range /= sum; + RC_EncodeFinal(low, freq) Ppmd7_Update2(p); return; } - sum += (s->Freq & (int)(MASK(cur))); - MASK(cur) = 0; + sum += (s->Freq & (unsigned)(MASK(cur))); s++; } while (--i); - RangeEnc_Encode(rc, sum, escFreq, sum + escFreq); - see->Summ = (UInt16)(see->Summ + sum + escFreq); + { + const UInt32 total = sum + escFreq; + see->Summ = (UInt16)(see->Summ + total); + + R->Range /= total; + RC_Encode(sum, escFreq) + } + + { + const CPpmd_State *s2 = Ppmd7_GetStats(p, p->MinContext); + s--; + MASK(s->Symbol) = 0; + do + { + const unsigned sym0 = s2[0].Symbol; + const unsigned sym1 = s2[1].Symbol; + s2 += 2; + MASK(sym0) = 0; + MASK(sym1) = 0; + } + while (s2 < s); + } } } + + +void Ppmd7z_EncodeSymbols(CPpmd7 *p, const Byte *buf, const Byte *lim) +{ + for (; buf < lim; buf++) + { + Ppmd7z_EncodeSymbol(p, *buf); + } +} + +#undef kTopValue +#undef WRITE_BYTE +#undef RC_NORM_BASE +#undef RC_NORM_1 +#undef RC_NORM +#undef RC_NORM_LOCAL +#undef RC_NORM_REMOTE +#undef R +#undef RC_Encode +#undef RC_EncodeFinal +#undef SUFFIX +#undef CTX +#undef SUCCESSOR +#undef MASK diff --git a/src/sdk/C/Precomp.h b/src/sdk/C/Precomp.h index e8ff8b4..7747fdd 100644 --- a/src/sdk/C/Precomp.h +++ b/src/sdk/C/Precomp.h @@ -1,10 +1,127 @@ -/* Precomp.h -- StdAfx -2013-11-12 : Igor Pavlov : Public domain */ +/* Precomp.h -- precompilation file +2024-01-25 : Igor Pavlov : Public domain */ -#ifndef __7Z_PRECOMP_H -#define __7Z_PRECOMP_H +#ifndef ZIP7_INC_PRECOMP_H +#define ZIP7_INC_PRECOMP_H + +/* + this file must be included before another *.h files and before . + this file is included from the following files: + C\*.c + C\Util\*\Precomp.h <- C\Util\*\*.c + CPP\Common\Common.h <- *\StdAfx.h <- *\*.cpp + + this file can set the following macros: + Z7_LARGE_PAGES 1 + Z7_LONG_PATH 1 + Z7_WIN32_WINNT_MIN 0x0500 (or higher) : we require at least win2000+ for 7-Zip + _WIN32_WINNT 0x0500 (or higher) + WINVER _WIN32_WINNT + UNICODE 1 + _UNICODE 1 +*/ #include "Compiler.h" -/* #include "7zTypes.h" */ + +#ifdef _MSC_VER +// #pragma warning(disable : 4206) // nonstandard extension used : translation unit is empty +#if _MSC_VER >= 1912 +// #pragma warning(disable : 5039) // pointer or reference to potentially throwing function passed to 'extern "C"' function under - EHc.Undefined behavior may occur if this function throws an exception. +#endif +#endif + +/* +// for debug: +#define UNICODE 1 +#define _UNICODE 1 +#define _WIN32_WINNT 0x0500 // win2000 +#ifndef WINVER + #define WINVER _WIN32_WINNT +#endif +*/ + +#ifdef _WIN32 +/* + this "Precomp.h" file must be included before , + if we want to define _WIN32_WINNT before . 
+*/ + +#ifndef Z7_LARGE_PAGES +#ifndef Z7_NO_LARGE_PAGES +#define Z7_LARGE_PAGES 1 +#endif +#endif + +#ifndef Z7_LONG_PATH +#ifndef Z7_NO_LONG_PATH +#define Z7_LONG_PATH 1 +#endif +#endif + +#ifndef Z7_DEVICE_FILE +#ifndef Z7_NO_DEVICE_FILE +// #define Z7_DEVICE_FILE 1 +#endif +#endif + +// we don't change macros if included after +#ifndef _WINDOWS_ + +#ifndef Z7_WIN32_WINNT_MIN + #if defined(_M_ARM64) || defined(__aarch64__) + // #define Z7_WIN32_WINNT_MIN 0x0a00 // win10 + #define Z7_WIN32_WINNT_MIN 0x0600 // vista + #elif defined(_M_ARM) && defined(_M_ARMT) && defined(_M_ARM_NT) + // #define Z7_WIN32_WINNT_MIN 0x0602 // win8 + #define Z7_WIN32_WINNT_MIN 0x0600 // vista + #elif defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) || defined(_M_IA64) + #define Z7_WIN32_WINNT_MIN 0x0503 // win2003 + // #elif defined(_M_IX86) || defined(__i386__) + // #define Z7_WIN32_WINNT_MIN 0x0500 // win2000 + #else // x86 and another(old) systems + #define Z7_WIN32_WINNT_MIN 0x0500 // win2000 + // #define Z7_WIN32_WINNT_MIN 0x0502 // win2003 // for debug + #endif +#endif // Z7_WIN32_WINNT_MIN + + +#ifndef Z7_DO_NOT_DEFINE_WIN32_WINNT +#ifdef _WIN32_WINNT + // #error Stop_Compiling_Bad_WIN32_WINNT +#else + #ifndef Z7_NO_DEFINE_WIN32_WINNT +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #define _WIN32_WINNT Z7_WIN32_WINNT_MIN +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER + #endif +#endif // _WIN32_WINNT + +#ifndef WINVER + #define WINVER _WIN32_WINNT +#endif +#endif // Z7_DO_NOT_DEFINE_WIN32_WINNT + + +#ifndef _MBCS +#ifndef Z7_NO_UNICODE +// UNICODE and _UNICODE are used by and by 7-zip code. + +#ifndef UNICODE +#define UNICODE 1 +#endif + +#ifndef _UNICODE +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +#define _UNICODE 1 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif + +#endif // Z7_NO_UNICODE +#endif // _MBCS +#endif // _WINDOWS_ + +// #include "7zWindows.h" + +#endif // _WIN32 #endif diff --git a/src/sdk/C/RotateDefs.h b/src/sdk/C/RotateDefs.h index 8f01d1a..c16b4f8 100644 --- a/src/sdk/C/RotateDefs.h +++ b/src/sdk/C/RotateDefs.h @@ -1,14 +1,14 @@ /* RotateDefs.h -- Rotate functions -2015-03-25 : Igor Pavlov : Public domain */ +2023-06-18 : Igor Pavlov : Public domain */ -#ifndef __ROTATE_DEFS_H -#define __ROTATE_DEFS_H +#ifndef ZIP7_INC_ROTATE_DEFS_H +#define ZIP7_INC_ROTATE_DEFS_H #ifdef _MSC_VER #include -/* don't use _rotl with MINGW. It can insert slow call to function. */ +/* don't use _rotl with old MINGW. It can insert slow call to function. */ /* #if (_MSC_VER >= 1200) */ #pragma intrinsic(_rotl) @@ -18,12 +18,32 @@ #define rotlFixed(x, n) _rotl((x), (n)) #define rotrFixed(x, n) _rotr((x), (n)) +#if (_MSC_VER >= 1300) +#define Z7_ROTL64(x, n) _rotl64((x), (n)) +#define Z7_ROTR64(x, n) _rotr64((x), (n)) +#else +#define Z7_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#define Z7_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#endif + #else /* new compilers can translate these macros to fast commands. 
*/ +#if defined(__clang__) && (__clang_major__ >= 4) \ + || defined(__GNUC__) && (__GNUC__ >= 5) +/* GCC 4.9.0 and clang 3.5 can recognize more correct version: */ +#define rotlFixed(x, n) (((x) << (n)) | ((x) >> (-(n) & 31))) +#define rotrFixed(x, n) (((x) >> (n)) | ((x) << (-(n) & 31))) +#define Z7_ROTL64(x, n) (((x) << (n)) | ((x) >> (-(n) & 63))) +#define Z7_ROTR64(x, n) (((x) >> (n)) | ((x) << (-(n) & 63))) +#else +/* for old GCC / clang: */ #define rotlFixed(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) #define rotrFixed(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#define Z7_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#define Z7_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#endif #endif diff --git a/src/sdk/C/Sha256.c b/src/sdk/C/Sha256.c index 04b688c..ea7ed8e 100644 --- a/src/sdk/C/Sha256.c +++ b/src/sdk/C/Sha256.c @@ -1,25 +1,113 @@ -/* Crypto/Sha256.c -- SHA-256 Hash -2017-04-03 : Igor Pavlov : Public domain +/* Sha256.c -- SHA-256 Hash +: Igor Pavlov : Public domain This code is based on public domain code from Wei Dai's Crypto++ library. */ #include "Precomp.h" #include -#include "CpuArch.h" -#include "RotateDefs.h" #include "Sha256.h" +#include "RotateDefs.h" +#include "CpuArch.h" + +#ifdef MY_CPU_X86_OR_AMD64 + #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ + || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ + || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) \ + || defined(_MSC_VER) && (_MSC_VER >= 1200) + #define Z7_COMPILER_SHA256_SUPPORTED + #endif +#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) + + #if defined(__ARM_FEATURE_SHA2) \ + || defined(__ARM_FEATURE_CRYPTO) + #define Z7_COMPILER_SHA256_SUPPORTED + #else + #if defined(MY_CPU_ARM64) \ + || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ + || defined(Z7_MSC_VER_ORIGINAL) + #if defined(__ARM_FP) && \ + ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) \ + ) \ + || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) + #if defined(MY_CPU_ARM64) \ + || !defined(Z7_CLANG_VERSION) \ + || defined(__ARM_NEON) && \ + (Z7_CLANG_VERSION < 170000 || \ + Z7_CLANG_VERSION > 170001) + #define Z7_COMPILER_SHA256_SUPPORTED + #endif + #endif + #endif + #endif +#endif + +void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); + +#ifdef Z7_COMPILER_SHA256_SUPPORTED + void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); + + static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks; + static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW; + + #define SHA256_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks +#else + #define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks +#endif + + +BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo) +{ + SHA256_FUNC_UPDATE_BLOCKS func = Sha256_UpdateBlocks; + + #ifdef Z7_COMPILER_SHA256_SUPPORTED + if (algo != SHA256_ALGO_SW) + { + if (algo == SHA256_ALGO_DEFAULT) + func = g_SHA256_FUNC_UPDATE_BLOCKS; + else + { + if (algo != SHA256_ALGO_HW) + return False; + func = g_SHA256_FUNC_UPDATE_BLOCKS_HW; + if (!func) + return False; + } + } + #else + if (algo > 1) + return False; + #endif + + p->v.vars.func_UpdateBlocks = func; + return True; +} + /* define it for speed optimization */ -#ifndef _SFX -#define _SHA256_UNROLL -#define _SHA256_UNROLL2 + +#ifdef Z7_SFX + #define STEP_PRE 1 + #define STEP_MAIN 1 +#else + #define 
STEP_PRE 2 + #define STEP_MAIN 4 + // #define Z7_SHA256_UNROLL #endif -/* #define _SHA256_UNROLL2 */ +#undef Z7_SHA256_BIG_W +#if STEP_MAIN != 16 + #define Z7_SHA256_BIG_W +#endif -void Sha256_Init(CSha256 *p) + + + +void Sha256_InitState(CSha256 *p) { + p->v.vars.count = 0; p->state[0] = 0x6a09e667; p->state[1] = 0xbb67ae85; p->state[2] = 0x3c6ef372; @@ -28,69 +116,121 @@ void Sha256_Init(CSha256 *p) p->state[5] = 0x9b05688c; p->state[6] = 0x1f83d9ab; p->state[7] = 0x5be0cd19; - p->count = 0; } -#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22)) -#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25)) -#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) -#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10)) -#define blk0(i) (W[i]) -#define blk2(i) (W[i] += s1(W[((i)-2)&15]) + W[((i)-7)&15] + s0(W[((i)-15)&15])) -#define Ch(x,y,z) (z^(x&(y^z))) -#define Maj(x,y,z) ((x&y)|(z&(x|y))) -#ifdef _SHA256_UNROLL2 -#define R(a,b,c,d,e,f,g,h, i) \ - h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + (j ? blk2(i) : blk0(i)); \ - d += h; \ - h += S0(a) + Maj(a, b, c) -#define RX_8(i) \ - R(a,b,c,d,e,f,g,h, i); \ - R(h,a,b,c,d,e,f,g, i+1); \ - R(g,h,a,b,c,d,e,f, i+2); \ - R(f,g,h,a,b,c,d,e, i+3); \ - R(e,f,g,h,a,b,c,d, i+4); \ - R(d,e,f,g,h,a,b,c, i+5); \ - R(c,d,e,f,g,h,a,b, i+6); \ - R(b,c,d,e,f,g,h,a, i+7) -#define RX_16 RX_8(0); RX_8(8); -#else +void Sha256_Init(CSha256 *p) +{ + p->v.vars.func_UpdateBlocks = + #ifdef Z7_COMPILER_SHA256_SUPPORTED + g_SHA256_FUNC_UPDATE_BLOCKS; + #else + NULL; + #endif + Sha256_InitState(p); +} -#define a(i) T[(0-(i))&7] -#define b(i) T[(1-(i))&7] -#define c(i) T[(2-(i))&7] -#define d(i) T[(3-(i))&7] -#define e(i) T[(4-(i))&7] -#define f(i) T[(5-(i))&7] -#define g(i) T[(6-(i))&7] -#define h(i) T[(7-(i))&7] +#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x,22)) +#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x,25)) +#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) +#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >>10)) + +#define Ch(x,y,z) (z^(x&(y^z))) +#define Maj(x,y,z) ((x&y)|(z&(x|y))) -#define R(i) \ - h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[(i)+(size_t)(j)] + (j ? blk2(i) : blk0(i)); \ - d(i) += h(i); \ - h(i) += S0(a(i)) + Maj(a(i), b(i), c(i)) \ -#ifdef _SHA256_UNROLL +#define W_PRE(i) (W[(i) + (size_t)(j)] = GetBe32(data + ((size_t)(j) + i) * 4)) -#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7); -#define RX_16 RX_8(0); RX_8(8); +#define blk2_main(j, i) s1(w(j, (i)-2)) + w(j, (i)-7) + s0(w(j, (i)-15)) +#ifdef Z7_SHA256_BIG_W + // we use +i instead of +(i) to change the order to solve CLANG compiler warning for signed/unsigned. 
+ #define w(j, i) W[(size_t)(j) + i] + #define blk2(j, i) (w(j, i) = w(j, (i)-16) + blk2_main(j, i)) #else + #if STEP_MAIN == 16 + #define w(j, i) W[(i) & 15] + #else + #define w(j, i) W[((size_t)(j) + (i)) & 15] + #endif + #define blk2(j, i) (w(j, i) += blk2_main(j, i)) +#endif + +#define W_MAIN(i) blk2(j, i) + + +#define T1(wx, i) \ + tmp = h + S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ + h = g; \ + g = f; \ + f = e; \ + e = d + tmp; \ + tmp += S0(a) + Maj(a, b, c); \ + d = c; \ + c = b; \ + b = a; \ + a = tmp; \ -#define RX_16 unsigned i; for (i = 0; i < 16; i++) { R(i); } +#define R1_PRE(i) T1( W_PRE, i) +#define R1_MAIN(i) T1( W_MAIN, i) + +#if (!defined(Z7_SHA256_UNROLL) || STEP_MAIN < 8) && (STEP_MAIN >= 4) +#define R2_MAIN(i) \ + R1_MAIN(i) \ + R1_MAIN(i + 1) \ #endif + + +#if defined(Z7_SHA256_UNROLL) && STEP_MAIN >= 8 + +#define T4( a,b,c,d,e,f,g,h, wx, i) \ + h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ + tmp = h; \ + h += d; \ + d = tmp + S0(a) + Maj(a, b, c); \ + +#define R4( wx, i) \ + T4 ( a,b,c,d,e,f,g,h, wx, (i )); \ + T4 ( d,a,b,c,h,e,f,g, wx, (i+1)); \ + T4 ( c,d,a,b,g,h,e,f, wx, (i+2)); \ + T4 ( b,c,d,a,f,g,h,e, wx, (i+3)); \ + +#define R4_PRE(i) R4( W_PRE, i) +#define R4_MAIN(i) R4( W_MAIN, i) + + +#define T8( a,b,c,d,e,f,g,h, wx, i) \ + h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ + d += h; \ + h += S0(a) + Maj(a, b, c); \ + +#define R8( wx, i) \ + T8 ( a,b,c,d,e,f,g,h, wx, i ); \ + T8 ( h,a,b,c,d,e,f,g, wx, i+1); \ + T8 ( g,h,a,b,c,d,e,f, wx, i+2); \ + T8 ( f,g,h,a,b,c,d,e, wx, i+3); \ + T8 ( e,f,g,h,a,b,c,d, wx, i+4); \ + T8 ( d,e,f,g,h,a,b,c, wx, i+5); \ + T8 ( c,d,e,f,g,h,a,b, wx, i+6); \ + T8 ( b,c,d,e,f,g,h,a, wx, i+7); \ + +#define R8_PRE(i) R8( W_PRE, i) +#define R8_MAIN(i) R8( W_MAIN, i) + #endif -static const UInt32 K[64] = { + +extern +MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64]; +MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, @@ -109,30 +249,29 @@ static const UInt32 K[64] = { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 }; -static void Sha256_WriteByteBlock(CSha256 *p) -{ - UInt32 W[16]; - unsigned j; - UInt32 *state; - #ifdef _SHA256_UNROLL2 - UInt32 a,b,c,d,e,f,g,h; - #else - UInt32 T[8]; - #endif - for (j = 0; j < 16; j += 4) - { - const Byte *ccc = p->buffer + j * 4; - W[j ] = GetBe32(ccc); - W[j + 1] = GetBe32(ccc + 4); - W[j + 2] = GetBe32(ccc + 8); - W[j + 3] = GetBe32(ccc + 12); - } - state = p->state; - #ifdef _SHA256_UNROLL2 +#define K SHA256_K_ARRAY + +Z7_NO_INLINE +void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks) +{ + UInt32 W +#ifdef Z7_SHA256_BIG_W + [64]; +#else + [16]; +#endif + unsigned j; + UInt32 a,b,c,d,e,f,g,h; +#if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) + UInt32 tmp; +#endif + + if (numBlocks == 0) return; + a = state[0]; b = state[1]; c = state[2]; @@ -141,108 +280,213 @@ static void Sha256_WriteByteBlock(CSha256 *p) f = state[5]; g = state[6]; h = state[7]; - #else - for (j = 0; j < 8; j++) - T[j] = state[j]; - #endif - for (j = 0; j < 64; j += 16) + do + { + + for (j = 0; j < 16; j += STEP_PRE) + { + #if STEP_PRE > 4 + + #if STEP_PRE < 8 + R4_PRE(0); + #else + R8_PRE(0); + #if STEP_PRE == 16 + R8_PRE(8); + #endif + #endif + + #else + + R1_PRE(0) + #if STEP_PRE >= 2 + R1_PRE(1) + #if STEP_PRE >= 4 + R1_PRE(2) + R1_PRE(3) + #endif + #endif + + #endif + } + + for (j = 16; j < 64; j += 
STEP_MAIN) { - RX_16 + #if defined(Z7_SHA256_UNROLL) && STEP_MAIN >= 8 + + #if STEP_MAIN < 8 + R4_MAIN(0) + #else + R8_MAIN(0) + #if STEP_MAIN == 16 + R8_MAIN(8) + #endif + #endif + + #else + + R1_MAIN(0) + #if STEP_MAIN >= 2 + R1_MAIN(1) + #if STEP_MAIN >= 4 + R2_MAIN(2) + #if STEP_MAIN >= 8 + R2_MAIN(4) + R2_MAIN(6) + #if STEP_MAIN >= 16 + R2_MAIN(8) + R2_MAIN(10) + R2_MAIN(12) + R2_MAIN(14) + #endif + #endif + #endif + #endif + #endif } - #ifdef _SHA256_UNROLL2 - state[0] += a; - state[1] += b; - state[2] += c; - state[3] += d; - state[4] += e; - state[5] += f; - state[6] += g; - state[7] += h; - #else - for (j = 0; j < 8; j++) - state[j] += T[j]; - #endif - - /* Wipe variables */ - /* memset(W, 0, sizeof(W)); */ - /* memset(T, 0, sizeof(T)); */ + a += state[0]; state[0] = a; + b += state[1]; state[1] = b; + c += state[2]; state[2] = c; + d += state[3]; state[3] = d; + e += state[4]; state[4] = e; + f += state[5]; state[5] = f; + g += state[6]; state[6] = g; + h += state[7]; state[7] = h; + + data += SHA256_BLOCK_SIZE; + } + while (--numBlocks); } -#undef S0 -#undef S1 -#undef s0 -#undef s1 + +#define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) void Sha256_Update(CSha256 *p, const Byte *data, size_t size) { if (size == 0) return; - { - unsigned pos = (unsigned)p->count & 0x3F; - unsigned num; - - p->count += size; - - num = 64 - pos; + const unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1); + const unsigned num = SHA256_BLOCK_SIZE - pos; + p->v.vars.count += size; if (num > size) { memcpy(p->buffer + pos, data, size); return; } - - size -= num; - memcpy(p->buffer + pos, data, num); - data += num; + if (pos != 0) + { + size -= num; + memcpy(p->buffer + pos, data, num); + data += num; + Sha256_UpdateBlock(p); + } } - - for (;;) { - Sha256_WriteByteBlock(p); - if (size < 64) - break; - size -= 64; - memcpy(p->buffer, data, 64); - data += 64; - } - - if (size != 0) + const size_t numBlocks = size >> 6; + // if (numBlocks) + SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks); + size &= SHA256_BLOCK_SIZE - 1; + if (size == 0) + return; + data += (numBlocks << 6); memcpy(p->buffer, data, size); + } } + void Sha256_Final(CSha256 *p, Byte *digest) { - unsigned pos = (unsigned)p->count & 0x3F; - unsigned i; - + unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1); p->buffer[pos++] = 0x80; - - while (pos != (64 - 8)) + if (pos > (SHA256_BLOCK_SIZE - 4 * 2)) { - pos &= 0x3F; - if (pos == 0) - Sha256_WriteByteBlock(p); - p->buffer[pos++] = 0; + while (pos != SHA256_BLOCK_SIZE) { p->buffer[pos++] = 0; } + // memset(&p->buf.buffer[pos], 0, SHA256_BLOCK_SIZE - pos); + Sha256_UpdateBlock(p); + pos = 0; } - + memset(&p->buffer[pos], 0, (SHA256_BLOCK_SIZE - 4 * 2) - pos); { - UInt64 numBits = (p->count << 3); - SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32)); - SetBe32(p->buffer + 64 - 4, (UInt32)(numBits)); + const UInt64 numBits = p->v.vars.count << 3; + SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32)) + SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 1, (UInt32)(numBits)) + } + Sha256_UpdateBlock(p); +#if 1 && defined(MY_CPU_BE) + memcpy(digest, p->state, SHA256_DIGEST_SIZE); +#else + { + unsigned i; + for (i = 0; i < 8; i += 2) + { + const UInt32 v0 = p->state[i]; + const UInt32 v1 = p->state[(size_t)i + 1]; + SetBe32(digest , v0) + SetBe32(digest + 4, v1) + digest += 4 * 2; + } } - - Sha256_WriteByteBlock(p); - for (i = 0; i < 8; i += 2) + + + +#endif + Sha256_InitState(p); +} + + +void Sha256Prepare(void) +{ +#ifdef 
Z7_COMPILER_SHA256_SUPPORTED + SHA256_FUNC_UPDATE_BLOCKS f, f_hw; + f = Sha256_UpdateBlocks; + f_hw = NULL; +#ifdef MY_CPU_X86_OR_AMD64 + if (CPU_IsSupported_SHA() + && CPU_IsSupported_SSSE3() + ) +#else + if (CPU_IsSupported_SHA2()) +#endif { - UInt32 v0 = p->state[i]; - UInt32 v1 = p->state[i + 1]; - SetBe32(digest , v0); - SetBe32(digest + 4, v1); - digest += 8; + // printf("\n========== HW SHA256 ======== \n"); + f = f_hw = Sha256_UpdateBlocks_HW; } - - Sha256_Init(p); + g_SHA256_FUNC_UPDATE_BLOCKS = f; + g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw; +#endif } + +#undef U64C +#undef K +#undef S0 +#undef S1 +#undef s0 +#undef s1 +#undef Ch +#undef Maj +#undef W_MAIN +#undef W_PRE +#undef w +#undef blk2_main +#undef blk2 +#undef T1 +#undef T4 +#undef T8 +#undef R1_PRE +#undef R1_MAIN +#undef R2_MAIN +#undef R4 +#undef R4_PRE +#undef R4_MAIN +#undef R8 +#undef R8_PRE +#undef R8_MAIN +#undef STEP_PRE +#undef STEP_MAIN +#undef Z7_SHA256_BIG_W +#undef Z7_SHA256_UNROLL +#undef Z7_COMPILER_SHA256_SUPPORTED diff --git a/src/sdk/C/Sha256.h b/src/sdk/C/Sha256.h index 3f455db..75329cd 100644 --- a/src/sdk/C/Sha256.h +++ b/src/sdk/C/Sha256.h @@ -1,26 +1,86 @@ /* Sha256.h -- SHA-256 Hash -2013-01-18 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ -#ifndef __CRYPTO_SHA256_H -#define __CRYPTO_SHA256_H +#ifndef ZIP7_INC_SHA256_H +#define ZIP7_INC_SHA256_H #include "7zTypes.h" EXTERN_C_BEGIN -#define SHA256_DIGEST_SIZE 32 +#define SHA256_NUM_BLOCK_WORDS 16 +#define SHA256_NUM_DIGEST_WORDS 8 + +#define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4) +#define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4) + + + + +typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks); + +/* + if (the system supports different SHA256 code implementations) + { + (CSha256::func_UpdateBlocks) will be used + (CSha256::func_UpdateBlocks) can be set by + Sha256_Init() - to default (fastest) + Sha256_SetFunction() - to any algo + } + else + { + (CSha256::func_UpdateBlocks) is ignored. + } +*/ typedef struct { - UInt32 state[8]; - UInt64 count; - Byte buffer[64]; + union + { + struct + { + SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; + UInt64 count; + } vars; + UInt64 _pad_64bit[4]; + void *_pad_align_ptr[2]; + } v; + UInt32 state[SHA256_NUM_DIGEST_WORDS]; + + Byte buffer[SHA256_BLOCK_SIZE]; } CSha256; + +#define SHA256_ALGO_DEFAULT 0 +#define SHA256_ALGO_SW 1 +#define SHA256_ALGO_HW 2 + +/* +Sha256_SetFunction() +return: + 0 - (algo) value is not supported, and func_UpdateBlocks was not changed + 1 - func_UpdateBlocks was set according (algo) value. +*/ + +BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo); + +void Sha256_InitState(CSha256 *p); void Sha256_Init(CSha256 *p); void Sha256_Update(CSha256 *p, const Byte *data, size_t size); void Sha256_Final(CSha256 *p, Byte *digest); + + + +// void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); + +/* +call Sha256Prepare() once at program start. +It prepares all supported implementations, and detects the fastest implementation. 
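A rough usage sketch against the declarations above (data and size are assumed to be a caller-supplied const Byte * buffer and its size_t length):

  Byte digest[SHA256_DIGEST_SIZE];
  CSha256 sha;
  Sha256Prepare();                             // once, at program start
  Sha256_Init(&sha);                           // picks the implementation selected by Sha256Prepare()
  // Sha256_SetFunction(&sha, SHA256_ALGO_SW); // optional: force the portable code path
  Sha256_Update(&sha, data, size);             // can be called repeatedly for streamed input
  Sha256_Final(&sha, digest);                  // writes SHA256_DIGEST_SIZE bytes and re-initializes the state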
+*/ + +void Sha256Prepare(void); + EXTERN_C_END #endif diff --git a/src/sdk/C/Sha256Opt.c b/src/sdk/C/Sha256Opt.c new file mode 100644 index 0000000..1c6b50f --- /dev/null +++ b/src/sdk/C/Sha256Opt.c @@ -0,0 +1,451 @@ +/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions +: Igor Pavlov : Public domain */ + +#include "Precomp.h" +#include "Compiler.h" +#include "CpuArch.h" + +// #define Z7_USE_HW_SHA_STUB // for debug +#ifdef MY_CPU_X86_OR_AMD64 + #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check + #define USE_HW_SHA + #elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ + || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) + #define USE_HW_SHA + #if !defined(__INTEL_COMPILER) + // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) + #if !defined(__SHA__) || !defined(__SSSE3__) + #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) + #endif + #endif + #elif defined(_MSC_VER) + #if (_MSC_VER >= 1900) + #define USE_HW_SHA + #else + #define Z7_USE_HW_SHA_STUB + #endif + #endif +// #endif // MY_CPU_X86_OR_AMD64 +#ifndef USE_HW_SHA + // #define Z7_USE_HW_SHA_STUB // for debug +#endif + +#ifdef USE_HW_SHA + +// #pragma message("Sha256 HW") + + + + +// sse/sse2/ssse3: +#include +// sha*: +#include + +#if defined (__clang__) && defined(_MSC_VER) + #if !defined(__SHA__) + #include + #endif +#else + +#endif + +/* +SHA256 uses: +SSE2: + _mm_loadu_si128 + _mm_storeu_si128 + _mm_set_epi32 + _mm_add_epi32 + _mm_shuffle_epi32 / pshufd + + + +SSSE3: + _mm_shuffle_epi8 / pshufb + _mm_alignr_epi8 +SHA: + _mm_sha256* +*/ + +// K array must be aligned for 16-bytes at least. +// The compiler can look align attribute and selects +// movdqu - for code without align attribute +// movdqa - for code with align attribute +extern +MY_ALIGN(64) +const UInt32 SHA256_K_ARRAY[64]; +#define K SHA256_K_ARRAY + + +#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); +#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); +#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); + +#define LOAD_SHUFFLE(m, k) \ + m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ + m = _mm_shuffle_epi8(m, mask); \ + +#define NNN(m0, m1, m2, m3) + +#define SM1(m1, m2, m3, m0) \ + SHA256_MSG1(m0, m1); \ + +#define SM2(m2, m3, m0, m1) \ + ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \ + SHA256_MSG2(m0, m3); \ + +#define RND2(t0, t1) \ + t0 = _mm_sha256rnds2_epu32(t0, t1, msg); + + + +#define R4(k, m0, m1, m2, m3, OP0, OP1) \ + msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \ + RND2(state0, state1); \ + msg = _mm_shuffle_epi32(msg, 0x0E); \ + OP0(m0, m1, m2, m3) \ + RND2(state1, state0); \ + OP1(m0, m1, m2, m3) \ + +#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ + R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ + R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ + R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ + R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ + +#define PREPARE_STATE \ + tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ + state0 = _mm_shuffle_epi32(state1, 0x1B); /* efgh */ \ + state1 = state0; \ + state0 = _mm_unpacklo_epi64(state0, tmp); /* cdgh */ \ + state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \ + + +void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); +#ifdef ATTRIB_SHA +ATTRIB_SHA +#endif +void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 
state[8], const Byte *data, size_t numBlocks) +{ + const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); + + + __m128i tmp, state0, state1; + + if (numBlocks == 0) + return; + + state0 = _mm_loadu_si128((const __m128i *) (const void *) &state[0]); + state1 = _mm_loadu_si128((const __m128i *) (const void *) &state[4]); + + PREPARE_STATE + + do + { + __m128i state0_save, state1_save; + __m128i m0, m1, m2, m3; + __m128i msg; + // #define msg tmp + + state0_save = state0; + state1_save = state1; + + LOAD_SHUFFLE (m0, 0) + LOAD_SHUFFLE (m1, 1) + LOAD_SHUFFLE (m2, 2) + LOAD_SHUFFLE (m3, 3) + + + + R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ) + R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) + R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) + R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ) + + ADD_EPI32(state0, state0_save) + ADD_EPI32(state1, state1_save) + + data += 64; + } + while (--numBlocks); + + PREPARE_STATE + + _mm_storeu_si128((__m128i *) (void *) &state[0], state0); + _mm_storeu_si128((__m128i *) (void *) &state[4], state1); +} + +#endif // USE_HW_SHA + +#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) + + #if defined(__ARM_FEATURE_SHA2) \ + || defined(__ARM_FEATURE_CRYPTO) + #define USE_HW_SHA + #else + #if defined(MY_CPU_ARM64) \ + || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ + || defined(Z7_MSC_VER_ORIGINAL) + #if defined(__ARM_FP) && \ + ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) \ + ) \ + || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) + #if defined(MY_CPU_ARM64) \ + || !defined(Z7_CLANG_VERSION) \ + || defined(__ARM_NEON) && \ + (Z7_CLANG_VERSION < 170000 || \ + Z7_CLANG_VERSION > 170001) + #define USE_HW_SHA + #endif + #endif + #endif + #endif + +#ifdef USE_HW_SHA + +// #pragma message("=== Sha256 HW === ") + + +#if defined(__clang__) || defined(__GNUC__) +#if !defined(__ARM_FEATURE_SHA2) && \ + !defined(__ARM_FEATURE_CRYPTO) + #ifdef MY_CPU_ARM64 +#if defined(__clang__) + #define ATTRIB_SHA __attribute__((__target__("crypto"))) +#else + #define ATTRIB_SHA __attribute__((__target__("+crypto"))) +#endif + #else +#if defined(__clang__) && (__clang_major__ >= 1) + #define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2"))) +#else + #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) +#endif + #endif +#endif +#else + // _MSC_VER + // for arm32 + #define _ARM_USE_NEW_NEON_INTRINSICS +#endif + +#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) +#include +#else + +#if defined(__clang__) && __clang_major__ < 16 +#if !defined(__ARM_FEATURE_SHA2) && \ + !defined(__ARM_FEATURE_CRYPTO) +// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ") + Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1 +// #if defined(__clang__) && __clang_major__ < 13 + #define __ARM_FEATURE_CRYPTO 1 +// #else + #define __ARM_FEATURE_SHA2 1 +// #endif + Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif +#endif // clang + +#if defined(__clang__) + +#if defined(__ARM_ARCH) && __ARM_ARCH < 8 + Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +// #pragma message("#define __ARM_ARCH 8") + #undef __ARM_ARCH + #define __ARM_ARCH 8 + Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif + +#endif // clang + +#include + +#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \ + defined(__ARM_FEATURE_CRYPTO) && \ + defined(__ARM_FEATURE_SHA2) +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #undef 
__ARM_FEATURE_CRYPTO + #undef __ARM_FEATURE_SHA2 + #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") +#endif + +#endif // Z7_MSC_VER_ORIGINAL + +typedef uint32x4_t v128; +// typedef __n128 v128; // MSVC + +#ifdef MY_CPU_BE + #define MY_rev32_for_LE(x) x +#else + #define MY_rev32_for_LE(x) vrev32q_u8(x) +#endif + +#if 1 // 0 for debug +// for arm32: it works slower by some reason than direct code +/* +for arm32 it generates: +MSVC-2022, GCC-9: + vld1.32 {d18,d19}, [r10] + vst1.32 {d4,d5}, [r3] + vld1.8 {d20-d21}, [r4] +there is no align hint (like [r10:128]). So instruction allows unaligned access +*/ +#define LOAD_128_32(_p) vld1q_u32(_p) +#define LOAD_128_8(_p) vld1q_u8 (_p) +#define STORE_128_32(_p, _v) vst1q_u32(_p, _v) +#else +/* +for arm32: +MSVC-2022: + vldm r10,{d18,d19} + vstm r3,{d4,d5} + does it require strict alignment? +GCC-9: + vld1.64 {d30-d31}, [r0:64] + vldr d28, [r0, #16] + vldr d29, [r0, #24] + vst1.64 {d30-d31}, [r0:64] + vstr d28, [r0, #16] + vstr d29, [r0, #24] +there is hint [r0:64], so does it requires 64-bit alignment. +*/ +#define LOAD_128_32(_p) (*(const v128 *)(const void *)(_p)) +#define LOAD_128_8(_p) vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p)) +#define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v) +#endif + +#define LOAD_SHUFFLE(m, k) \ + m = vreinterpretq_u32_u8( \ + MY_rev32_for_LE( \ + LOAD_128_8(data + (k) * 16))); \ + +// K array must be aligned for 16-bytes at least. +extern +MY_ALIGN(64) +const UInt32 SHA256_K_ARRAY[64]; +#define K SHA256_K_ARRAY + +#define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src); +#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); + +#define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0) +#define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1) +#define NNN(m0, m1, m2, m3) + +#define R4(k, m0, m1, m2, m3, OP0, OP1) \ + msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \ + tmp = state0; \ + state0 = vsha256hq_u32( state0, state1, msg ); \ + state1 = vsha256h2q_u32( state1, tmp, msg ); \ + OP0(m0, m1, m2, m3); \ + OP1(m0, m1, m2, m3); \ + + +#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ + R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \ + R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \ + R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \ + R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \ + + +void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); +#ifdef ATTRIB_SHA +ATTRIB_SHA +#endif +void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) +{ + v128 state0, state1; + + if (numBlocks == 0) + return; + + state0 = LOAD_128_32(&state[0]); + state1 = LOAD_128_32(&state[4]); + + do + { + v128 state0_save, state1_save; + v128 m0, m1, m2, m3; + v128 msg, tmp; + + state0_save = state0; + state1_save = state1; + + LOAD_SHUFFLE (m0, 0) + LOAD_SHUFFLE (m1, 1) + LOAD_SHUFFLE (m2, 2) + LOAD_SHUFFLE (m3, 3) + + R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ) + R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) + R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) + R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ) + + state0 = vaddq_u32(state0, state0_save); + state1 = vaddq_u32(state1, state1_save); + + data += 64; + } + while (--numBlocks); + + STORE_128_32(&state[0], state0); + STORE_128_32(&state[4], state1); +} + +#endif // USE_HW_SHA + +#endif // MY_CPU_ARM_OR_ARM64 + + +#if !defined(USE_HW_SHA) && 
defined(Z7_USE_HW_SHA_STUB) +// #error Stop_Compiling_UNSUPPORTED_SHA +// #include +// We can compile this file with another C compiler, +// or we can compile asm version. +// So we can generate real code instead of this stub function. +// #include "Sha256.h" +// #if defined(_MSC_VER) +#pragma message("Sha256 HW-SW stub was used") +// #endif +void Z7_FASTCALL Sha256_UpdateBlocks (UInt32 state[8], const Byte *data, size_t numBlocks); +void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); +void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) +{ + Sha256_UpdateBlocks(state, data, numBlocks); + /* + UNUSED_VAR(state); + UNUSED_VAR(data); + UNUSED_VAR(numBlocks); + exit(1); + return; + */ +} +#endif + + +#undef K +#undef RND2 +#undef MY_rev32_for_LE + +#undef NNN +#undef LOAD_128 +#undef STORE_128 +#undef LOAD_SHUFFLE +#undef SM1 +#undef SM2 + + +#undef R4 +#undef R16 +#undef PREPARE_STATE +#undef USE_HW_SHA +#undef ATTRIB_SHA +#undef USE_VER_MIN +#undef Z7_USE_HW_SHA_STUB diff --git a/src/sdk/C/Sort.c b/src/sdk/C/Sort.c index e1097e3..20e3e69 100644 --- a/src/sdk/C/Sort.c +++ b/src/sdk/C/Sort.c @@ -1,141 +1,268 @@ /* Sort.c -- Sort functions -2014-04-05 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include "Sort.h" +#include "CpuArch.h" -#define HeapSortDown(p, k, size, temp) \ - { for (;;) { \ - size_t s = (k << 1); \ - if (s > size) break; \ - if (s < size && p[s + 1] > p[s]) s++; \ - if (temp >= p[s]) break; \ - p[k] = p[s]; k = s; \ - } p[k] = temp; } +#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ + || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \ + ) +// the code with prefetch is slow for small arrays on x86. +// So we disable prefetch for x86. +#ifndef MY_CPU_X86 + // #pragma message("Z7_PREFETCH : __builtin_prefetch") + #define Z7_PREFETCH(a) __builtin_prefetch((a)) +#endif -void HeapSort(UInt32 *p, size_t size) -{ - if (size <= 1) - return; - p--; - { - size_t i = size / 2; - do - { - UInt32 temp = p[i]; - size_t k = i; - HeapSortDown(p, k, size, temp) - } - while (--i != 0); - } - /* - do - { - size_t k = 1; - UInt32 temp = p[size]; - p[size--] = p[1]; - HeapSortDown(p, k, size, temp) - } - while (size > 1); - */ - while (size > 3) - { - UInt32 temp = p[size]; - size_t k = (p[3] > p[2]) ? 3 : 2; - p[size--] = p[1]; - p[1] = p[k]; - HeapSortDown(p, k, size, temp) - } - { - UInt32 temp = p[size]; - p[size] = p[1]; - if (size > 2 && p[2] < temp) - { - p[1] = p[2]; - p[2] = temp; - } - else - p[1] = temp; - } +#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200) + +#include "7zWindows.h" + +// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1. +// For example, clang-cl can generate "prefetcht2" instruction for +// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call. +// But we want to generate "prefetcht0" instruction. +// So for CLANG/GCC we must use __builtin_prefetch() in code branch above +// instead of PreFetchCacheLine() / _mm_prefetch(). + +// New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call. +// But old x86 cpus don't support "prefetcht0". +// So we will use PreFetchCacheLine(), only if we are sure that +// generated instruction is supported by all cpus of that isa. 
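/* Illustrative sketch (not part of the upstream patch): the Z7_PREFETCH hint selected
   in this block is later used inside the heap "sift down" loop of the reworked
   HeapSort. A minimal version of the idea, with hypothetical names and the same
   1-based heap layout as the old HeapSortDown macro, assuming only that Z7_PREFETCH
   expands to a cache prefetch hint: */
#if 0
static void SiftDown_WithPrefetch(UInt32 *p, size_t k, size_t size, UInt32 temp)
{
  for (;;)
  {
    size_t s = k * 2;                   /* first child of node k */
    if (s > size) break;
    if (s < size && p[s + 1] > p[s]) s++;
    if (temp >= p[s]) break;
  #ifdef Z7_PREFETCH
    if (s * 4 <= size)
      Z7_PREFETCH(p + s * 4);           /* request data a couple of tree levels ahead */
  #endif
    p[k] = p[s];                        /* move the larger child up and descend */
    k = s;
  }
  p[k] = temp;
}
#endif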
+#if defined(MY_CPU_AMD64) \ + || defined(MY_CPU_ARM64) \ + || defined(MY_CPU_IA64) +// we need to use additional braces for (a) in PreFetchCacheLine call, because +// PreFetchCacheLine macro doesn't use braces: +// #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l) + // #pragma message("Z7_PREFETCH : PreFetchCacheLine") + #define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a)) +#endif + +#endif // _WIN32 + + +#define PREFETCH_NO(p,k,s,size) + +#ifndef Z7_PREFETCH + #define SORT_PREFETCH(p,k,s,size) +#else + +// #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes +#define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch) +// #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch) + +#if PREFETCH_LEVEL == 0 + + #define SORT_PREFETCH(p,k,s,size) + +#else // PREFETCH_LEVEL != 0 + +/* +if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) + we prefetch one value per cache line. + Use it if array is aligned for cache line size (64 bytes) + or if array is small (less than L1 cache size). + +if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) + we perfetch all cache lines that can be required. + it can be faster for big unaligned arrays. +*/ + #define USE_PREFETCH_FOR_ALIGNED_ARRAY + +// s == k * 2 +#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64) + // x86 supports (lea r1*8+offset) + #define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL) +#else + #define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1)) +#endif + +#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) + #define PREFETCH_ADD_OFFSET 0 +#else + // last offset that can be reqiured in PREFETCH_LEVEL step: + #define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1) + #define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2 +#endif + +#if PREFETCH_LEVEL <= 3 + +#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2)); \ + }} +#else /* for unaligned array */ + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ + Z7_PREFETCH((p + s2)); \ + }} +#endif + +#else // PREFETCH_LEVEL > 3 + +#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2 - 16)); \ + Z7_PREFETCH((p + s2)); \ + }} +#else /* for unaligned array */ + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ + Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \ + Z7_PREFETCH((p + s2)); \ + }} +#endif + +#endif // PREFETCH_LEVEL > 3 +#endif // PREFETCH_LEVEL != 0 +#endif // Z7_PREFETCH + + +#if defined(MY_CPU_ARM64) \ + /* || defined(MY_CPU_AMD64) */ \ + /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */ + // we want to use cmov, if cmov is very fast: + // - this cmov version is slower for clang-x64. + // - this cmov version is faster for gcc-arm64 for some fast arm64 cpus. + #define Z7_FAST_CMOV_SUPPORTED +#endif + +#ifdef Z7_FAST_CMOV_SUPPORTED + // we want to use cmov here, if cmov is fast: new arm64 cpus. + // we want the compiler to use conditional move for this branch + #define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1; +#else + // use this branch, if cpu doesn't support fast conditional move. 
+ // it uses slow array access reading: + #define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow; +#endif + +#define HeapSortDown(p, k, size, temp, macro_prefetch) \ +{ \ + for (;;) { \ + UInt32 n0, n1; \ + size_t s = k * 2; \ + if (s >= size) { \ + if (s == size) { \ + n0 = p[s]; \ + p[k] = n0; \ + if (temp < n0) k = s; \ + } \ + break; \ + } \ + n0 = p[k * 2]; \ + n1 = p[k * 2 + 1]; \ + s += n0 < n1; \ + GET_MAX_VAL(n0, n1, p[s]) \ + if (temp >= n0) break; \ + macro_prefetch(p, k, s, size) \ + p[k] = n0; \ + k = s; \ + } \ + p[k] = temp; \ } -void HeapSort64(UInt64 *p, size_t size) + +/* +stage-1 : O(n) : + we generate intermediate partially sorted binary tree: + p[0] : it's additional item for better alignment of tree structure in memory. + p[1] + p[2] p[3] + p[4] p[5] p[6] p[7] + ... + p[x] >= p[x * 2] + p[x] >= p[x * 2 + 1] + +stage-2 : O(n)*log2(N): + we move largest item p[0] from head of tree to the end of array + and insert last item to sorted binary tree. +*/ + +// (p) must be aligned for cache line size (64-bytes) for best performance + +void Z7_FASTCALL HeapSort(UInt32 *p, size_t size) { - if (size <= 1) + if (size < 2) return; - p--; - { - size_t i = size / 2; - do - { - UInt64 temp = p[i]; - size_t k = i; - HeapSortDown(p, k, size, temp) - } - while (--i != 0); - } - /* - do + if (size == 2) { - size_t k = 1; - UInt64 temp = p[size]; - p[size--] = p[1]; - HeapSortDown(p, k, size, temp) - } - while (size > 1); - */ - while (size > 3) - { - UInt64 temp = p[size]; - size_t k = (p[3] > p[2]) ? 3 : 2; - p[size--] = p[1]; - p[1] = p[k]; - HeapSortDown(p, k, size, temp) + const UInt32 a0 = p[0]; + const UInt32 a1 = p[1]; + const unsigned k = a1 < a0; + p[k] = a0; + p[k ^ 1] = a1; + return; } { - UInt64 temp = p[size]; - p[size] = p[1]; - if (size > 2 && p[2] < temp) + // stage-1 : O(n) + // we transform array to partially sorted binary tree. + size_t i = --size / 2; + // (size) now is the index of the last item in tree, + // if (i) { - p[1] = p[2]; - p[2] = temp; + do + { + const UInt32 temp = p[i]; + size_t k = i; + HeapSortDown(p, k, size, temp, PREFETCH_NO) + } + while (--i); + } + { + const UInt32 temp = p[0]; + const UInt32 a1 = p[1]; + if (temp < a1) + { + size_t k = 1; + p[0] = a1; + HeapSortDown(p, k, size, temp, PREFETCH_NO) + } } - else - p[1] = temp; } -} -/* -#define HeapSortRefDown(p, vals, n, size, temp) \ - { size_t k = n; UInt32 val = vals[temp]; for (;;) { \ - size_t s = (k << 1); \ - if (s > size) break; \ - if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \ - if (val >= vals[p[s]]) break; \ - p[k] = p[s]; k = s; \ - } p[k] = temp; } - -void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size) -{ - if (size <= 1) + if (size < 3) + { + // size == 2 + const UInt32 a0 = p[0]; + p[0] = p[2]; + p[2] = a0; return; - p--; + } + if (size != 3) { - size_t i = size / 2; + // stage-2 : O(size) * log2(size): + // we move largest item p[0] from head to the end of array, + // and insert last item to sorted binary tree. do { - UInt32 temp = p[i]; - HeapSortRefDown(p, vals, i, size, temp); + const UInt32 temp = p[size]; + size_t k = p[2] < p[3] ? 
3 : 2; + p[size--] = p[0]; + p[0] = p[1]; + p[1] = p[k]; + HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO } - while (--i != 0); + while (size != 3); } - do { - UInt32 temp = p[size]; - p[size--] = p[1]; - HeapSortRefDown(p, vals, 1, size, temp); + const UInt32 a2 = p[2]; + const UInt32 a3 = p[3]; + const size_t k = a2 < a3; + p[2] = p[1]; + p[3] = p[0]; + p[k] = a3; + p[k ^ 1] = a2; } - while (size > 1); } -*/ diff --git a/src/sdk/C/Sort.h b/src/sdk/C/Sort.h index 2e2963a..de5a4e8 100644 --- a/src/sdk/C/Sort.h +++ b/src/sdk/C/Sort.h @@ -1,17 +1,14 @@ /* Sort.h -- Sort functions -2014-04-05 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ -#ifndef __7Z_SORT_H -#define __7Z_SORT_H +#ifndef ZIP7_INC_SORT_H +#define ZIP7_INC_SORT_H #include "7zTypes.h" EXTERN_C_BEGIN -void HeapSort(UInt32 *p, size_t size); -void HeapSort64(UInt64 *p, size_t size); - -/* void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size); */ +void Z7_FASTCALL HeapSort(UInt32 *p, size_t size); EXTERN_C_END diff --git a/src/sdk/C/SwapBytes.c b/src/sdk/C/SwapBytes.c new file mode 100644 index 0000000..9290592 --- /dev/null +++ b/src/sdk/C/SwapBytes.c @@ -0,0 +1,835 @@ +/* SwapBytes.c -- Byte Swap conversion filter +2024-03-01 : Igor Pavlov : Public domain */ + +#include "Precomp.h" + +#include "Compiler.h" +#include "CpuArch.h" +#include "RotateDefs.h" +#include "SwapBytes.h" + +typedef UInt16 CSwapUInt16; +typedef UInt32 CSwapUInt32; + +// #define k_SwapBytes_Mode_BASE 0 + +#ifdef MY_CPU_X86_OR_AMD64 + +#define k_SwapBytes_Mode_SSE2 1 +#define k_SwapBytes_Mode_SSSE3 2 +#define k_SwapBytes_Mode_AVX2 3 + + // #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) + #if defined(__clang__) && (__clang_major__ >= 4) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701) + #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_AVX2 + #define SWAP_ATTRIB_SSE2 __attribute__((__target__("sse2"))) + #define SWAP_ATTRIB_SSSE3 __attribute__((__target__("ssse3"))) + #define SWAP_ATTRIB_AVX2 __attribute__((__target__("avx2"))) + #elif defined(_MSC_VER) + #if (_MSC_VER == 1900) + #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX + #endif + #if (_MSC_VER >= 1900) + #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_AVX2 + #elif (_MSC_VER >= 1500) // (VS2008) + #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_SSSE3 + #elif (_MSC_VER >= 1310) // (VS2003) + #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_SSE2 + #endif + #endif // _MSC_VER + +/* +// for debug +#ifdef k_SwapBytes_Mode_MAX +#undef k_SwapBytes_Mode_MAX +#endif +*/ + +#ifndef k_SwapBytes_Mode_MAX +#define k_SwapBytes_Mode_MAX 0 +#endif + +#if (k_SwapBytes_Mode_MAX != 0) && defined(MY_CPU_AMD64) + #define k_SwapBytes_Mode_MIN k_SwapBytes_Mode_SSE2 +#else + #define k_SwapBytes_Mode_MIN 0 +#endif + +#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_AVX2) + #define USE_SWAP_AVX2 +#endif +#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSSE3) + #define USE_SWAP_SSSE3 +#endif +#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSE2) + #define USE_SWAP_128 +#endif + +#if k_SwapBytes_Mode_MAX <= k_SwapBytes_Mode_MIN || !defined(USE_SWAP_128) +#define FORCE_SWAP_MODE +#endif + + +#ifdef USE_SWAP_128 +/* + MMX + SSE + SSE2 + SSE3 + SSSE3 + SSE4.1 + SSE4.2 + SSE4A + AES + AVX, AVX2, FMA +*/ + +#include // sse2 +// typedef __m128i v128; + +#define SWAP2_128(i) { \ + const __m128i v = *(const __m128i *)(const void *)(items + (i) * 8); \ + *( __m128i *)( void *)(items + (i) * 8) = \ + _mm_or_si128( \ + _mm_slli_epi16(v, 8), \ + 
_mm_srli_epi16(v, 8)); } +// _mm_or_si128() has more ports to execute than _mm_add_epi16(). + +static +#ifdef SWAP_ATTRIB_SSE2 +SWAP_ATTRIB_SSE2 +#endif +void +Z7_FASTCALL +SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP2_128(0) SWAP2_128(1) items += 2 * 8; + SWAP2_128(0) SWAP2_128(1) items += 2 * 8; + } + while (items != lim); +} + +/* +// sse2 +#define SWAP4_128_pack(i) { \ + __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \ + __m128i v0 = _mm_unpacklo_epi8(v, mask); \ + __m128i v1 = _mm_unpackhi_epi8(v, mask); \ + v0 = _mm_shufflelo_epi16(v0, 0x1b); \ + v1 = _mm_shufflelo_epi16(v1, 0x1b); \ + v0 = _mm_shufflehi_epi16(v0, 0x1b); \ + v1 = _mm_shufflehi_epi16(v1, 0x1b); \ + *(__m128i *)(void *)(items + (i) * 4) = _mm_packus_epi16(v0, v1); } + +static +#ifdef SWAP_ATTRIB_SSE2 +SWAP_ATTRIB_SSE2 +#endif +void +Z7_FASTCALL +SwapBytes4_128_pack(CSwapUInt32 *items, const CSwapUInt32 *lim) +{ + const __m128i mask = _mm_setzero_si128(); + // const __m128i mask = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP4_128_pack(0); items += 1 * 4; + // SWAP4_128_pack(0); SWAP4_128_pack(1); items += 2 * 4; + } + while (items != lim); +} + +// sse2 +#define SWAP4_128_shift(i) { \ + __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \ + __m128i v2; \ + v2 = _mm_or_si128( \ + _mm_slli_si128(_mm_and_si128(v, mask), 1), \ + _mm_and_si128(_mm_srli_si128(v, 1), mask)); \ + v = _mm_or_si128( \ + _mm_slli_epi32(v, 24), \ + _mm_srli_epi32(v, 24)); \ + *(__m128i *)(void *)(items + (i) * 4) = _mm_or_si128(v2, v); } + +static +#ifdef SWAP_ATTRIB_SSE2 +SWAP_ATTRIB_SSE2 +#endif +void +Z7_FASTCALL +SwapBytes4_128_shift(CSwapUInt32 *items, const CSwapUInt32 *lim) +{ + #define M1 0xff00 + const __m128i mask = _mm_set_epi32(M1, M1, M1, M1); + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + // SWAP4_128_shift(0) SWAP4_128_shift(1) items += 2 * 4; + // SWAP4_128_shift(0) SWAP4_128_shift(1) items += 2 * 4; + SWAP4_128_shift(0); items += 1 * 4; + } + while (items != lim); +} +*/ + + +#if defined(USE_SWAP_SSSE3) || defined(USE_SWAP_AVX2) + +#define SWAP_SHUF_REV_SEQ_2_VALS(v) (v)+1, (v) +#define SWAP_SHUF_REV_SEQ_4_VALS(v) (v)+3, (v)+2, (v)+1, (v) + +#define SWAP2_SHUF_MASK_16_BYTES \ + SWAP_SHUF_REV_SEQ_2_VALS (0 * 2), \ + SWAP_SHUF_REV_SEQ_2_VALS (1 * 2), \ + SWAP_SHUF_REV_SEQ_2_VALS (2 * 2), \ + SWAP_SHUF_REV_SEQ_2_VALS (3 * 2), \ + SWAP_SHUF_REV_SEQ_2_VALS (4 * 2), \ + SWAP_SHUF_REV_SEQ_2_VALS (5 * 2), \ + SWAP_SHUF_REV_SEQ_2_VALS (6 * 2), \ + SWAP_SHUF_REV_SEQ_2_VALS (7 * 2) + +#define SWAP4_SHUF_MASK_16_BYTES \ + SWAP_SHUF_REV_SEQ_4_VALS (0 * 4), \ + SWAP_SHUF_REV_SEQ_4_VALS (1 * 4), \ + SWAP_SHUF_REV_SEQ_4_VALS (2 * 4), \ + SWAP_SHUF_REV_SEQ_4_VALS (3 * 4) + +#if defined(USE_SWAP_AVX2) +/* if we use 256_BIT_INIT_MASK, each static array mask will be larger for 16 bytes */ +// #define SWAP_USE_256_BIT_INIT_MASK +#endif + +#if defined(SWAP_USE_256_BIT_INIT_MASK) && defined(USE_SWAP_AVX2) +#define SWAP_MASK_INIT_SIZE 32 +#else +#define SWAP_MASK_INIT_SIZE 16 +#endif + +MY_ALIGN(SWAP_MASK_INIT_SIZE) +static const Byte k_ShufMask_Swap2[] = +{ + SWAP2_SHUF_MASK_16_BYTES + #if SWAP_MASK_INIT_SIZE > 16 + , SWAP2_SHUF_MASK_16_BYTES + #endif +}; + +MY_ALIGN(SWAP_MASK_INIT_SIZE) +static const Byte k_ShufMask_Swap4[] = +{ + SWAP4_SHUF_MASK_16_BYTES + #if SWAP_MASK_INIT_SIZE > 16 + , SWAP4_SHUF_MASK_16_BYTES + #endif +}; + + +#ifdef USE_SWAP_SSSE3 + +#include // ssse3 + 
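/* Illustrative note (not part of the upstream patch): with the masks defined above,
   one pshufb performs the whole byte swap for a 16-byte vector:
     k_ShufMask_Swap2 = { 1,0, 3,2, 5,4, ... }    - reverses the 2 bytes of each UInt16
     k_ShufMask_Swap4 = { 3,2,1,0, 7,6,5,4, ... } - reverses the 4 bytes of each UInt32
   e.g. the UInt16 value 0x1122 stored as bytes {0x22,0x11} becomes {0x11,0x22} = 0x2211.
   A scalar equivalent of one such step for the Swap2 case, for reference only: */
#if 0
{
  Byte bytes[16];   /* one 16-byte block of the item buffer */
  unsigned i;
  for (i = 0; i < 16; i += 2)
  {
    const Byte t = bytes[i];
    bytes[i] = bytes[i + 1];
    bytes[i + 1] = t;
  }
}
#endif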
+#define SHUF_128(i)   *(items + (i)) = \
+    _mm_shuffle_epi8(*(items + (i)), mask); // SSSE3
+
+// Z7_NO_INLINE
+static
+#ifdef SWAP_ATTRIB_SSSE3
+SWAP_ATTRIB_SSSE3
+#endif
+Z7_ATTRIB_NO_VECTORIZE
+void
+Z7_FASTCALL
+ShufBytes_128(void *items8, const void *lim8, const void *mask128_ptr)
+{
+  __m128i *items = (__m128i *)items8;
+  const __m128i *lim = (const __m128i *)lim8;
+  // const __m128i mask = _mm_set_epi8(SHUF_SWAP2_MASK_16_VALS);
+  // const __m128i mask = _mm_set_epi8(SHUF_SWAP4_MASK_16_VALS);
+  // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
+  // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
+  // const __m128i mask = *(const __m128i *)(const void *)&(k_ShufMask_Swap4[0]);
+  const __m128i mask = *(const __m128i *)mask128_ptr;
+  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
+  do
+  {
+    SHUF_128(0)  SHUF_128(1)  items += 2;
+    SHUF_128(0)  SHUF_128(1)  items += 2;
+  }
+  while (items != lim);
+}
+
+#endif // USE_SWAP_SSSE3
+
+
+
+#ifdef USE_SWAP_AVX2
+
+#include <immintrin.h> // avx, avx2
+#if defined(__clang__)
+#include <avxintrin.h>
+#include <avx2intrin.h>
+#endif
+
+#define SHUF_256(i)   *(items + (i)) = \
+    _mm256_shuffle_epi8(*(items + (i)), mask); // AVX2
+
+// Z7_NO_INLINE
+static
+#ifdef SWAP_ATTRIB_AVX2
+SWAP_ATTRIB_AVX2
+#endif
+Z7_ATTRIB_NO_VECTORIZE
+void
+Z7_FASTCALL
+ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr)
+{
+  __m256i *items = (__m256i *)items8;
+  const __m256i *lim = (const __m256i *)lim8;
+  /*
+  UNUSED_VAR(mask128_ptr)
+  __m256i mask =
+  for Swap4: _mm256_setr_epi8(SWAP4_SHUF_MASK_16_BYTES, SWAP4_SHUF_MASK_16_BYTES);
+  for Swap2: _mm256_setr_epi8(SWAP2_SHUF_MASK_16_BYTES, SWAP2_SHUF_MASK_16_BYTES);
+  */
+  const __m256i mask =
+  #if SWAP_MASK_INIT_SIZE > 16
+      *(const __m256i *)(const void *)mask128_ptr;
+  #else
+  /* msvc: broadcastsi128() version reserves the stack for no reason
+     msvc 19.29-: _mm256_insertf128_si256() / _mm256_set_m128i)) versions use non-avx movdqu xmm0,XMMWORD PTR [r8]
+     msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m,m) to vbroadcastf128(m) as we want
+  */
+      // _mm256_broadcastsi128_si256(*mask128_ptr);
+#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000)
+  #define MY_mm256_set_m128i(hi, lo)  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
+#else
+  #define MY_mm256_set_m128i  _mm256_set_m128i
+#endif
+      MY_mm256_set_m128i(
+        *(const __m128i *)mask128_ptr,
+        *(const __m128i *)mask128_ptr);
+  #endif
+
+  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
+  do
+  {
+    SHUF_256(0)  SHUF_256(1)  items += 2;
+    SHUF_256(0)  SHUF_256(1)  items += 2;
+  }
+  while (items != lim);
+}
+
+#endif // USE_SWAP_AVX2
+#endif // USE_SWAP_SSSE3 || USE_SWAP_AVX2
+#endif // USE_SWAP_128
+
+
+
+
+
+
+// compile message "NEON intrinsics not available with the soft-float ABI"
+#elif defined(MY_CPU_ARM_OR_ARM64) \
+  && defined(MY_CPU_LE) \
+  && !defined(Z7_DISABLE_ARM_NEON)
+
+  #if defined(__clang__) && (__clang_major__ >= 8) \
+    || defined(__GNUC__) && (__GNUC__ >= 6)
+    #if defined(__ARM_FP)
+    #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 4)) \
+       || defined(MY_CPU_ARM64)
+    #if defined(MY_CPU_ARM64) \
+       || !defined(Z7_CLANG_VERSION) \
+       || defined(__ARM_NEON)
+      #define USE_SWAP_128
+      #ifdef MY_CPU_ARM64
+        // #define SWAP_ATTRIB_NEON __attribute__((__target__("")))
+      #else
+#if defined(Z7_CLANG_VERSION)
+        // #define SWAP_ATTRIB_NEON __attribute__((__target__("neon")))
+#else
+        // #pragma message("SWAP_ATTRIB_NEON __attribute__((__target__(fpu=neon))")
+        #define SWAP_ATTRIB_NEON
__attribute__((__target__("fpu=neon"))) +#endif + #endif // MY_CPU_ARM64 + #endif // __ARM_NEON + #endif // __ARM_ARCH + #endif // __ARM_FP + + #elif defined(_MSC_VER) + #if (_MSC_VER >= 1910) + #define USE_SWAP_128 + #endif + #endif + + #ifdef USE_SWAP_128 + #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) + #include + #else + +/* +#if !defined(__ARM_NEON) +#if defined(Z7_GCC_VERSION) && (__GNUC__ < 5) \ + || defined(Z7_GCC_VERSION) && (__GNUC__ == 5) && (Z7_GCC_VERSION < 90201) \ + || defined(Z7_GCC_VERSION) && (__GNUC__ == 5) && (Z7_GCC_VERSION < 100100) +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +#pragma message("#define __ARM_NEON 1") +// #define __ARM_NEON 1 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif +#endif +*/ + #include + #endif + #endif + +#ifndef USE_SWAP_128 + #define FORCE_SWAP_MODE +#else + +#ifdef MY_CPU_ARM64 + // for debug : comment it + #define FORCE_SWAP_MODE +#else + #define k_SwapBytes_Mode_NEON 1 +#endif +// typedef uint8x16_t v128; +#define SWAP2_128(i) *(uint8x16_t *) (void *)(items + (i) * 8) = \ + vrev16q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 8)); +#define SWAP4_128(i) *(uint8x16_t *) (void *)(items + (i) * 4) = \ + vrev32q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 4)); + +// Z7_NO_INLINE +static +#ifdef SWAP_ATTRIB_NEON +SWAP_ATTRIB_NEON +#endif +Z7_ATTRIB_NO_VECTORIZE +void +Z7_FASTCALL +SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP2_128(0) SWAP2_128(1) items += 2 * 8; + SWAP2_128(0) SWAP2_128(1) items += 2 * 8; + } + while (items != lim); +} + +// Z7_NO_INLINE +static +#ifdef SWAP_ATTRIB_NEON +SWAP_ATTRIB_NEON +#endif +Z7_ATTRIB_NO_VECTORIZE +void +Z7_FASTCALL +SwapBytes4_128(CSwapUInt32 *items, const CSwapUInt32 *lim) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP4_128(0) SWAP4_128(1) items += 2 * 4; + SWAP4_128(0) SWAP4_128(1) items += 2 * 4; + } + while (items != lim); +} + +#endif // USE_SWAP_128 + +#else // MY_CPU_ARM_OR_ARM64 +#define FORCE_SWAP_MODE +#endif // MY_CPU_ARM_OR_ARM64 + + + + + + +#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_X86) + /* _byteswap_ushort() in MSVC x86 32-bit works via slow { mov dh, al; mov dl, ah } + So we use own versions of byteswap function */ + #if (_MSC_VER < 1400 ) // old MSVC-X86 without _rotr16() support + #define SWAP2_16(i) { UInt32 v = items[i]; v += (v << 16); v >>= 8; items[i] = (CSwapUInt16)v; } + #else // is new MSVC-X86 with fast _rotr16() + #include + #define SWAP2_16(i) { items[i] = _rotr16(items[i], 8); } + #endif +#else // is not MSVC-X86 + #define SWAP2_16(i) { CSwapUInt16 v = items[i]; items[i] = Z7_BSWAP16(v); } +#endif // MSVC-X86 + +#if defined(Z7_CPU_FAST_BSWAP_SUPPORTED) + #define SWAP4_32(i) { CSwapUInt32 v = items[i]; items[i] = Z7_BSWAP32(v); } +#else + #define SWAP4_32(i) \ + { UInt32 v = items[i]; \ + v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff); \ + v = rotlFixed(v, 16); \ + items[i] = v; } +#endif + + + + +#if defined(FORCE_SWAP_MODE) && defined(USE_SWAP_128) + #define DEFAULT_Swap2 SwapBytes2_128 + #if !defined(MY_CPU_X86_OR_AMD64) + #define DEFAULT_Swap4 SwapBytes4_128 + #endif +#endif + +#if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4) + +#define SWAP_BASE_FUNCS_PREFIXES \ +Z7_FORCE_INLINE \ +static \ +Z7_ATTRIB_NO_VECTOR \ +void Z7_FASTCALL + + +#if defined(MY_CPU_ARM_OR_ARM64) +#if defined(__clang__) +#pragma GCC diagnostic ignored "-Wlanguage-extension-token" +#endif +#endif + + +#ifdef MY_CPU_64BIT + +#if 
defined(MY_CPU_ARM64) \ + && defined(__ARM_ARCH) && (__ARM_ARCH >= 8) \ + && ( (defined(__GNUC__) && (__GNUC__ >= 4)) \ + || (defined(__clang__) && (__clang_major__ >= 4))) + + #define SWAP2_64_VAR(v) asm ("rev16 %x0,%x0" : "+r" (v)); + #define SWAP4_64_VAR(v) asm ("rev32 %x0,%x0" : "+r" (v)); + +#else // is not ARM64-GNU + +#if !defined(MY_CPU_X86_OR_AMD64) || (k_SwapBytes_Mode_MIN == 0) || !defined(USE_SWAP_128) + #define SWAP2_64_VAR(v) \ + v = ( 0x00ff00ff00ff00ff & (v >> 8)) \ + + ((0x00ff00ff00ff00ff & v) << 8); + /* plus gives faster code in MSVC */ +#endif + +#ifdef Z7_CPU_FAST_BSWAP_SUPPORTED + #define SWAP4_64_VAR(v) \ + v = Z7_BSWAP64(v); \ + v = Z7_ROTL64(v, 32); +#else + #define SWAP4_64_VAR(v) \ + v = ( 0x000000ff000000ff & (v >> 24)) \ + + ((0x000000ff000000ff & v) << 24 ) \ + + ( 0x0000ff000000ff00 & (v >> 8)) \ + + ((0x0000ff000000ff00 & v) << 8 ) \ + ; +#endif + +#endif // ARM64-GNU + + +#ifdef SWAP2_64_VAR + +#define SWAP2_64(i) { \ + UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 4); \ + SWAP2_64_VAR(v) \ + *(UInt64 *)(void *)(items + (i) * 4) = v; } + +SWAP_BASE_FUNCS_PREFIXES +SwapBytes2_64(CSwapUInt16 *items, const CSwapUInt16 *lim) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP2_64(0) SWAP2_64(1) items += 2 * 4; + SWAP2_64(0) SWAP2_64(1) items += 2 * 4; + } + while (items != lim); +} + + #define DEFAULT_Swap2 SwapBytes2_64 + #if !defined(FORCE_SWAP_MODE) + #define SWAP2_DEFAULT_MODE 0 + #endif +#else // !defined(SWAP2_64_VAR) + #define DEFAULT_Swap2 SwapBytes2_128 + #if !defined(FORCE_SWAP_MODE) + #define SWAP2_DEFAULT_MODE 1 + #endif +#endif // SWAP2_64_VAR + + +#define SWAP4_64(i) { \ + UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 2); \ + SWAP4_64_VAR(v) \ + *(UInt64 *)(void *)(items + (i) * 2) = v; } + +SWAP_BASE_FUNCS_PREFIXES +SwapBytes4_64(CSwapUInt32 *items, const CSwapUInt32 *lim) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP4_64(0) SWAP4_64(1) items += 2 * 2; + SWAP4_64(0) SWAP4_64(1) items += 2 * 2; + } + while (items != lim); +} + +#define DEFAULT_Swap4 SwapBytes4_64 + +#else // is not 64BIT + + +#if defined(MY_CPU_ARM_OR_ARM64) \ + && defined(__ARM_ARCH) && (__ARM_ARCH >= 6) \ + && ( (defined(__GNUC__) && (__GNUC__ >= 4)) \ + || (defined(__clang__) && (__clang_major__ >= 4))) + +#ifdef MY_CPU_64BIT + #define SWAP2_32_VAR(v) asm ("rev16 %w0,%w0" : "+r" (v)); +#else + #define SWAP2_32_VAR(v) asm ("rev16 %0,%0" : "+r" (v)); // for clang/gcc + // asm ("rev16 %r0,%r0" : "+r" (a)); // for gcc +#endif + +#elif defined(_MSC_VER) && (_MSC_VER < 1300) && defined(MY_CPU_X86) \ + || !defined(Z7_CPU_FAST_BSWAP_SUPPORTED) \ + || !defined(Z7_CPU_FAST_ROTATE_SUPPORTED) + // old msvc doesn't support _byteswap_ulong() + #define SWAP2_32_VAR(v) \ + v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff); + +#else // is not ARM and is not old-MSVC-X86 and fast BSWAP/ROTATE are supported + #define SWAP2_32_VAR(v) \ + v = Z7_BSWAP32(v); \ + v = rotlFixed(v, 16); + +#endif // GNU-ARM* + +#define SWAP2_32(i) { \ + UInt32 v = *(const UInt32 *)(const void *)(items + (i) * 2); \ + SWAP2_32_VAR(v); \ + *(UInt32 *)(void *)(items + (i) * 2) = v; } + + +SWAP_BASE_FUNCS_PREFIXES +SwapBytes2_32(CSwapUInt16 *items, const CSwapUInt16 *lim) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP2_32(0) SWAP2_32(1) items += 2 * 2; + SWAP2_32(0) SWAP2_32(1) items += 2 * 2; + } + while (items != lim); +} + + +SWAP_BASE_FUNCS_PREFIXES +SwapBytes4_32(CSwapUInt32 *items, const CSwapUInt32 *lim) +{ + 
Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + do + { + SWAP4_32(0) SWAP4_32(1) items += 2; + SWAP4_32(0) SWAP4_32(1) items += 2; + } + while (items != lim); +} + +#define DEFAULT_Swap2 SwapBytes2_32 +#define DEFAULT_Swap4 SwapBytes4_32 +#if !defined(FORCE_SWAP_MODE) + #define SWAP2_DEFAULT_MODE 0 +#endif + +#endif // MY_CPU_64BIT +#endif // if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4) + + + +#if !defined(FORCE_SWAP_MODE) +static unsigned g_SwapBytes_Mode; +#endif + +/* size of largest unrolled loop iteration: 128 bytes = 4 * 32 bytes (AVX). */ +#define SWAP_ITERATION_BLOCK_SIZE_MAX (1 << 7) + +// 32 bytes for (AVX) or 2 * 16-bytes for NEON. +#define SWAP_VECTOR_ALIGN_SIZE (1 << 5) + +Z7_NO_INLINE +void z7_SwapBytes2(CSwapUInt16 *items, size_t numItems) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--) + { + SWAP2_16(0) + items++; + } + { + const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt16) - 1; + size_t numItems2 = numItems; + CSwapUInt16 *lim; + numItems &= k_Align_Mask; + numItems2 &= ~(size_t)k_Align_Mask; + lim = items + numItems2; + if (numItems2 != 0) + { + #if !defined(FORCE_SWAP_MODE) + #ifdef MY_CPU_X86_OR_AMD64 + #ifdef USE_SWAP_AVX2 + if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3) + ShufBytes_256((__m256i *)(void *)items, + (const __m256i *)(const void *)lim, + (const __m128i *)(const void *)&(k_ShufMask_Swap2[0])); + else + #endif + #ifdef USE_SWAP_SSSE3 + if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3) + ShufBytes_128((__m128i *)(void *)items, + (const __m128i *)(const void *)lim, + (const __m128i *)(const void *)&(k_ShufMask_Swap2[0])); + else + #endif + #endif // MY_CPU_X86_OR_AMD64 + #if SWAP2_DEFAULT_MODE == 0 + if (g_SwapBytes_Mode != 0) + SwapBytes2_128(items, lim); + else + #endif + #endif // FORCE_SWAP_MODE + DEFAULT_Swap2(items, lim); + } + items = lim; + } + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0; numItems--) + { + SWAP2_16(0) + items++; + } +} + + +Z7_NO_INLINE +void z7_SwapBytes4(CSwapUInt32 *items, size_t numItems) +{ + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--) + { + SWAP4_32(0) + items++; + } + { + const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt32) - 1; + size_t numItems2 = numItems; + CSwapUInt32 *lim; + numItems &= k_Align_Mask; + numItems2 &= ~(size_t)k_Align_Mask; + lim = items + numItems2; + if (numItems2 != 0) + { + #if !defined(FORCE_SWAP_MODE) + #ifdef MY_CPU_X86_OR_AMD64 + #ifdef USE_SWAP_AVX2 + if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3) + ShufBytes_256((__m256i *)(void *)items, + (const __m256i *)(const void *)lim, + (const __m128i *)(const void *)&(k_ShufMask_Swap4[0])); + else + #endif + #ifdef USE_SWAP_SSSE3 + if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3) + ShufBytes_128((__m128i *)(void *)items, + (const __m128i *)(const void *)lim, + (const __m128i *)(const void *)&(k_ShufMask_Swap4[0])); + else + #endif + #else // MY_CPU_X86_OR_AMD64 + + if (g_SwapBytes_Mode != 0) + SwapBytes4_128(items, lim); + else + #endif // MY_CPU_X86_OR_AMD64 + #endif // FORCE_SWAP_MODE + DEFAULT_Swap4(items, lim); + } + items = lim; + } + Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE + for (; numItems != 0; numItems--) + { + SWAP4_32(0) + items++; + } +} + + +// #define SHOW_HW_STATUS + +#ifdef SHOW_HW_STATUS +#include +#define PRF(x) x +#else +#define PRF(x) +#endif + 
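/* Usage sketch (illustrative, not part of the upstream patch): the intended call
   pattern for this filter, based on SwapBytes.h and the dispatchers above.
   z7_SwapBytesPrepare() picks the best available implementation once (SSSE3 / AVX2 /
   NEON, falling back to the scalar and 64-bit paths); after that z7_SwapBytes2() and
   z7_SwapBytes4() can be called on any buffer. The function ConvertSamples and its
   arguments are hypothetical. */
#if 0
#include "SwapBytes.h"

static void ConvertSamples(UInt16 *samples, size_t numSamples)
{
  z7_SwapBytesPrepare();               /* one-time CPU feature detection */
  z7_SwapBytes2(samples, numSamples);  /* byte-swap every 16-bit item in place */
}
#endif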
+void z7_SwapBytesPrepare(void) +{ +#ifndef FORCE_SWAP_MODE + unsigned mode = 0; // k_SwapBytes_Mode_BASE; + +#ifdef MY_CPU_ARM_OR_ARM64 + { + if (CPU_IsSupported_NEON()) + { + // #pragma message ("=== SwapBytes NEON") + PRF(printf("\n=== SwapBytes NEON\n");) + mode = k_SwapBytes_Mode_NEON; + } + } +#else // MY_CPU_ARM_OR_ARM64 + { + #ifdef USE_SWAP_AVX2 + if (CPU_IsSupported_AVX2()) + { + // #pragma message ("=== SwapBytes AVX2") + PRF(printf("\n=== SwapBytes AVX2\n");) + mode = k_SwapBytes_Mode_AVX2; + } + else + #endif + #ifdef USE_SWAP_SSSE3 + if (CPU_IsSupported_SSSE3()) + { + // #pragma message ("=== SwapBytes SSSE3") + PRF(printf("\n=== SwapBytes SSSE3\n");) + mode = k_SwapBytes_Mode_SSSE3; + } + else + #endif + #if !defined(MY_CPU_AMD64) + if (CPU_IsSupported_SSE2()) + #endif + { + // #pragma message ("=== SwapBytes SSE2") + PRF(printf("\n=== SwapBytes SSE2\n");) + mode = k_SwapBytes_Mode_SSE2; + } + } +#endif // MY_CPU_ARM_OR_ARM64 + g_SwapBytes_Mode = mode; + // g_SwapBytes_Mode = 0; // for debug +#endif // FORCE_SWAP_MODE + PRF(printf("\n=== SwapBytesPrepare\n");) +} + +#undef PRF diff --git a/src/sdk/C/SwapBytes.h b/src/sdk/C/SwapBytes.h new file mode 100644 index 0000000..d442467 --- /dev/null +++ b/src/sdk/C/SwapBytes.h @@ -0,0 +1,17 @@ +/* SwapBytes.h -- Byte Swap conversion filter +2023-04-02 : Igor Pavlov : Public domain */ + +#ifndef ZIP7_INC_SWAP_BYTES_H +#define ZIP7_INC_SWAP_BYTES_H + +#include "7zTypes.h" + +EXTERN_C_BEGIN + +void z7_SwapBytes2(UInt16 *data, size_t numItems); +void z7_SwapBytes4(UInt32 *data, size_t numItems); +void z7_SwapBytesPrepare(void); + +EXTERN_C_END + +#endif diff --git a/src/sdk/C/Threads.c b/src/sdk/C/Threads.c index 930ad27..177d1d9 100644 --- a/src/sdk/C/Threads.c +++ b/src/sdk/C/Threads.c @@ -1,17 +1,19 @@ /* Threads.c -- multithreading library -2017-06-26 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" -#ifndef UNDER_CE +#ifdef _WIN32 + +#ifndef USE_THREADS_CreateThread #include #endif #include "Threads.h" -static WRes GetError() +static WRes GetError(void) { - DWORD res = GetLastError(); + const DWORD res = GetLastError(); return res ? (WRes)res : 1; } @@ -29,28 +31,310 @@ WRes HandlePtr_Close(HANDLE *p) return 0; } -WRes Handle_WaitObject(HANDLE h) { return (WRes)WaitForSingleObject(h, INFINITE); } +WRes Handle_WaitObject(HANDLE h) +{ + DWORD dw = WaitForSingleObject(h, INFINITE); + /* + (dw) result: + WAIT_OBJECT_0 // 0 + WAIT_ABANDONED // 0x00000080 : is not compatible with Win32 Error space + WAIT_TIMEOUT // 0x00000102 : is compatible with Win32 Error space + WAIT_FAILED // 0xFFFFFFFF + */ + if (dw == WAIT_FAILED) + { + dw = GetLastError(); + if (dw == 0) + return WAIT_FAILED; + } + return (WRes)dw; +} + +#define Thread_Wait(p) Handle_WaitObject(*(p)) + +WRes Thread_Wait_Close(CThread *p) +{ + WRes res = Thread_Wait(p); + WRes res2 = Thread_Close(p); + return (res != 0 ? 
res : res2); +} + +typedef struct MY_PROCESSOR_NUMBER { + WORD Group; + BYTE Number; + BYTE Reserved; +} MY_PROCESSOR_NUMBER, *MY_PPROCESSOR_NUMBER; + +typedef struct MY_GROUP_AFFINITY { +#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 100000) + // KAFFINITY is not defined in old mingw + ULONG_PTR +#else + KAFFINITY +#endif + Mask; + WORD Group; + WORD Reserved[3]; +} MY_GROUP_AFFINITY, *MY_PGROUP_AFFINITY; + +typedef BOOL (WINAPI *Func_SetThreadGroupAffinity)( + HANDLE hThread, + CONST MY_GROUP_AFFINITY *GroupAffinity, + MY_PGROUP_AFFINITY PreviousGroupAffinity); + +typedef BOOL (WINAPI *Func_GetThreadGroupAffinity)( + HANDLE hThread, + MY_PGROUP_AFFINITY GroupAffinity); + +typedef BOOL (WINAPI *Func_GetProcessGroupAffinity)( + HANDLE hProcess, + PUSHORT GroupCount, + PUSHORT GroupArray); + +Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION + +#if 0 +#include +#define PRF(x) x +/* +-- + before call of SetThreadGroupAffinity() + GetProcessGroupAffinity return one group. + after call of SetThreadGroupAffinity(): + GetProcessGroupAffinity return more than group, + if SetThreadGroupAffinity() was to another group. +-- + GetProcessAffinityMask MS DOCs: + { + If the calling process contains threads in multiple groups, + the function returns zero for both affinity masks. + } + but tests in win10 with 2 groups (less than 64 cores total): + GetProcessAffinityMask() still returns non-zero affinity masks + even after SetThreadGroupAffinity() calls. +*/ +static void PrintProcess_Info() +{ + { + const + Func_GetProcessGroupAffinity fn_GetProcessGroupAffinity = + (Func_GetProcessGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "GetProcessGroupAffinity"); + if (fn_GetProcessGroupAffinity) + { + unsigned i; + USHORT GroupCounts[64]; + USHORT GroupCount = Z7_ARRAY_SIZE(GroupCounts); + BOOL boolRes = fn_GetProcessGroupAffinity(GetCurrentProcess(), + &GroupCount, GroupCounts); + printf("\n====== GetProcessGroupAffinity : " + "boolRes=%u GroupCounts = %u :", + boolRes, (unsigned)GroupCount); + for (i = 0; i < GroupCount; i++) + printf(" %u", GroupCounts[i]); + printf("\n"); + } + } + { + DWORD_PTR processAffinityMask, systemAffinityMask; + if (GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask)) + { + PRF(printf("\n====== GetProcessAffinityMask : " + ": processAffinityMask=%x, systemAffinityMask=%x\n", + (UInt32)processAffinityMask, (UInt32)systemAffinityMask);) + } + else + printf("\n==GetProcessAffinityMask FAIL"); + } +} +#else +#ifndef USE_THREADS_CreateThread +// #define PRF(x) +#endif +#endif WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) { /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ + + #ifdef USE_THREADS_CreateThread + + DWORD threadId; + *p = CreateThread(NULL, 0, func, param, 0, &threadId); - #ifdef UNDER_CE + #else - DWORD threadId; - *p = CreateThread(0, 0, func, param, 0, &threadId); + unsigned threadId; + *p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId)); - #else +#if 0 // 1 : for debug + { + DWORD_PTR prevMask; + DWORD_PTR affinity = 1 << 0; + prevMask = SetThreadAffinityMask(*p, (DWORD_PTR)affinity); + prevMask = prevMask; + } +#endif +#if 0 // 1 : for debug + { + /* win10: new thread will be created in same group that is assigned to parent thread + but affinity mask will contain all allowed threads of that group, + even if affinity mask of parent group is not full + win11: what group it will be created, if we have set + affinity of parent 
thread with ThreadGroupAffinity? + */ + const + Func_GetThreadGroupAffinity fn = + (Func_GetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "GetThreadGroupAffinity"); + if (fn) + { + // BOOL wres2; + MY_GROUP_AFFINITY groupAffinity; + memset(&groupAffinity, 0, sizeof(groupAffinity)); + /* wres2 = */ fn(*p, &groupAffinity); + PRF(printf("\n==Thread_Create cur = %6u GetThreadGroupAffinity(): " + "wres2_BOOL = %u, group=%u mask=%x\n", + GetCurrentThreadId(), + wres2, + groupAffinity.Group, + (UInt32)groupAffinity.Mask);) + } + } +#endif - unsigned threadId; - *p = (HANDLE)_beginthreadex(NULL, 0, func, param, 0, &threadId); - #endif /* maybe we must use errno here, but probably GetLastError() is also OK. */ return HandleToWRes(*p); } + +WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) +{ + #ifdef USE_THREADS_CreateThread + + UNUSED_VAR(affinity) + return Thread_Create(p, func, param); + + #else + + /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ + HANDLE h; + WRes wres; + unsigned threadId; + h = (HANDLE)(_beginthreadex(NULL, 0, func, param, CREATE_SUSPENDED, &threadId)); + *p = h; + wres = HandleToWRes(h); + if (h) + { + { + // DWORD_PTR prevMask = + SetThreadAffinityMask(h, (DWORD_PTR)affinity); + /* + if (prevMask == 0) + { + // affinity change is non-critical error, so we can ignore it + // wres = GetError(); + } + */ + } + { + const DWORD prevSuspendCount = ResumeThread(h); + /* ResumeThread() returns: + 0 : was_not_suspended + 1 : was_resumed + -1 : error + */ + if (prevSuspendCount == (DWORD)-1) + wres = GetError(); + } + } + + /* maybe we must use errno here, but probably GetLastError() is also OK. 
*/ + return wres; + + #endif +} + + +WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask) +{ +#ifdef USE_THREADS_CreateThread + + UNUSED_VAR(group) + UNUSED_VAR(affinityMask) + return Thread_Create(p, func, param); + +#else + + /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ + HANDLE h; + WRes wres; + unsigned threadId; + h = (HANDLE)(_beginthreadex(NULL, 0, func, param, CREATE_SUSPENDED, &threadId)); + *p = h; + wres = HandleToWRes(h); + if (h) + { + // PrintProcess_Info(); + { + const + Func_SetThreadGroupAffinity fn = + (Func_SetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "SetThreadGroupAffinity"); + if (fn) + { + // WRes wres2; + MY_GROUP_AFFINITY groupAffinity, prev_groupAffinity; + memset(&groupAffinity, 0, sizeof(groupAffinity)); + // groupAffinity.Mask must use only bits that supported by current group + // (groupAffinity.Mask = 0) means all allowed bits + groupAffinity.Mask = affinityMask; + groupAffinity.Group = (WORD)group; + // wres2 = + fn(h, &groupAffinity, &prev_groupAffinity); + /* + if (groupAffinity.Group == prev_groupAffinity.Group) + wres2 = wres2; + else + wres2 = wres2; + if (wres2 == 0) + { + wres2 = GetError(); + PRF(printf("\n==SetThreadGroupAffinity error: %u\n", wres2);) + } + else + { + PRF(printf("\n==Thread_Create_With_Group::SetThreadGroupAffinity()" + " threadId = %6u" + " group=%u mask=%x\n", + threadId, + prev_groupAffinity.Group, + (UInt32)prev_groupAffinity.Mask);) + } + */ + } + } + { + const DWORD prevSuspendCount = ResumeThread(h); + /* ResumeThread() returns: + 0 : was_not_suspended + 1 : was_resumed + -1 : error + */ + if (prevSuspendCount == (DWORD)-1) + wres = GetError(); + } + } + + /* maybe we must use errno here, but probably GetLastError() is also OK. */ + return wres; + + #endif +} + + static WRes Event_Create(CEvent *p, BOOL manualReset, int signaled) { *p = CreateEvent(NULL, manualReset, (signaled ? 
TRUE : FALSE), NULL); @@ -68,10 +352,22 @@ WRes AutoResetEvent_CreateNotSignaled(CAutoResetEvent *p) { return AutoResetEven WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount) { + // negative ((LONG)maxCount) is not supported in WIN32::CreateSemaphore() *p = CreateSemaphore(NULL, (LONG)initCount, (LONG)maxCount, NULL); return HandleToWRes(*p); } +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount) +{ + // if (Semaphore_IsCreated(p)) + { + WRes wres = Semaphore_Close(p); + if (wres != 0) + return wres; + } + return Semaphore_Create(p, initCount, maxCount); +} + static WRes Semaphore_Release(CSemaphore *p, LONG releaseCount, LONG *previousCount) { return BOOLToWRes(ReleaseSemaphore(*p, releaseCount, previousCount)); } WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 num) @@ -80,8 +376,13 @@ WRes Semaphore_Release1(CSemaphore *p) { return Semaphore_ReleaseN(p, 1); } WRes CriticalSection_Init(CCriticalSection *p) { - /* InitializeCriticalSection can raise only STATUS_NO_MEMORY exception */ + /* InitializeCriticalSection() can raise exception: + Windows XP, 2003 : can raise a STATUS_NO_MEMORY exception + Windows Vista+ : no exceptions */ #ifdef _MSC_VER + #ifdef __clang__ + #pragma GCC diagnostic ignored "-Wlanguage-extension-token" + #endif __try #endif { @@ -89,7 +390,423 @@ WRes CriticalSection_Init(CCriticalSection *p) /* InitializeCriticalSectionAndSpinCount(p, 0); */ } #ifdef _MSC_VER - __except (EXCEPTION_EXECUTE_HANDLER) { return 1; } + __except (EXCEPTION_EXECUTE_HANDLER) { return ERROR_NOT_ENOUGH_MEMORY; } #endif return 0; } + + + + +#else // _WIN32 + +// ---------- POSIX ---------- + +#if defined(__linux__) && !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__) +#ifndef Z7_AFFINITY_DISABLE +// _GNU_SOURCE can be required for pthread_setaffinity_np() / CPU_ZERO / CPU_SET +// clang < 3.6 : unknown warning group '-Wreserved-id-macro' +// clang 3.6 - 12.01 : gives warning "macro name is a reserved identifier" +// clang >= 13 : do not give warning +#if !defined(_GNU_SOURCE) +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +// #define _GNU_SOURCE +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif // !defined(_GNU_SOURCE) +#endif // Z7_AFFINITY_DISABLE +#endif // __linux__ + +#include "Threads.h" + +#include +#include +#include +#ifdef Z7_AFFINITY_SUPPORTED +// #include +#endif + + +// #include +// #define PRF(p) p +#define PRF(p) +#define Print(s) PRF(printf("\n%s\n", s);) + +WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet) +{ + // new thread in Posix probably inherits affinity from parrent thread + Print("Thread_Create_With_CpuSet") + + pthread_attr_t attr; + int ret; + // int ret2; + + p->_created = 0; + + RINOK(pthread_attr_init(&attr)) + + ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + if (!ret) + { + if (cpuSet) + { + // pthread_attr_setaffinity_np() is not supported for MUSL compile. + // so we check for __GLIBC__ here +#if defined(Z7_AFFINITY_SUPPORTED) && defined( __GLIBC__) + /* + printf("\n affinity :"); + unsigned i; + for (i = 0; i < sizeof(*cpuSet) && i < 8; i++) + { + Byte b = *((const Byte *)cpuSet + i); + char temp[32]; + #define GET_HEX_CHAR(t) ((char)(((t < 10) ? 
('0' + t) : ('A' + (t - 10))))) + temp[0] = GET_HEX_CHAR((b & 0xF)); + temp[1] = GET_HEX_CHAR((b >> 4)); + // temp[0] = GET_HEX_CHAR((b >> 4)); // big-endian + // temp[1] = GET_HEX_CHAR((b & 0xF)); // big-endian + temp[2] = 0; + printf("%s", temp); + } + printf("\n"); + */ + + // ret2 = + pthread_attr_setaffinity_np(&attr, sizeof(*cpuSet), cpuSet); + // if (ret2) ret = ret2; +#endif + } + + ret = pthread_create(&p->_tid, &attr, func, param); + + if (!ret) + { + p->_created = 1; + /* + if (cpuSet) + { + // ret2 = + pthread_setaffinity_np(p->_tid, sizeof(*cpuSet), cpuSet); + // if (ret2) ret = ret2; + } + */ + } + } + // ret2 = + pthread_attr_destroy(&attr); + // if (ret2 != 0) ret = ret2; + return ret; +} + + +WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) +{ + return Thread_Create_With_CpuSet(p, func, param, NULL); +} + +/* +WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinity) +{ + UNUSED_VAR(group) + return Thread_Create_With_Affinity(p, func, param, affinity); +} +*/ + +WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) +{ + Print("Thread_Create_WithAffinity") + CCpuSet cs; + unsigned i; + CpuSet_Zero(&cs); + for (i = 0; i < sizeof(affinity) * 8; i++) + { + if (affinity == 0) + break; + if (affinity & 1) + { + CpuSet_Set(&cs, i); + } + affinity >>= 1; + } + return Thread_Create_With_CpuSet(p, func, param, &cs); +} + + +WRes Thread_Close(CThread *p) +{ + // Print("Thread_Close") + int ret; + if (!p->_created) + return 0; + + ret = pthread_detach(p->_tid); + p->_tid = 0; + p->_created = 0; + return ret; +} + + +WRes Thread_Wait_Close(CThread *p) +{ + // Print("Thread_Wait_Close") + void *thread_return; + int ret; + if (!p->_created) + return EINVAL; + + ret = pthread_join(p->_tid, &thread_return); + // probably we can't use that (_tid) after pthread_join(), so we close thread here + p->_created = 0; + p->_tid = 0; + return ret; +} + + + +static WRes Event_Create(CEvent *p, int manualReset, int signaled) +{ + RINOK(pthread_mutex_init(&p->_mutex, NULL)) + RINOK(pthread_cond_init(&p->_cond, NULL)) + p->_manual_reset = manualReset; + p->_state = (signaled ? True : False); + p->_created = 1; + return 0; +} + +WRes ManualResetEvent_Create(CManualResetEvent *p, int signaled) + { return Event_Create(p, True, signaled); } +WRes ManualResetEvent_CreateNotSignaled(CManualResetEvent *p) + { return ManualResetEvent_Create(p, 0); } +WRes AutoResetEvent_Create(CAutoResetEvent *p, int signaled) + { return Event_Create(p, False, signaled); } +WRes AutoResetEvent_CreateNotSignaled(CAutoResetEvent *p) + { return AutoResetEvent_Create(p, 0); } + + +#if defined(Z7_LLVM_CLANG_VERSION) && (__clang_major__ == 13) +// freebsd: +#pragma GCC diagnostic ignored "-Wthread-safety-analysis" +#endif + +WRes Event_Set(CEvent *p) +{ + RINOK(pthread_mutex_lock(&p->_mutex)) + p->_state = True; + { + const int res1 = pthread_cond_broadcast(&p->_cond); + const int res2 = pthread_mutex_unlock(&p->_mutex); + return (res2 ? 
res2 : res1); + } +} + +WRes Event_Reset(CEvent *p) +{ + RINOK(pthread_mutex_lock(&p->_mutex)) + p->_state = False; + return pthread_mutex_unlock(&p->_mutex); +} + +WRes Event_Wait(CEvent *p) +{ + RINOK(pthread_mutex_lock(&p->_mutex)) + while (p->_state == False) + { + // ETIMEDOUT + // ret = + pthread_cond_wait(&p->_cond, &p->_mutex); + // if (ret != 0) break; + } + if (p->_manual_reset == False) + { + p->_state = False; + } + return pthread_mutex_unlock(&p->_mutex); +} + +WRes Event_Close(CEvent *p) +{ + if (!p->_created) + return 0; + p->_created = 0; + { + const int res1 = pthread_mutex_destroy(&p->_mutex); + const int res2 = pthread_cond_destroy(&p->_cond); + return (res1 ? res1 : res2); + } +} + + +WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount) +{ + if (initCount > maxCount || maxCount < 1) + return EINVAL; + RINOK(pthread_mutex_init(&p->_mutex, NULL)) + RINOK(pthread_cond_init(&p->_cond, NULL)) + p->_count = initCount; + p->_maxCount = maxCount; + p->_created = 1; + return 0; +} + + +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount) +{ + if (Semaphore_IsCreated(p)) + { + /* + WRes wres = Semaphore_Close(p); + if (wres != 0) + return wres; + */ + if (initCount > maxCount || maxCount < 1) + return EINVAL; + // return EINVAL; // for debug + p->_count = initCount; + p->_maxCount = maxCount; + return 0; + } + return Semaphore_Create(p, initCount, maxCount); +} + + +WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 releaseCount) +{ + UInt32 newCount; + int ret; + + if (releaseCount < 1) + return EINVAL; + + RINOK(pthread_mutex_lock(&p->_mutex)) + + newCount = p->_count + releaseCount; + if (newCount > p->_maxCount) + ret = ERROR_TOO_MANY_POSTS; // EINVAL; + else + { + p->_count = newCount; + ret = pthread_cond_broadcast(&p->_cond); + } + RINOK(pthread_mutex_unlock(&p->_mutex)) + return ret; +} + +WRes Semaphore_Wait(CSemaphore *p) +{ + RINOK(pthread_mutex_lock(&p->_mutex)) + while (p->_count < 1) + { + pthread_cond_wait(&p->_cond, &p->_mutex); + } + p->_count--; + return pthread_mutex_unlock(&p->_mutex); +} + +WRes Semaphore_Close(CSemaphore *p) +{ + if (!p->_created) + return 0; + p->_created = 0; + { + const int res1 = pthread_mutex_destroy(&p->_mutex); + const int res2 = pthread_cond_destroy(&p->_cond); + return (res1 ? 
res1 : res2);
+  }
+}
+
+
+
+WRes CriticalSection_Init(CCriticalSection *p)
+{
+  // Print("CriticalSection_Init")
+  if (!p)
+    return EINTR;
+  return pthread_mutex_init(&p->_mutex, NULL);
+}
+
+void CriticalSection_Enter(CCriticalSection *p)
+{
+  // Print("CriticalSection_Enter")
+  if (p)
+  {
+    // int ret =
+    pthread_mutex_lock(&p->_mutex);
+  }
+}
+
+void CriticalSection_Leave(CCriticalSection *p)
+{
+  // Print("CriticalSection_Leave")
+  if (p)
+  {
+    // int ret =
+    pthread_mutex_unlock(&p->_mutex);
+  }
+}
+
+void CriticalSection_Delete(CCriticalSection *p)
+{
+  // Print("CriticalSection_Delete")
+  if (p)
+  {
+    // int ret =
+    pthread_mutex_destroy(&p->_mutex);
+  }
+}
+
+LONG InterlockedIncrement(LONG volatile *addend)
+{
+  // Print("InterlockedIncrement")
+  #ifdef USE_HACK_UNSAFE_ATOMIC
+  LONG val = *addend + 1;
+  *addend = val;
+  return val;
+  #else
+
+  #if defined(__clang__) && (__clang_major__ >= 8)
+  #pragma GCC diagnostic ignored "-Watomic-implicit-seq-cst"
+  #endif
+  return __sync_add_and_fetch(addend, 1);
+  #endif
+}
+
+LONG InterlockedDecrement(LONG volatile *addend)
+{
+  // Print("InterlockedDecrement")
+  #ifdef USE_HACK_UNSAFE_ATOMIC
+  LONG val = *addend - 1;
+  *addend = val;
+  return val;
+  #else
+  return __sync_sub_and_fetch(addend, 1);
+  #endif
+}
+
+#endif // _WIN32
+
+WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p)
+{
+  if (Event_IsCreated(p))
+    return Event_Reset(p);
+  return AutoResetEvent_CreateNotSignaled(p);
+}
+
+void ThreadNextGroup_Init(CThreadNextGroup *p, UInt32 numGroups, UInt32 startGroup)
+{
+  // printf("\n====== ThreadNextGroup_Init numGroups = %x: startGroup=%x\n", numGroups, startGroup);
+  if (numGroups == 0)
+    numGroups = 1;
+  p->NumGroups = numGroups;
+  p->NextGroup = startGroup % numGroups;
+}
+
+
+UInt32 ThreadNextGroup_GetNext(CThreadNextGroup *p)
+{
+  const UInt32 next = p->NextGroup;
+  p->NextGroup = (next + 1) % p->NumGroups;
+  return next;
+}
+
+#undef PRF
+#undef Print
diff --git a/src/sdk/C/Threads.h b/src/sdk/C/Threads.h
index e53ace4..be12e6e 100644
--- a/src/sdk/C/Threads.h
+++ b/src/sdk/C/Threads.h
@@ -1,38 +1,163 @@
 /* Threads.h -- multithreading library
-2017-06-18 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
-#ifndef __7Z_THREADS_H
-#define __7Z_THREADS_H
+#ifndef ZIP7_INC_THREADS_H
+#define ZIP7_INC_THREADS_H
 #ifdef _WIN32
-#include <windows.h>
+#include "7zWindows.h"
+
+#else
+
+#include "Compiler.h"
+
+// #define Z7_AFFINITY_DISABLE
+#if defined(__linux__)
+#if !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__)
+#ifndef Z7_AFFINITY_DISABLE
+#define Z7_AFFINITY_SUPPORTED
+// #pragma message(" ==== Z7_AFFINITY_SUPPORTED")
+#if !defined(_GNU_SOURCE)
+// #pragma message(" ==== _GNU_SOURCE set")
+// we need _GNU_SOURCE for cpu_set_t, if we compile for MUSL
+Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
+#define _GNU_SOURCE
+Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
+#endif
+#endif
+#endif
+#endif
+
+#include <pthread.h>
+
 #endif
 #include "7zTypes.h"
 EXTERN_C_BEGIN
+#ifdef _WIN32
+
 WRes HandlePtr_Close(HANDLE *h);
 WRes Handle_WaitObject(HANDLE h);
 typedef HANDLE CThread;
-#define Thread_Construct(p) *(p) = NULL
+
+#define Thread_CONSTRUCT(p) { *(p) = NULL; }
 #define Thread_WasCreated(p) (*(p) != NULL)
 #define Thread_Close(p) HandlePtr_Close(p)
-#define Thread_Wait(p) Handle_WaitObject(*(p))
+// #define Thread_Wait(p) Handle_WaitObject(*(p))
-typedef
 #ifdef UNDER_CE
-  DWORD
+  // if (USE_THREADS_CreateThread is defined), we use _beginthreadex()
+  // if (USE_THREADS_CreateThread is not defined), we use CreateThread()
+  #define USE_THREADS_CreateThread
+#endif
+
+typedef
+  #ifdef USE_THREADS_CreateThread
+  DWORD
+  #else
+  unsigned
+  #endif
+  THREAD_FUNC_RET_TYPE;
+
+#define THREAD_FUNC_RET_ZERO 0
+
+typedef DWORD_PTR CAffinityMask;
+typedef DWORD_PTR CCpuSet;
+
+#define CpuSet_Zero(p) *(p) = (0)
+#define CpuSet_Set(p, cpu) *(p) |= ((DWORD_PTR)1 << (cpu))
+
+#else // _WIN32
+
+typedef struct
+{
+  pthread_t _tid;
+  int _created;
+} CThread;
+
+#define Thread_CONSTRUCT(p) { (p)->_tid = 0; (p)->_created = 0; }
+#define Thread_WasCreated(p) ((p)->_created != 0)
+WRes Thread_Close(CThread *p);
+// #define Thread_Wait Thread_Wait_Close
+
+typedef void * THREAD_FUNC_RET_TYPE;
+#define THREAD_FUNC_RET_ZERO NULL
+
+
+typedef UInt64 CAffinityMask;
+
+#ifdef Z7_AFFINITY_SUPPORTED
+
+typedef cpu_set_t CCpuSet;
+#define CpuSet_Zero(p) CPU_ZERO(p)
+#define CpuSet_Set(p, cpu) CPU_SET(cpu, p)
+#define CpuSet_IsSet(p, cpu) CPU_ISSET(cpu, p)
+
+#else
+
+typedef UInt64 CCpuSet;
+#define CpuSet_Zero(p) *(p) = (0)
+#define CpuSet_Set(p, cpu) *(p) |= ((UInt64)1 << (cpu))
+#define CpuSet_IsSet(p, cpu) ((*(p) & ((UInt64)1 << (cpu))) != 0)
+
+#endif
+
+
+#endif // _WIN32
+
+
+#define THREAD_FUNC_CALL_TYPE Z7_STDCALL
+
+#if defined(_WIN32) && defined(__GNUC__)
+/* GCC compiler for x86 32-bit uses the rule:
+   the stack is 16-byte aligned before CALL instruction for function calling.
+   But only root function main() contains instructions that
+   set 16-byte alignment for stack pointer. And other functions
+   just keep alignment, if it was set in some parent function.
+
+   The problem:
+   if we create new thread in MinGW (GCC) 32-bit x86 via _beginthreadex() or CreateThread(),
+   the root function of thread doesn't set 16-byte alignment.
+   And stack frames in all child functions also will be unaligned in that case.
+
+   Here we set (force_align_arg_pointer) attribute for root function of new thread.
+   Do we need (force_align_arg_pointer) also for other systems?
*/ + + #define THREAD_FUNC_ATTRIB_ALIGN_ARG __attribute__((force_align_arg_pointer)) + // #define THREAD_FUNC_ATTRIB_ALIGN_ARG // for debug : bad alignment in SSE functions #else - unsigned + #define THREAD_FUNC_ATTRIB_ALIGN_ARG #endif - THREAD_FUNC_RET_TYPE; -#define THREAD_FUNC_CALL_TYPE MY_STD_CALL -#define THREAD_FUNC_DECL THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE +#define THREAD_FUNC_DECL THREAD_FUNC_ATTRIB_ALIGN_ARG THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE + typedef THREAD_FUNC_RET_TYPE (THREAD_FUNC_CALL_TYPE * THREAD_FUNC_TYPE)(void *); WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param); +WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity); +WRes Thread_Wait_Close(CThread *p); + +#ifdef _WIN32 +WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask); +#define Thread_Create_With_CpuSet(p, func, param, cs) \ + Thread_Create_With_Affinity(p, func, param, *cs) +#else +WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet); +#endif + +typedef struct +{ + unsigned NumGroups; + unsigned NextGroup; +} CThreadNextGroup; + +void ThreadNextGroup_Init(CThreadNextGroup *p, unsigned numGroups, unsigned startGroup); +unsigned ThreadNextGroup_GetNext(CThreadNextGroup *p); + + +#ifdef _WIN32 typedef HANDLE CEvent; typedef CEvent CAutoResetEvent; @@ -54,6 +179,7 @@ typedef HANDLE CSemaphore; #define Semaphore_Close(p) HandlePtr_Close(p) #define Semaphore_Wait(p) Handle_WaitObject(*(p)) WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount); +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount); WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 num); WRes Semaphore_Release1(CSemaphore *p); @@ -63,6 +189,72 @@ WRes CriticalSection_Init(CCriticalSection *p); #define CriticalSection_Enter(p) EnterCriticalSection(p) #define CriticalSection_Leave(p) LeaveCriticalSection(p) + +#else // _WIN32 + +typedef struct +{ + int _created; + int _manual_reset; + int _state; + pthread_mutex_t _mutex; + pthread_cond_t _cond; +} CEvent; + +typedef CEvent CAutoResetEvent; +typedef CEvent CManualResetEvent; + +#define Event_Construct(p) (p)->_created = 0 +#define Event_IsCreated(p) ((p)->_created) + +WRes ManualResetEvent_Create(CManualResetEvent *p, int signaled); +WRes ManualResetEvent_CreateNotSignaled(CManualResetEvent *p); +WRes AutoResetEvent_Create(CAutoResetEvent *p, int signaled); +WRes AutoResetEvent_CreateNotSignaled(CAutoResetEvent *p); + +WRes Event_Set(CEvent *p); +WRes Event_Reset(CEvent *p); +WRes Event_Wait(CEvent *p); +WRes Event_Close(CEvent *p); + + +typedef struct +{ + int _created; + UInt32 _count; + UInt32 _maxCount; + pthread_mutex_t _mutex; + pthread_cond_t _cond; +} CSemaphore; + +#define Semaphore_Construct(p) (p)->_created = 0 +#define Semaphore_IsCreated(p) ((p)->_created) + +WRes Semaphore_Create(CSemaphore *p, UInt32 initCount, UInt32 maxCount); +WRes Semaphore_OptCreateInit(CSemaphore *p, UInt32 initCount, UInt32 maxCount); +WRes Semaphore_ReleaseN(CSemaphore *p, UInt32 num); +#define Semaphore_Release1(p) Semaphore_ReleaseN(p, 1) +WRes Semaphore_Wait(CSemaphore *p); +WRes Semaphore_Close(CSemaphore *p); + + +typedef struct +{ + pthread_mutex_t _mutex; +} CCriticalSection; + +WRes CriticalSection_Init(CCriticalSection *p); +void CriticalSection_Delete(CCriticalSection *cs); +void CriticalSection_Enter(CCriticalSection *cs); +void CriticalSection_Leave(CCriticalSection 
*cs); + +LONG InterlockedIncrement(LONG volatile *addend); +LONG InterlockedDecrement(LONG volatile *addend); + +#endif // _WIN32 + +WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p); + EXTERN_C_END #endif diff --git a/src/sdk/C/Util/7z/7z.dsp b/src/sdk/C/Util/7z/7z.dsp index be0f0a7..474c660 100644 --- a/src/sdk/C/Util/7z/7z.dsp +++ b/src/sdk/C/Util/7z/7z.dsp @@ -42,7 +42,7 @@ RSC=rc.exe # PROP Ignore_Export_Lib 0 # PROP Target_Dir "" # ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c -# ADD CPP /nologo /MD /W4 /WX /GX /O2 /D "NDEBUG" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /FAcs /Yu"Precomp.h" /FD /c +# ADD CPP /nologo /MD /W4 /WX /GX /O2 /D "NDEBUG" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /D "Z7_EXTRACT_ONLY" /FAcs /Yu"Precomp.h" /FD /c # ADD BASE RSC /l 0x419 /d "NDEBUG" # ADD RSC /l 0x419 /d "NDEBUG" BSC32=bscmake.exe @@ -67,7 +67,7 @@ LINK32=link.exe # PROP Ignore_Export_Lib 0 # PROP Target_Dir "" # ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c -# ADD CPP /nologo /W4 /WX /Gm /GX /ZI /Od /D "_DEBUG" /D "_SZ_ALLOC_DEBUG2" /D "_SZ_NO_INT_64_A" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /Yu"Precomp.h" /FD /GZ /c +# ADD CPP /nologo /W4 /WX /Gm /GX /ZI /Od /D "_DEBUG" /D "_SZ_ALLOC_DEBUG2" /D "_SZ_NO_INT_64_A" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /D "Z7_EXTRACT_ONLY" /Yu"Precomp.h" /FD /GZ /c # ADD BASE RSC /l 0x419 /d "_DEBUG" # ADD RSC /l 0x419 /d "_DEBUG" BSC32=bscmake.exe @@ -145,6 +145,10 @@ SOURCE=..\..\7zTypes.h # End Source File # Begin Source File +SOURCE=..\..\7zWindows.h +# End Source File +# Begin Source File + SOURCE=..\..\Bcj2.c # End Source File # Begin Source File @@ -230,6 +234,10 @@ SOURCE=.\Precomp.c # End Source File # Begin Source File +SOURCE=..\..\Precomp.h +# End Source File +# Begin Source File + SOURCE=.\Precomp.h # End Source File # End Group diff --git a/src/sdk/C/Util/7z/7zMain.c b/src/sdk/C/Util/7z/7zMain.c index 6ccc830..6baf979 100644 --- a/src/sdk/C/Util/7z/7zMain.c +++ b/src/sdk/C/Util/7z/7zMain.c @@ -1,34 +1,41 @@ /* 7zMain.c - Test application for 7z Decoder -2019-02-02 : Igor Pavlov : Public domain */ +2024-02-28 : Igor Pavlov : Public domain */ #include "Precomp.h" #include #include -#include "../../CpuArch.h" - -#include "../../7z.h" -#include "../../7zAlloc.h" -#include "../../7zBuf.h" -#include "../../7zCrc.h" -#include "../../7zFile.h" -#include "../../7zVersion.h" - #ifndef USE_WINDOWS_FILE /* for mkdir */ #ifdef _WIN32 #include #else +#include +#include +#ifdef __GNUC__ +#include +#endif +#include +// #include #include #include #endif #endif +#include "../../7zFile.h" +#include "../../7z.h" +#include "../../7zAlloc.h" +#include "../../7zBuf.h" +#include "../../7zCrc.h" +#include "../../7zVersion.h" + +#include "../../CpuArch.h" #define kInputBufSize ((size_t)1 << 18) static const ISzAlloc g_Alloc = { SzAlloc, SzFree }; +// static const ISzAlloc g_Alloc_temp = { SzAllocTemp, SzFreeTemp }; static void Print(const char *s) @@ -46,19 +53,19 @@ static int Buf_EnsureSize(CBuf *dest, size_t size) } #ifndef _WIN32 -#define _USE_UTF8 +#define MY_USE_UTF8 #endif -/* #define _USE_UTF8 */ +/* #define MY_USE_UTF8 */ -#ifdef _USE_UTF8 +#ifdef MY_USE_UTF8 -#define _UTF8_START(n) (0x100 - (1 << (7 - (n)))) +#define MY_UTF8_START(n) (0x100 - (1 << (7 - (n)))) -#define _UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 6)) +#define MY_UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 
6)) -#define _UTF8_HEAD(n, val) ((Byte)(_UTF8_START(n) + (val >> (6 * (n))))) -#define _UTF8_CHAR(n, val) ((Byte)(0x80 + (((val) >> (6 * (n))) & 0x3F))) +#define MY_UTF8_HEAD(n, val) ((Byte)(MY_UTF8_START(n) + (val >> (6 * (n))))) +#define MY_UTF8_CHAR(n, val) ((Byte)(0x80 + (((val) >> (6 * (n))) & 0x3F))) static size_t Utf16_To_Utf8_Calc(const UInt16 *src, const UInt16 *srcLim) { @@ -75,7 +82,7 @@ static size_t Utf16_To_Utf8_Calc(const UInt16 *src, const UInt16 *srcLim) if (val < 0x80) continue; - if (val < _UTF8_RANGE(1)) + if (val < MY_UTF8_RANGE(1)) { size++; continue; @@ -83,7 +90,7 @@ static size_t Utf16_To_Utf8_Calc(const UInt16 *src, const UInt16 *srcLim) if (val >= 0xD800 && val < 0xDC00 && src != srcLim) { - UInt32 c2 = *src; + const UInt32 c2 = *src; if (c2 >= 0xDC00 && c2 < 0xE000) { src++; @@ -108,37 +115,37 @@ static Byte *Utf16_To_Utf8(Byte *dest, const UInt16 *src, const UInt16 *srcLim) if (val < 0x80) { - *dest++ = (char)val; + *dest++ = (Byte)val; continue; } - if (val < _UTF8_RANGE(1)) + if (val < MY_UTF8_RANGE(1)) { - dest[0] = _UTF8_HEAD(1, val); - dest[1] = _UTF8_CHAR(0, val); + dest[0] = MY_UTF8_HEAD(1, val); + dest[1] = MY_UTF8_CHAR(0, val); dest += 2; continue; } if (val >= 0xD800 && val < 0xDC00 && src != srcLim) { - UInt32 c2 = *src; + const UInt32 c2 = *src; if (c2 >= 0xDC00 && c2 < 0xE000) { src++; val = (((val - 0xD800) << 10) | (c2 - 0xDC00)) + 0x10000; - dest[0] = _UTF8_HEAD(3, val); - dest[1] = _UTF8_CHAR(2, val); - dest[2] = _UTF8_CHAR(1, val); - dest[3] = _UTF8_CHAR(0, val); + dest[0] = MY_UTF8_HEAD(3, val); + dest[1] = MY_UTF8_CHAR(2, val); + dest[2] = MY_UTF8_CHAR(1, val); + dest[3] = MY_UTF8_CHAR(0, val); dest += 4; continue; } } - dest[0] = _UTF8_HEAD(2, val); - dest[1] = _UTF8_CHAR(1, val); - dest[2] = _UTF8_CHAR(0, val); + dest[0] = MY_UTF8_HEAD(2, val); + dest[1] = MY_UTF8_CHAR(1, val); + dest[2] = MY_UTF8_CHAR(0, val); dest += 3; } } @@ -156,27 +163,27 @@ static SRes Utf16_To_Utf8Buf(CBuf *dest, const UInt16 *src, size_t srcLen) #endif static SRes Utf16_To_Char(CBuf *buf, const UInt16 *s - #ifndef _USE_UTF8 + #ifndef MY_USE_UTF8 , UINT codePage #endif ) { - unsigned len = 0; - for (len = 0; s[len] != 0; len++); + size_t len = 0; + for (len = 0; s[len] != 0; len++) {} - #ifndef _USE_UTF8 + #ifndef MY_USE_UTF8 { - unsigned size = len * 3 + 100; + const size_t size = len * 3 + 100; if (!Buf_EnsureSize(buf, size)) return SZ_ERROR_MEM; { buf->data[0] = 0; if (len != 0) { - char defaultChar = '_'; + const char defaultChar = '_'; BOOL defUsed; - unsigned numChars = 0; - numChars = WideCharToMultiByte(codePage, 0, (LPCWSTR)s, len, (char *)buf->data, size, &defaultChar, &defUsed); + const unsigned numChars = (unsigned)WideCharToMultiByte( + codePage, 0, (LPCWSTR)s, (int)len, (char *)buf->data, (int)size, &defaultChar, &defUsed); if (numChars == 0 || numChars >= size) return SZ_ERROR_FAIL; buf->data[numChars] = 0; @@ -192,8 +199,8 @@ static SRes Utf16_To_Char(CBuf *buf, const UInt16 *s #ifdef _WIN32 #ifndef USE_WINDOWS_FILE static UINT g_FileCodePage = CP_ACP; + #define MY_FILE_CODE_PAGE_PARAM ,g_FileCodePage #endif - #define MY_FILE_CODE_PAGE_PARAM ,g_FileCodePage #else #define MY_FILE_CODE_PAGE_PARAM #endif @@ -209,7 +216,7 @@ static WRes MyCreateDir(const UInt16 *name) CBuf buf; WRes res; Buf_Init(&buf); - RINOK(Utf16_To_Char(&buf, name MY_FILE_CODE_PAGE_PARAM)); + RINOK(Utf16_To_Char(&buf, name MY_FILE_CODE_PAGE_PARAM)) res = #ifdef _WIN32 @@ -232,7 +239,7 @@ static WRes OutFile_OpenUtf16(CSzFile *p, const UInt16 *name) CBuf buf; WRes res; 
Buf_Init(&buf); - RINOK(Utf16_To_Char(&buf, name MY_FILE_CODE_PAGE_PARAM)); + RINOK(Utf16_To_Char(&buf, name MY_FILE_CODE_PAGE_PARAM)) res = OutFile_Open(p, (const char *)buf.data); Buf_Free(&buf, &g_Alloc); return res; @@ -246,7 +253,7 @@ static SRes PrintString(const UInt16 *s) SRes res; Buf_Init(&buf); res = Utf16_To_Char(&buf, s - #ifndef _USE_UTF8 + #ifndef MY_USE_UTF8 , CP_OEMCP #endif ); @@ -300,17 +307,143 @@ static void UIntToStr_2(char *s, unsigned value) s[1] = (char)('0' + (value % 10)); } + #define PERIOD_4 (4 * 365 + 1) #define PERIOD_100 (PERIOD_4 * 25 - 1) #define PERIOD_400 (PERIOD_100 * 4 + 1) -static void ConvertFileTimeToString(const CNtfsFileTime *nt, char *s) + + +#ifndef _WIN32 + +// MS uses long for BOOL, but long is 32-bit in MS. So we use int. +// typedef long BOOL; +typedef int BOOL; + +typedef struct +{ + DWORD dwLowDateTime; + DWORD dwHighDateTime; +} FILETIME; + +static LONG TIME_GetBias(void) +{ + const time_t utc = time(NULL); + struct tm *ptm = localtime(&utc); + const int localdaylight = ptm->tm_isdst; /* daylight for local timezone */ + ptm = gmtime(&utc); + ptm->tm_isdst = localdaylight; /* use local daylight, not that of Greenwich */ + return (int)(mktime(ptm) - utc); +} + +#define TICKS_PER_SEC 10000000 + +#define GET_TIME_64(pft) ((pft)->dwLowDateTime | ((UInt64)(pft)->dwHighDateTime << 32)) + +#define SET_FILETIME(ft, v64) \ + (ft)->dwLowDateTime = (DWORD)v64; \ + (ft)->dwHighDateTime = (DWORD)(v64 >> 32); + +#define WINAPI +#define TRUE 1 + +static BOOL WINAPI FileTimeToLocalFileTime(const FILETIME *fileTime, FILETIME *localFileTime) +{ + UInt64 v = GET_TIME_64(fileTime); + v = (UInt64)((Int64)v - (Int64)TIME_GetBias() * TICKS_PER_SEC); + SET_FILETIME(localFileTime, v) + return TRUE; +} + +static const UInt32 kNumTimeQuantumsInSecond = 10000000; +static const UInt32 kFileTimeStartYear = 1601; +static const UInt32 kUnixTimeStartYear = 1970; + +static Int64 Time_FileTimeToUnixTime64(const FILETIME *ft) +{ + const UInt64 kUnixTimeOffset = + (UInt64)60 * 60 * 24 * (89 + 365 * (kUnixTimeStartYear - kFileTimeStartYear)); + const UInt64 winTime = GET_TIME_64(ft); + return (Int64)(winTime / kNumTimeQuantumsInSecond) - (Int64)kUnixTimeOffset; +} + +#if defined(_AIX) + #define MY_ST_TIMESPEC st_timespec +#else + #define MY_ST_TIMESPEC timespec +#endif + +static void FILETIME_To_timespec(const FILETIME *ft, struct MY_ST_TIMESPEC *ts) +{ + if (ft) + { + const Int64 sec = Time_FileTimeToUnixTime64(ft); + // time_t is long + const time_t sec2 = (time_t)sec; + if (sec2 == sec) + { + ts->tv_sec = sec2; + { + const UInt64 winTime = GET_TIME_64(ft); + ts->tv_nsec = (long)((winTime % 10000000) * 100); + } + return; + } + } + // else + { + ts->tv_sec = 0; + // ts.tv_nsec = UTIME_NOW; // set to the current time + ts->tv_nsec = UTIME_OMIT; // keep old timesptamp + } +} + +static WRes Set_File_FILETIME(const UInt16 *name, const FILETIME *mTime) +{ + struct timespec times[2]; + + const int flags = 0; // follow link + // = AT_SYMLINK_NOFOLLOW; // don't follow link + + CBuf buf; + int res; + Buf_Init(&buf); + RINOK(Utf16_To_Char(&buf, name MY_FILE_CODE_PAGE_PARAM)) + FILETIME_To_timespec(NULL, ×[0]); + FILETIME_To_timespec(mTime, ×[1]); + res = utimensat(AT_FDCWD, (const char *)buf.data, times, flags); + Buf_Free(&buf, &g_Alloc); + if (res == 0) + return 0; + return errno; +} + +#endif + +static void NtfsFileTime_to_FILETIME(const CNtfsFileTime *t, FILETIME *ft) +{ + ft->dwLowDateTime = (DWORD)(t->Low); + ft->dwHighDateTime = (DWORD)(t->High); +} + +static void 
ConvertFileTimeToString(const CNtfsFileTime *nTime, char *s) { unsigned year, mon, hour, min, sec; Byte ms[] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }; - unsigned t; + UInt32 t; UInt32 v; - UInt64 v64 = nt->Low | ((UInt64)nt->High << 32); + // UInt64 v64 = nt->Low | ((UInt64)nt->High << 32); + UInt64 v64; + { + FILETIME fileTime, locTime; + NtfsFileTime_to_FILETIME(nTime, &fileTime); + if (!FileTimeToLocalFileTime(&fileTime, &locTime)) + { + locTime.dwHighDateTime = + locTime.dwLowDateTime = 0; + } + v64 = locTime.dwLowDateTime | ((UInt64)locTime.dwHighDateTime << 32); + } v64 /= 10000000; sec = (unsigned)(v64 % 60); v64 /= 60; min = (unsigned)(v64 % 60); v64 /= 60; @@ -329,7 +462,7 @@ static void ConvertFileTimeToString(const CNtfsFileTime *nt, char *s) ms[1] = 29; for (mon = 0;; mon++) { - unsigned d = ms[mon]; + const UInt32 d = ms[mon]; if (v < d) break; v -= d; @@ -342,7 +475,7 @@ static void ConvertFileTimeToString(const CNtfsFileTime *nt, char *s) UIntToStr_2(s, sec); s[2] = 0; } -static void PrintLF() +static void PrintLF(void) { Print("\n"); } @@ -354,6 +487,43 @@ static void PrintError(char *s) PrintLF(); } +static void PrintError_WRes(const char *message, WRes wres) +{ + Print("\nERROR: "); + Print(message); + PrintLF(); + { + char s[32]; + UIntToStr(s, (unsigned)wres, 1); + Print("System error code: "); + Print(s); + } + // sprintf(buffer + strlen(buffer), "\nSystem error code: %d", (unsigned)wres); + #ifdef _WIN32 + { + char *s = NULL; + if (FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, wres, 0, (LPSTR) &s, 0, NULL) != 0 && s) + { + Print(" : "); + Print(s); + LocalFree(s); + } + } + #else + { + const char *s = strerror(wres); + if (s) + { + Print(" : "); + Print(s); + } + } + #endif + PrintLF(); +} + static void GetAttribString(UInt32 wa, BoolInt isDir, char *s) { #ifdef USE_WINDOWS_FILE @@ -372,7 +542,7 @@ static void GetAttribString(UInt32 wa, BoolInt isDir, char *s) // #define NUM_PARENTS_MAX 128 -int MY_CDECL main(int numargs, char *args[]) +int Z7_CDECL main(int numargs, char *args[]) { ISzAlloc allocImp; ISzAlloc allocTempImp; @@ -412,18 +582,24 @@ int MY_CDECL main(int numargs, char *args[]) allocImp = g_Alloc; allocTempImp = g_Alloc; + // allocTempImp = g_Alloc_temp; - #ifdef UNDER_CE - if (InFile_OpenW(&archiveStream.file, L"\test.7z")) - #else - if (InFile_Open(&archiveStream.file, args[2])) - #endif { - PrintError("can not open input file"); - return 1; + WRes wres = + #ifdef UNDER_CE + InFile_OpenW(&archiveStream.file, L"\test.7z"); // change it + #else + InFile_Open(&archiveStream.file, args[2]); + #endif + if (wres != 0) + { + PrintError_WRes("cannot open input file", wres); + return 1; + } } FileInStream_CreateVTable(&archiveStream); + archiveStream.wres = 0; LookToRead2_CreateVTable(&lookStream, False); lookStream.buf = NULL; @@ -437,7 +613,7 @@ int MY_CDECL main(int numargs, char *args[]) { lookStream.bufSize = kInputBufSize; lookStream.realStream = &archiveStream.vt; - LookToRead2_Init(&lookStream); + LookToRead2_INIT(&lookStream) } } @@ -483,7 +659,7 @@ int MY_CDECL main(int numargs, char *args[]) size_t outSizeProcessed = 0; // const CSzFileItem *f = db.Files + i; size_t len; - unsigned isDir = SzArEx_IsDir(&db, i); + const BoolInt isDir = SzArEx_IsDir(&db, i); if (listCommand == 0 && isDir && !fullPaths) continue; len = SzArEx_GetFileNameUtf16(&db, i, NULL); @@ -546,8 +722,8 @@ int MY_CDECL main(int numargs, char *args[]) } Print(testCommand ? 
- "Testing ": - "Extracting "); + "T ": + "- "); res = PrintString(temp); if (res != SZ_OK) break; @@ -591,27 +767,37 @@ int MY_CDECL main(int numargs, char *args[]) PrintLF(); continue; } - else if (OutFile_OpenUtf16(&outFile, destPath)) + else { - PrintError("can not open output file"); - res = SZ_ERROR_FAIL; - break; + const WRes wres = OutFile_OpenUtf16(&outFile, destPath); + if (wres != 0) + { + PrintError_WRes("cannot open output file", wres); + res = SZ_ERROR_FAIL; + break; + } } processedSize = outSizeProcessed; - if (File_Write(&outFile, outBuffer + offset, &processedSize) != 0 || processedSize != outSizeProcessed) { - PrintError("can not write output file"); - res = SZ_ERROR_FAIL; - break; + const WRes wres = File_Write(&outFile, outBuffer + offset, &processedSize); + if (wres != 0 || processedSize != outSizeProcessed) + { + PrintError_WRes("cannot write output file", wres); + res = SZ_ERROR_FAIL; + break; + } } - #ifdef USE_WINDOWS_FILE { - FILETIME mtime, ctime; + FILETIME mtime; FILETIME *mtimePtr = NULL; + + #ifdef USE_WINDOWS_FILE + FILETIME ctime; FILETIME *ctimePtr = NULL; + #endif if (SzBitWithVals_Check(&db.MTime, i)) { @@ -620,6 +806,8 @@ int MY_CDECL main(int numargs, char *args[]) mtime.dwHighDateTime = (DWORD)(t->High); mtimePtr = &mtime; } + + #ifdef USE_WINDOWS_FILE if (SzBitWithVals_Check(&db.CTime, i)) { const CNtfsFileTime *t = &db.CTime.Vals[i]; @@ -627,16 +815,29 @@ int MY_CDECL main(int numargs, char *args[]) ctime.dwHighDateTime = (DWORD)(t->High); ctimePtr = &ctime; } + if (mtimePtr || ctimePtr) SetFileTime(outFile.handle, ctimePtr, NULL, mtimePtr); - } - #endif + #endif - if (File_Close(&outFile)) - { - PrintError("can not close output file"); - res = SZ_ERROR_FAIL; - break; + { + const WRes wres = File_Close(&outFile); + if (wres != 0) + { + PrintError_WRes("cannot close output file", wres); + res = SZ_ERROR_FAIL; + break; + } + } + + #ifndef USE_WINDOWS_FILE + #ifdef _WIN32 + mtimePtr = mtimePtr; + #else + if (mtimePtr) + Set_File_FILETIME(destPath, mtimePtr); + #endif + #endif } #ifdef USE_WINDOWS_FILE @@ -672,13 +873,15 @@ int MY_CDECL main(int numargs, char *args[]) if (res == SZ_ERROR_UNSUPPORTED) PrintError("decoder doesn't support this archive"); else if (res == SZ_ERROR_MEM) - PrintError("can not allocate memory"); + PrintError("cannot allocate memory"); else if (res == SZ_ERROR_CRC) PrintError("CRC error"); + else if (res == SZ_ERROR_READ /* || archiveStream.Res != 0 */) + PrintError_WRes("Read Error", archiveStream.wres); else { char s[32]; - UInt64ToStr(res, s, 0); + UInt64ToStr((unsigned)res, s, 0); PrintError(s); } diff --git a/src/sdk/C/Util/7z/Precomp.h b/src/sdk/C/Util/7z/Precomp.h index 588a66f..13a41ef 100644 --- a/src/sdk/C/Util/7z/Precomp.h +++ b/src/sdk/C/Util/7z/Precomp.h @@ -1,10 +1,13 @@ -/* Precomp.h -- StdAfx -2013-06-16 : Igor Pavlov : Public domain */ +/* Precomp.h -- Precomp +2024-01-23 : Igor Pavlov : Public domain */ -#ifndef __7Z_PRECOMP_H -#define __7Z_PRECOMP_H - -#include "../../Compiler.h" -#include "../../7zTypes.h" +// #ifndef ZIP7_INC_PRECOMP_LOC_H +// #define ZIP7_INC_PRECOMP_LOC_H +#if defined(_MSC_VER) && _MSC_VER >= 1800 +#pragma warning(disable : 4464) // relative include path contains '..' 
#endif + +#include "../../Precomp.h" + +// #endif diff --git a/src/sdk/C/Util/7z/makefile b/src/sdk/C/Util/7z/makefile index 9a49fd5..987f065 100644 --- a/src/sdk/C/Util/7z/makefile +++ b/src/sdk/C/Util/7z/makefile @@ -1,12 +1,10 @@ -CFLAGS = $(CFLAGS) -D_7ZIP_PPMD_SUPPPORT +CFLAGS = $(CFLAGS) -DZ7_PPMD_SUPPORT -DZ7_EXTRACT_ONLY PROG = 7zDec.exe C_OBJS = \ $O\7zAlloc.obj \ $O\7zBuf.obj \ - $O\7zCrc.obj \ - $O\7zCrcOpt.obj \ $O\7zFile.obj \ $O\7zDec.obj \ $O\7zArcIn.obj \ @@ -25,10 +23,14 @@ C_OBJS = \ 7Z_OBJS = \ $O\7zMain.obj \ +!include "../../../CPP/7zip/Crc.mak" +!include "../../../CPP/7zip/LzmaDec.mak" + OBJS = \ $O\Precomp.obj \ $(7Z_OBJS) \ $(C_OBJS) \ + $(ASM_OBJS) \ !include "../../../CPP/Build.mak" @@ -38,3 +40,5 @@ $(C_OBJS): ../../$(*B).c $(CCOMPL_USE) $O\Precomp.obj: Precomp.c $(CCOMPL_PCH) + +!include "../../Asm_c.mak" diff --git a/src/sdk/C/Util/7z/makefile.gcc b/src/sdk/C/Util/7z/makefile.gcc index 51053ba..f48d362 100644 --- a/src/sdk/C/Util/7z/makefile.gcc +++ b/src/sdk/C/Util/7z/makefile.gcc @@ -1,75 +1,32 @@ -PROG = 7zDec -CXX = gcc -LIB = -RM = rm -f -CFLAGS = -c -O2 -Wall - -OBJS = 7zMain.o 7zAlloc.o 7zArcIn.o 7zBuf.o 7zBuf2.o 7zCrc.o 7zCrcOpt.o 7zDec.o CpuArch.o Delta.o LzmaDec.o Lzma2Dec.o Bra.o Bra86.o BraIA64.o Bcj2.o Ppmd7.o Ppmd7Dec.o 7zFile.o 7zStream.o - -all: $(PROG) - -$(PROG): $(OBJS) - $(CXX) -o $(PROG) $(LDFLAGS) $(OBJS) $(LIB) - -7zMain.o: 7zMain.c - $(CXX) $(CFLAGS) 7zMain.c - -7zAlloc.o: ../../7zAlloc.c - $(CXX) $(CFLAGS) ../../7zAlloc.c - -7zArcIn.o: ../../7zArcIn.c - $(CXX) $(CFLAGS) ../../7zArcIn.c - -7zBuf.o: ../../7zBuf.c - $(CXX) $(CFLAGS) ../../7zBuf.c - -7zBuf2.o: ../../7zBuf2.c - $(CXX) $(CFLAGS) ../../7zBuf2.c - -7zCrc.o: ../../7zCrc.c - $(CXX) $(CFLAGS) ../../7zCrc.c - -7zCrcOpt.o: ../../7zCrc.c - $(CXX) $(CFLAGS) ../../7zCrcOpt.c - -7zDec.o: ../../7zDec.c - $(CXX) $(CFLAGS) -D_7ZIP_PPMD_SUPPPORT ../../7zDec.c - -CpuArch.o: ../../CpuArch.c - $(CXX) $(CFLAGS) ../../CpuArch.c - -Delta.o: ../../Delta.c - $(CXX) $(CFLAGS) ../../Delta.c - -LzmaDec.o: ../../LzmaDec.c - $(CXX) $(CFLAGS) ../../LzmaDec.c - -Lzma2Dec.o: ../../Lzma2Dec.c - $(CXX) $(CFLAGS) ../../Lzma2Dec.c - -Bra.o: ../../Bra.c - $(CXX) $(CFLAGS) ../../Bra.c - -Bra86.o: ../../Bra86.c - $(CXX) $(CFLAGS) ../../Bra86.c - -BraIA64.o: ../../BraIA64.c - $(CXX) $(CFLAGS) ../../BraIA64.c - -Bcj2.o: ../../Bcj2.c - $(CXX) $(CFLAGS) ../../Bcj2.c - -Ppmd7.o: ../../Ppmd7.c - $(CXX) $(CFLAGS) ../../Ppmd7.c - -Ppmd7Dec.o: ../../Ppmd7Dec.c - $(CXX) $(CFLAGS) ../../Ppmd7Dec.c - -7zFile.o: ../../7zFile.c - $(CXX) $(CFLAGS) ../../7zFile.c - -7zStream.o: ../../7zStream.c - $(CXX) $(CFLAGS) ../../7zStream.c - -clean: - -$(RM) $(PROG) $(OBJS) +PROG = 7zdec + +LOCAL_FLAGS = -DZ7_PPMD_SUPPORT -DZ7_EXTRACT_ONLY + +include ../../../CPP/7zip/LzmaDec_gcc.mak + + +OBJS = \ + $(LZMA_DEC_OPT_OBJS) \ + $O/Bcj2.o \ + $O/Bra.o \ + $O/Bra86.o \ + $O/BraIA64.o \ + $O/CpuArch.o \ + $O/Delta.o \ + $O/Lzma2Dec.o \ + $O/LzmaDec.o \ + $O/Ppmd7.o \ + $O/Ppmd7Dec.o \ + $O/7zCrc.o \ + $O/7zCrcOpt.o \ + $O/7zAlloc.o \ + $O/7zArcIn.o \ + $O/7zBuf.o \ + $O/7zBuf2.o \ + $O/7zDec.o \ + $O/7zMain.o \ + $O/7zFile.o \ + $O/7zStream.o \ + + +include ../../7zip_gcc_c.mak diff --git a/src/sdk/C/Util/Lzma/LzmaUtil.c b/src/sdk/C/Util/Lzma/LzmaUtil.c index 739bc0f..b9b974b 100644 --- a/src/sdk/C/Util/Lzma/LzmaUtil.c +++ b/src/sdk/C/Util/Lzma/LzmaUtil.c @@ -1,7 +1,7 @@ /* LzmaUtil.c -- Test application for LZMA compression -2018-07-04 : Igor Pavlov : Public domain */ +2023-03-07 : Igor Pavlov : Public domain */ -#include "../../Precomp.h" 
+#include "Precomp.h" #include #include @@ -12,40 +12,89 @@ #include "../../Alloc.h" #include "../../7zFile.h" #include "../../7zVersion.h" +#include "../../LzFind.h" #include "../../LzmaDec.h" #include "../../LzmaEnc.h" -static const char * const kCantReadMessage = "Can not read input file"; -static const char * const kCantWriteMessage = "Can not write output file"; -static const char * const kCantAllocateMessage = "Can not allocate memory"; +static const char * const kCantReadMessage = "Cannot read input file"; +static const char * const kCantWriteMessage = "Cannot write output file"; +static const char * const kCantAllocateMessage = "Cannot allocate memory"; static const char * const kDataErrorMessage = "Data error"; -static void PrintHelp(char *buffer) +static void Print(const char *s) { - strcat(buffer, - "\nLZMA-C " MY_VERSION_CPU " : " MY_COPYRIGHT_DATE "\n\n" - "Usage: lzma inputFile outputFile\n" - " e: encode file\n" - " d: decode file\n"); + fputs(s, stdout); } -static int PrintError(char *buffer, const char *message) +static void PrintHelp(void) { - strcat(buffer, "\nError: "); - strcat(buffer, message); - strcat(buffer, "\n"); + Print( + "\n" "LZMA-C " MY_VERSION_CPU " : " MY_COPYRIGHT_DATE + "\n" + "\n" "Usage: lzma inputFile outputFile" + "\n" " e: encode file" + "\n" " d: decode file" + "\n"); +} + +static int PrintError(const char *message) +{ + Print("\nError: "); + Print(message); + Print("\n"); + return 1; +} + +#define CONVERT_INT_TO_STR(charType, tempSize) \ + unsigned char temp[tempSize]; unsigned i = 0; \ + while (val >= 10) { temp[i++] = (unsigned char)('0' + (unsigned)(val % 10)); val /= 10; } \ + *s++ = (charType)('0' + (unsigned)val); \ + while (i != 0) { i--; *s++ = (charType)temp[i]; } \ + *s = 0; \ + return s; + +static char * Convert_unsigned_To_str(unsigned val, char *s) +{ + CONVERT_INT_TO_STR(char, 32) +} + +static void Print_unsigned(unsigned code) +{ + char str[32]; + Convert_unsigned_To_str(code, str); + Print(str); +} + +static int PrintError_WRes(const char *message, WRes wres) +{ + PrintError(message); + Print("\nSystem error code: "); + Print_unsigned((unsigned)wres); + #ifndef _WIN32 + { + const char *s = strerror(wres); + if (s) + { + Print(" : "); + Print(s); + } + } + #endif + Print("\n"); return 1; } -static int PrintErrorNumber(char *buffer, SRes val) +static int PrintErrorNumber(SRes val) { - sprintf(buffer + strlen(buffer), "\nError code: %x\n", (unsigned)val); + Print("\n7-Zip error code: "); + Print_unsigned((unsigned)val); + Print("\n"); return 1; } -static int PrintUserError(char *buffer) +static int PrintUserError(void) { - return PrintError(buffer, "Incorrect command"); + return PrintError("Incorrect command"); } @@ -53,10 +102,10 @@ static int PrintUserError(char *buffer) #define OUT_BUF_SIZE (1 << 16) -static SRes Decode2(CLzmaDec *state, ISeqOutStream *outStream, ISeqInStream *inStream, +static SRes Decode2(CLzmaDec *state, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, UInt64 unpackSize) { - int thereIsSize = (unpackSize != (UInt64)(Int64)-1); + const int thereIsSize = (unpackSize != (UInt64)(Int64)-1); Byte inBuf[IN_BUF_SIZE]; Byte outBuf[OUT_BUF_SIZE]; size_t inPos = 0, inSize = 0, outPos = 0; @@ -66,7 +115,7 @@ static SRes Decode2(CLzmaDec *state, ISeqOutStream *outStream, ISeqInStream *inS if (inPos == inSize) { inSize = IN_BUF_SIZE; - RINOK(inStream->Read(inStream, inBuf, &inSize)); + RINOK(inStream->Read(inStream, inBuf, &inSize)) inPos = 0; } { @@ -107,7 +156,7 @@ static SRes Decode2(CLzmaDec *state, ISeqOutStream 
*outStream, ISeqInStream *inS } -static SRes Decode(ISeqOutStream *outStream, ISeqInStream *inStream) +static SRes Decode(ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream) { UInt64 unpackSize; int i; @@ -120,27 +169,29 @@ static SRes Decode(ISeqOutStream *outStream, ISeqInStream *inStream) /* Read and parse header */ - RINOK(SeqInStream_Read(inStream, header, sizeof(header))); - + { + size_t size = sizeof(header); + RINOK(SeqInStream_ReadMax(inStream, header, &size)) + if (size != sizeof(header)) + return SZ_ERROR_INPUT_EOF; + } unpackSize = 0; for (i = 0; i < 8; i++) unpackSize += (UInt64)header[LZMA_PROPS_SIZE + i] << (i * 8); - LzmaDec_Construct(&state); - RINOK(LzmaDec_Allocate(&state, header, LZMA_PROPS_SIZE, &g_Alloc)); + LzmaDec_CONSTRUCT(&state) + RINOK(LzmaDec_Allocate(&state, header, LZMA_PROPS_SIZE, &g_Alloc)) res = Decode2(&state, outStream, inStream, unpackSize); LzmaDec_Free(&state, &g_Alloc); return res; } -static SRes Encode(ISeqOutStream *outStream, ISeqInStream *inStream, UInt64 fileSize, char *rs) +static SRes Encode(ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, UInt64 fileSize) { CLzmaEncHandle enc; SRes res; CLzmaEncProps props; - UNUSED_VAR(rs); - enc = LzmaEnc_Create(&g_Alloc); if (enc == 0) return SZ_ERROR_MEM; @@ -170,7 +221,7 @@ static SRes Encode(ISeqOutStream *outStream, ISeqInStream *inStream, UInt64 file } -static int main2(int numArgs, const char *args[], char *rs) +int Z7_CDECL main(int numArgs, const char *args[]) { CFileSeqInStream inStream; CFileOutStream outStream; @@ -179,50 +230,63 @@ static int main2(int numArgs, const char *args[], char *rs) int encodeMode; BoolInt useOutFile = False; + LzFindPrepare(); + FileSeqInStream_CreateVTable(&inStream); File_Construct(&inStream.file); + inStream.wres = 0; FileOutStream_CreateVTable(&outStream); File_Construct(&outStream.file); + outStream.wres = 0; if (numArgs == 1) { - PrintHelp(rs); + PrintHelp(); return 0; } if (numArgs < 3 || numArgs > 4 || strlen(args[1]) != 1) - return PrintUserError(rs); + return PrintUserError(); c = args[1][0]; encodeMode = (c == 'e' || c == 'E'); if (!encodeMode && c != 'd' && c != 'D') - return PrintUserError(rs); + return PrintUserError(); + /* { size_t t4 = sizeof(UInt32); size_t t8 = sizeof(UInt64); if (t4 != 4 || t8 != 8) - return PrintError(rs, "Incorrect UInt32 or UInt64"); + return PrintError("Incorrect UInt32 or UInt64"); } + */ - if (InFile_Open(&inStream.file, args[2]) != 0) - return PrintError(rs, "Can not open input file"); + { + const WRes wres = InFile_Open(&inStream.file, args[2]); + if (wres != 0) + return PrintError_WRes("Cannot open input file", wres); + } if (numArgs > 3) { + WRes wres; useOutFile = True; - if (OutFile_Open(&outStream.file, args[3]) != 0) - return PrintError(rs, "Can not open output file"); + wres = OutFile_Open(&outStream.file, args[3]); + if (wres != 0) + return PrintError_WRes("Cannot open output file", wres); } else if (encodeMode) - PrintUserError(rs); + PrintUserError(); if (encodeMode) { UInt64 fileSize; - File_GetLength(&inStream.file, &fileSize); - res = Encode(&outStream.vt, &inStream.vt, fileSize, rs); + const WRes wres = File_GetLength(&inStream.file, &fileSize); + if (wres != 0) + return PrintError_WRes("Cannot get file length", wres); + res = Encode(&outStream.vt, &inStream.vt, fileSize); } else { @@ -236,23 +300,14 @@ static int main2(int numArgs, const char *args[], char *rs) if (res != SZ_OK) { if (res == SZ_ERROR_MEM) - return PrintError(rs, kCantAllocateMessage); + return PrintError(kCantAllocateMessage); else if 
(res == SZ_ERROR_DATA) - return PrintError(rs, kDataErrorMessage); + return PrintError(kDataErrorMessage); else if (res == SZ_ERROR_WRITE) - return PrintError(rs, kCantWriteMessage); + return PrintError_WRes(kCantWriteMessage, outStream.wres); else if (res == SZ_ERROR_READ) - return PrintError(rs, kCantReadMessage); - return PrintErrorNumber(rs, res); + return PrintError_WRes(kCantReadMessage, inStream.wres); + return PrintErrorNumber(res); } return 0; } - - -int MY_CDECL main(int numArgs, const char *args[]) -{ - char rs[800] = { 0 }; - int res = main2(numArgs, args, rs); - fputs(rs, stdout); - return res; -} diff --git a/src/sdk/C/Util/Lzma/LzmaUtil.dsp b/src/sdk/C/Util/Lzma/LzmaUtil.dsp index f060a26..71de950 100644 --- a/src/sdk/C/Util/Lzma/LzmaUtil.dsp +++ b/src/sdk/C/Util/Lzma/LzmaUtil.dsp @@ -106,6 +106,10 @@ SOURCE=..\..\7zVersion.h # End Source File # Begin Source File +SOURCE=..\..\7zWindows.h +# End Source File +# Begin Source File + SOURCE=..\..\Alloc.c # End Source File # Begin Source File @@ -114,6 +118,14 @@ SOURCE=..\..\Alloc.h # End Source File # Begin Source File +SOURCE=..\..\Compiler.h +# End Source File +# Begin Source File + +SOURCE=..\..\CpuArch.c +# End Source File +# Begin Source File + SOURCE=..\..\CpuArch.h # End Source File # Begin Source File @@ -134,6 +146,10 @@ SOURCE=..\..\LzFindMt.h # End Source File # Begin Source File +SOURCE=..\..\LzFindOpt.c +# End Source File +# Begin Source File + SOURCE=..\..\LzHash.h # End Source File # Begin Source File @@ -158,6 +174,14 @@ SOURCE=.\LzmaUtil.c # End Source File # Begin Source File +SOURCE=..\..\Precomp.h +# End Source File +# Begin Source File + +SOURCE=.\Precomp.h +# End Source File +# Begin Source File + SOURCE=..\..\Threads.c # End Source File # Begin Source File diff --git a/src/sdk/C/Util/Lzma/Precomp.h b/src/sdk/C/Util/Lzma/Precomp.h new file mode 100644 index 0000000..13a41ef --- /dev/null +++ b/src/sdk/C/Util/Lzma/Precomp.h @@ -0,0 +1,13 @@ +/* Precomp.h -- Precomp +2024-01-23 : Igor Pavlov : Public domain */ + +// #ifndef ZIP7_INC_PRECOMP_LOC_H +// #define ZIP7_INC_PRECOMP_LOC_H + +#if defined(_MSC_VER) && _MSC_VER >= 1800 +#pragma warning(disable : 4464) // relative include path contains '..' 
+#endif + +#include "../../Precomp.h" + +// #endif diff --git a/src/sdk/C/Util/Lzma/makefile b/src/sdk/C/Util/Lzma/makefile index 4795322..7813bdb 100644 --- a/src/sdk/C/Util/Lzma/makefile +++ b/src/sdk/C/Util/Lzma/makefile @@ -8,8 +8,10 @@ LIB_OBJS = \ C_OBJS = \ $O\Alloc.obj \ + $O\CpuArch.obj \ $O\LzFind.obj \ $O\LzFindMt.obj \ + $O\LzFindOpt.obj \ $O\LzmaDec.obj \ $O\LzmaEnc.obj \ $O\7zFile.obj \ diff --git a/src/sdk/C/Util/Lzma/makefile.gcc b/src/sdk/C/Util/Lzma/makefile.gcc index 67aa8b1..2acb0b8 100644 --- a/src/sdk/C/Util/Lzma/makefile.gcc +++ b/src/sdk/C/Util/Lzma/makefile.gcc @@ -1,44 +1,21 @@ -PROG = lzma -CXX = g++ -LIB = -RM = rm -f -CFLAGS = -c -O2 -Wall -D_7ZIP_ST +PROG = 7lzma -OBJS = \ - LzmaUtil.o \ - Alloc.o \ - LzFind.o \ - LzmaDec.o \ - LzmaEnc.o \ - 7zFile.o \ - 7zStream.o \ - - -all: $(PROG) - -$(PROG): $(OBJS) - $(CXX) -o $(PROG) $(LDFLAGS) $(OBJS) $(LIB) $(LIB2) - -LzmaUtil.o: LzmaUtil.c - $(CXX) $(CFLAGS) LzmaUtil.c - -Alloc.o: ../../Alloc.c - $(CXX) $(CFLAGS) ../../Alloc.c +include ../../../CPP/7zip/LzmaDec_gcc.mak -LzFind.o: ../../LzFind.c - $(CXX) $(CFLAGS) ../../LzFind.c -LzmaDec.o: ../../LzmaDec.c - $(CXX) $(CFLAGS) ../../LzmaDec.c - -LzmaEnc.o: ../../LzmaEnc.c - $(CXX) $(CFLAGS) ../../LzmaEnc.c - -7zFile.o: ../../7zFile.c - $(CXX) $(CFLAGS) ../../7zFile.c - -7zStream.o: ../../7zStream.c - $(CXX) $(CFLAGS) ../../7zStream.c - -clean: - -$(RM) $(PROG) $(OBJS) +OBJS = \ + $(LZMA_DEC_OPT_OBJS) \ + $O/7zFile.o \ + $O/7zStream.o \ + $O/Alloc.o \ + $O/CpuArch.o \ + $O/LzFind.o \ + $O/LzFindMt.o \ + $O/LzFindOpt.o \ + $O/LzmaDec.o \ + $O/LzmaEnc.o \ + $O/LzmaUtil.o \ + $O/Threads.o \ + + +include ../../7zip_gcc_c.mak diff --git a/src/sdk/C/Util/LzmaLib/LzmaLib.dsp b/src/sdk/C/Util/LzmaLib/LzmaLib.dsp index 3421de8..f413137 100644 --- a/src/sdk/C/Util/LzmaLib/LzmaLib.dsp +++ b/src/sdk/C/Util/LzmaLib/LzmaLib.dsp @@ -43,7 +43,7 @@ RSC=rc.exe # PROP Ignore_Export_Lib 0 # PROP Target_Dir "" # ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /c -# ADD CPP /nologo /Gr /MT /W3 /O2 /D "NDEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /FD /c +# ADD CPP /nologo /Gr /MT /W4 /WX /O2 /D "NDEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /FD /c # SUBTRACT CPP /YX # ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32 # ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32 @@ -71,7 +71,7 @@ LINK32=link.exe # PROP Ignore_Export_Lib 0 # PROP Target_Dir "" # ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /GZ /c -# ADD CPP /nologo /MTd /W3 /Gm /ZI /Od /D "_DEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /D "COMPRESS_MF_MT" /FD /GZ /c +# ADD CPP /nologo /MTd /W4 /WX /Gm /ZI /Od /D "_DEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /D "COMPRESS_MF_MT" /FD /GZ /c # SUBTRACT CPP /YX # ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32 # ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32 @@ -101,6 +101,10 @@ SOURCE=.\LzmaLib.def SOURCE=.\LzmaLibExports.c # End Source File +# Begin Source File + +SOURCE=.\Precomp.h +# End Source File # End Group # Begin Source File @@ -108,6 +112,10 @@ SOURCE=..\..\7zTypes.h # End Source File # Begin Source File +SOURCE=..\..\7zWindows.h +# End Source File +# Begin Source File + SOURCE=..\..\Alloc.c # End Source File # Begin Source File @@ -116,6 +124,18 @@ SOURCE=..\..\Alloc.h # End Source 
File # Begin Source File +SOURCE=..\..\Compiler.h +# End Source File +# Begin Source File + +SOURCE=..\..\CpuArch.c +# End Source File +# Begin Source File + +SOURCE=..\..\CpuArch.h +# End Source File +# Begin Source File + SOURCE=..\..\IStream.h # End Source File # Begin Source File @@ -136,6 +156,10 @@ SOURCE=..\..\LzFindMt.h # End Source File # Begin Source File +SOURCE=..\..\LzFindOpt.c +# End Source File +# Begin Source File + SOURCE=..\..\LzHash.h # End Source File # Begin Source File @@ -164,6 +188,10 @@ SOURCE=..\..\LzmaLib.h # End Source File # Begin Source File +SOURCE=..\..\Precomp.h +# End Source File +# Begin Source File + SOURCE=.\resource.rc # End Source File # Begin Source File diff --git a/src/sdk/C/Util/LzmaLib/LzmaLibExports.c b/src/sdk/C/Util/LzmaLib/LzmaLibExports.c index 4a28a9a..a46c9a8 100644 --- a/src/sdk/C/Util/LzmaLib/LzmaLibExports.c +++ b/src/sdk/C/Util/LzmaLib/LzmaLibExports.c @@ -1,14 +1,15 @@ /* LzmaLibExports.c -- LZMA library DLL Entry point -2015-11-08 : Igor Pavlov : Public domain */ +2023-03-05 : Igor Pavlov : Public domain */ -#include "../../Precomp.h" +#include "Precomp.h" -#include +#include "../../7zWindows.h" +BOOL WINAPI DllMain(HINSTANCE hInstance, DWORD dwReason, LPVOID lpReserved); BOOL WINAPI DllMain(HINSTANCE hInstance, DWORD dwReason, LPVOID lpReserved) { - UNUSED_VAR(hInstance); - UNUSED_VAR(dwReason); - UNUSED_VAR(lpReserved); + UNUSED_VAR(hInstance) + UNUSED_VAR(dwReason) + UNUSED_VAR(lpReserved) return TRUE; } diff --git a/src/sdk/C/Util/LzmaLib/Precomp.c b/src/sdk/C/Util/LzmaLib/Precomp.c new file mode 100644 index 0000000..01605e3 --- /dev/null +++ b/src/sdk/C/Util/LzmaLib/Precomp.c @@ -0,0 +1,4 @@ +/* Precomp.c -- StdAfx +2013-01-21 : Igor Pavlov : Public domain */ + +#include "Precomp.h" diff --git a/src/sdk/C/Util/LzmaLib/Precomp.h b/src/sdk/C/Util/LzmaLib/Precomp.h new file mode 100644 index 0000000..13a41ef --- /dev/null +++ b/src/sdk/C/Util/LzmaLib/Precomp.h @@ -0,0 +1,13 @@ +/* Precomp.h -- Precomp +2024-01-23 : Igor Pavlov : Public domain */ + +// #ifndef ZIP7_INC_PRECOMP_LOC_H +// #define ZIP7_INC_PRECOMP_LOC_H + +#if defined(_MSC_VER) && _MSC_VER >= 1800 +#pragma warning(disable : 4464) // relative include path contains '..' 
+#endif + +#include "../../Precomp.h" + +// #endif diff --git a/src/sdk/C/Util/LzmaLib/makefile b/src/sdk/C/Util/LzmaLib/makefile index 74103bb..9ed0aa4 100644 --- a/src/sdk/C/Util/LzmaLib/makefile +++ b/src/sdk/C/Util/LzmaLib/makefile @@ -11,6 +11,7 @@ LIB_OBJS = \ C_OBJS = \ $O\Alloc.obj \ + $O\CpuArch.obj \ $O\LzFind.obj \ $O\LzFindMt.obj \ $O\LzmaDec.obj \ @@ -18,9 +19,14 @@ C_OBJS = \ $O\LzmaLib.obj \ $O\Threads.obj \ +!include "../../../CPP/7zip/LzFindOpt.mak" +!include "../../../CPP/7zip/LzmaDec.mak" + OBJS = \ + $O\Precomp.obj \ $(LIB_OBJS) \ $(C_OBJS) \ + $(ASM_OBJS) \ $O\resource.res !include "../../../CPP/Build.mak" @@ -28,7 +34,26 @@ OBJS = \ $(SLIBPATH): $O $(OBJS) lib -out:$(SLIBPATH) $(OBJS) $(LIBS) + +MAK_SINGLE_FILE = 1 + +$O\Precomp.obj: Precomp.c + $(CCOMPL_PCH) + +!IFDEF MAK_SINGLE_FILE + $(LIB_OBJS): $(*B).c - $(COMPL_O2) + $(CCOMPL_USE) $(C_OBJS): ../../$(*B).c - $(COMPL_O2) + $(CCOMPL_USE) + +!ELSE + +{.}.c{$O}.obj:: + $(CCOMPLB_USE) +{../../../C}.c{$O}.obj:: + $(CCOMPLB_USE) + +!ENDIF + +!include "../../Asm_c.mak" diff --git a/src/sdk/C/Util/SfxSetup/Precomp.h b/src/sdk/C/Util/SfxSetup/Precomp.h index 588a66f..13a41ef 100644 --- a/src/sdk/C/Util/SfxSetup/Precomp.h +++ b/src/sdk/C/Util/SfxSetup/Precomp.h @@ -1,10 +1,13 @@ -/* Precomp.h -- StdAfx -2013-06-16 : Igor Pavlov : Public domain */ +/* Precomp.h -- Precomp +2024-01-23 : Igor Pavlov : Public domain */ -#ifndef __7Z_PRECOMP_H -#define __7Z_PRECOMP_H - -#include "../../Compiler.h" -#include "../../7zTypes.h" +// #ifndef ZIP7_INC_PRECOMP_LOC_H +// #define ZIP7_INC_PRECOMP_LOC_H +#if defined(_MSC_VER) && _MSC_VER >= 1800 +#pragma warning(disable : 4464) // relative include path contains '..' #endif + +#include "../../Precomp.h" + +// #endif diff --git a/src/sdk/C/Util/SfxSetup/SfxSetup.c b/src/sdk/C/Util/SfxSetup/SfxSetup.c index ef19aea..9b5c1f9 100644 --- a/src/sdk/C/Util/SfxSetup/SfxSetup.c +++ b/src/sdk/C/Util/SfxSetup/SfxSetup.c @@ -1,5 +1,5 @@ /* SfxSetup.c - 7z SFX Setup -2019-02-02 : Igor Pavlov : Public domain */ +2024-01-24 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -26,6 +26,12 @@ #define kInputBufSize ((size_t)1 << 18) + +#define wcscat lstrcatW +#define wcslen (size_t)lstrlenW +#define wcscpy lstrcpyW +// wcsncpy() and lstrcpynW() work differently. We don't use them. + static const char * const kExts[] = { "bat" @@ -64,7 +70,7 @@ static unsigned FindExt(const wchar_t *s, unsigned *extLen) return len; } -#define MAKE_CHAR_UPPER(c) ((((c) >= 'a' && (c) <= 'z') ? (c) -= 0x20 : (c))) +#define MAKE_CHAR_UPPER(c) ((((c) >= 'a' && (c) <= 'z') ? 
(c) - 0x20 : (c))) static unsigned FindItem(const char * const *items, unsigned num, const wchar_t *s, unsigned len) { @@ -72,13 +78,13 @@ static unsigned FindItem(const char * const *items, unsigned num, const wchar_t for (i = 0; i < num; i++) { const char *item = items[i]; - unsigned itemLen = (unsigned)strlen(item); + const unsigned itemLen = (unsigned)strlen(item); unsigned j; if (len != itemLen) continue; for (j = 0; j < len; j++) { - unsigned c = (Byte)item[j]; + const unsigned c = (Byte)item[j]; if (c != s[j] && MAKE_CHAR_UPPER(c) != s[j]) break; } @@ -96,10 +102,20 @@ static BOOL WINAPI HandlerRoutine(DWORD ctrlType) } #endif + +#ifdef _CONSOLE +static void PrintStr(const char *s) +{ + fputs(s, stdout); +} +#endif + static void PrintErrorMessage(const char *message) { #ifdef _CONSOLE - printf("\n7-Zip Error: %s\n", message); + PrintStr("\n7-Zip Error: "); + PrintStr(message); + PrintStr("\n"); #else #ifdef UNDER_CE WCHAR messageW[256 + 4]; @@ -179,7 +195,7 @@ static WRes RemoveDirWithSubItems(WCHAR *path) WIN32_FIND_DATAW fd; HANDLE handle; WRes res = 0; - size_t len = wcslen(path); + const size_t len = wcslen(path); wcscpy(path + len, L"*"); handle = FindFirstFileW(path, &fd); path[len] = L'\0'; @@ -228,7 +244,7 @@ static WRes RemoveDirWithSubItems(WCHAR *path) } #ifdef _CONSOLE -int MY_CDECL main() +int Z7_CDECL main(void) #else int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, #ifdef UNDER_CE @@ -262,10 +278,10 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, #ifdef _CONSOLE SetConsoleCtrlHandler(HandlerRoutine, TRUE); #else - UNUSED_VAR(hInstance); - UNUSED_VAR(hPrevInstance); - UNUSED_VAR(lpCmdLine); - UNUSED_VAR(nCmdShow); + UNUSED_VAR(hInstance) + UNUSED_VAR(hPrevInstance) + UNUSED_VAR(lpCmdLine) + UNUSED_VAR(nCmdShow) #endif CrcGenerateTable(); @@ -290,7 +306,7 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, BoolInt quoteMode = False; for (;; cmdLineParams++) { - wchar_t c = *cmdLineParams; + const wchar_t c = *cmdLineParams; if (c == L'\"') quoteMode = !quoteMode; else if (c == 0 || (c == L' ' && !quoteMode)) @@ -324,7 +340,7 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, unsigned k; for (k = 0; k < 8; k++) { - unsigned t = value & 0xF; + const unsigned t = value & 0xF; value >>= 4; s[7 - k] = (wchar_t)((t < 10) ? ('0' + t) : ('A' + (t - 10))); } @@ -386,7 +402,7 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, { lookStream.bufSize = kInputBufSize; lookStream.realStream = &archiveStream.vt; - LookToRead2_Init(&lookStream); + LookToRead2_INIT(&lookStream) } } @@ -455,11 +471,11 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, unsigned extLen; const WCHAR *name = temp + nameStartPos; unsigned len = (unsigned)wcslen(name); - unsigned nameLen = FindExt(temp + nameStartPos, &extLen); - unsigned extPrice = FindItem(kExts, sizeof(kExts) / sizeof(kExts[0]), name + len - extLen, extLen); - unsigned namePrice = FindItem(kNames, sizeof(kNames) / sizeof(kNames[0]), name, nameLen); + const unsigned nameLen = FindExt(temp + nameStartPos, &extLen); + const unsigned extPrice = FindItem(kExts, sizeof(kExts) / sizeof(kExts[0]), name + len - extLen, extLen); + const unsigned namePrice = FindItem(kNames, sizeof(kNames) / sizeof(kNames[0]), name, nameLen); - unsigned price = namePrice + extPrice * 64 + (nameStartPos == 0 ? 0 : (1 << 12)); + const unsigned price = namePrice + extPrice * 64 + (nameStartPos == 0 ? 
0 : (1 << 12)); if (minPrice > price) { minPrice = price; @@ -500,12 +516,13 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, #endif { - SRes res2 = File_Close(&outFile); + const WRes res2 = File_Close(&outFile); if (res != SZ_OK) break; - if (res2 != SZ_OK) + if (res2 != 0) { - res = res2; + errorMessage = "Can't close output file"; + res = SZ_ERROR_FAIL; break; } } @@ -550,7 +567,7 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, WCHAR oldCurDir[MAX_PATH + 2]; oldCurDir[0] = 0; { - DWORD needLen = GetCurrentDirectory(MAX_PATH + 1, oldCurDir); + const DWORD needLen = GetCurrentDirectory(MAX_PATH + 1, oldCurDir); if (needLen == 0 || needLen > MAX_PATH) oldCurDir[0] = 0; SetCurrentDirectory(workCurDir); diff --git a/src/sdk/C/Util/SfxSetup/makefile b/src/sdk/C/Util/SfxSetup/makefile index 544da67..b3f25a2 100644 --- a/src/sdk/C/Util/SfxSetup/makefile +++ b/src/sdk/C/Util/SfxSetup/makefile @@ -1,13 +1,14 @@ PROG = 7zS2.sfx MY_FIXED = 1 +CFLAGS = $(CFLAGS) \ + -DZ7_EXTRACT_ONLY \ + C_OBJS = \ $O\7zAlloc.obj \ $O\7zArcIn.obj \ $O\7zBuf.obj \ $O\7zBuf2.obj \ - $O\7zCrc.obj \ - $O\7zCrcOpt.obj \ $O\7zFile.obj \ $O\7zDec.obj \ $O\7zStream.obj \ @@ -24,9 +25,13 @@ C_OBJS = \ 7Z_OBJS = \ $O\SfxSetup.obj \ +!include "../../../CPP/7zip/Crc.mak" +# !include "../../../CPP/7zip/LzmaDec.mak" + OBJS = \ $(7Z_OBJS) \ $(C_OBJS) \ + $(ASM_OBJS) \ $O\resource.res !include "../../../CPP/Build.mak" @@ -35,3 +40,5 @@ $(7Z_OBJS): $(*B).c $(COMPL_O1) $(C_OBJS): ../../$(*B).c $(COMPL_O1) + +!include "../../Asm_c.mak" diff --git a/src/sdk/C/Util/SfxSetup/makefile_con b/src/sdk/C/Util/SfxSetup/makefile_con index d0f8352..9f4b916 100644 --- a/src/sdk/C/Util/SfxSetup/makefile_con +++ b/src/sdk/C/Util/SfxSetup/makefile_con @@ -1,6 +1,8 @@ PROG = 7zS2con.sfx MY_FIXED = 1 -CFLAGS = $(CFLAGS) -D_CONSOLE + +CFLAGS = $(CFLAGS) -D_CONSOLE \ + -DZ7_EXTRACT_ONLY \ C_OBJS = \ $O\7zAlloc.obj \ diff --git a/src/sdk/C/Xz.c b/src/sdk/C/Xz.c index d9f83df..d07550d 100644 --- a/src/sdk/C/Xz.c +++ b/src/sdk/C/Xz.c @@ -1,5 +1,5 @@ /* Xz.c - Xz -2017-05-12 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -41,7 +41,7 @@ void Xz_Free(CXzStream *p, ISzAllocPtr alloc) unsigned XzFlags_GetCheckSize(CXzStreamFlags f) { unsigned t = XzFlags_GetCheckType(f); - return (t == 0) ? 0 : (4 << ((t - 1) / 3)); + return (t == 0) ? 
0 : ((unsigned)4 << ((t - 1) / 3)); } void XzCheck_Init(CXzCheck *p, unsigned mode) @@ -52,6 +52,7 @@ void XzCheck_Init(CXzCheck *p, unsigned mode) case XZ_CHECK_CRC32: p->crc = CRC_INIT_VAL; break; case XZ_CHECK_CRC64: p->crc64 = CRC64_INIT_VAL; break; case XZ_CHECK_SHA256: Sha256_Init(&p->sha); break; + default: break; } } @@ -62,6 +63,7 @@ void XzCheck_Update(CXzCheck *p, const void *data, size_t size) case XZ_CHECK_CRC32: p->crc = CrcUpdate(p->crc, data, size); break; case XZ_CHECK_CRC64: p->crc64 = Crc64Update(p->crc64, data, size); break; case XZ_CHECK_SHA256: Sha256_Update(&p->sha, (const Byte *)data, size); break; + default: break; } } @@ -70,7 +72,7 @@ int XzCheck_Final(CXzCheck *p, Byte *digest) switch (p->mode) { case XZ_CHECK_CRC32: - SetUi32(digest, CRC_GET_DIGEST(p->crc)); + SetUi32(digest, CRC_GET_DIGEST(p->crc)) break; case XZ_CHECK_CRC64: { diff --git a/src/sdk/C/Xz.h b/src/sdk/C/Xz.h index 544ee18..ad63b48 100644 --- a/src/sdk/C/Xz.h +++ b/src/sdk/C/Xz.h @@ -1,21 +1,24 @@ /* Xz.h - Xz interface -2018-07-04 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ -#ifndef __XZ_H -#define __XZ_H +#ifndef ZIP7_INC_XZ_H +#define ZIP7_INC_XZ_H #include "Sha256.h" +#include "Delta.h" EXTERN_C_BEGIN #define XZ_ID_Subblock 1 #define XZ_ID_Delta 3 -#define XZ_ID_X86 4 -#define XZ_ID_PPC 5 -#define XZ_ID_IA64 6 -#define XZ_ID_ARM 7 -#define XZ_ID_ARMT 8 +#define XZ_ID_X86 4 +#define XZ_ID_PPC 5 +#define XZ_ID_IA64 6 +#define XZ_ID_ARM 7 +#define XZ_ID_ARMT 8 #define XZ_ID_SPARC 9 +#define XZ_ID_ARM64 0xa +#define XZ_ID_RISCV 0xb #define XZ_ID_LZMA2 0x21 unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value); @@ -47,13 +50,13 @@ typedef struct CXzFilter filters[XZ_NUM_FILTERS_MAX]; } CXzBlock; -#define XzBlock_GetNumFilters(p) (((p)->flags & XZ_BF_NUM_FILTERS_MASK) + 1) +#define XzBlock_GetNumFilters(p) (((unsigned)(p)->flags & XZ_BF_NUM_FILTERS_MASK) + 1) #define XzBlock_HasPackSize(p) (((p)->flags & XZ_BF_PACK_SIZE) != 0) #define XzBlock_HasUnpackSize(p) (((p)->flags & XZ_BF_UNPACK_SIZE) != 0) #define XzBlock_HasUnsupportedFlags(p) (((p)->flags & ~(XZ_BF_NUM_FILTERS_MASK | XZ_BF_PACK_SIZE | XZ_BF_UNPACK_SIZE)) != 0) SRes XzBlock_Parse(CXzBlock *p, const Byte *header); -SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStream *inStream, BoolInt *isIndex, UInt32 *headerSizeRes); +SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes); /* ---------- xz stream ---------- */ @@ -101,7 +104,7 @@ typedef UInt16 CXzStreamFlags; unsigned XzFlags_GetCheckSize(CXzStreamFlags f); SRes Xz_ParseHeader(CXzStreamFlags *p, const Byte *buf); -SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStream *inStream); +SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream); typedef struct { @@ -112,11 +115,13 @@ typedef struct typedef struct { CXzStreamFlags flags; + // Byte _pad[6]; size_t numBlocks; CXzBlockSizes *blocks; UInt64 startOffset; } CXzStream; +#define Xz_CONSTRUCT(p) { (p)->numBlocks = 0; (p)->blocks = NULL; (p)->flags = 0; } void Xz_Construct(CXzStream *p); void Xz_Free(CXzStream *p, ISzAllocPtr alloc); @@ -132,9 +137,14 @@ typedef struct CXzStream *streams; } CXzs; +#define Xzs_CONSTRUCT(p) { (p)->num = 0; (p)->numAllocated = 0; (p)->streams = NULL; } void Xzs_Construct(CXzs *p); void Xzs_Free(CXzs *p, ISzAllocPtr alloc); -SRes Xzs_ReadBackward(CXzs *p, ILookInStream *inStream, Int64 *startOffset, ICompressProgress *progress, ISzAllocPtr alloc); +/* +Xzs_ReadBackward() must be called for empty CXzs object. 
+Xzs_ReadBackward() can return non empty object with (p->num != 0) even in case of error. +*/ +SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc); UInt64 Xzs_GetNumBlocks(const CXzs *p); UInt64 Xzs_GetUnpackSize(const CXzs *p); @@ -160,9 +170,9 @@ typedef enum } ECoderFinishMode; -typedef struct _IStateCoder +typedef struct { - void *p; + void *p; // state object; void (*Free)(void *p, ISzAllocPtr alloc); SRes (*SetProps)(void *p, const Byte *props, size_t propSize, ISzAllocPtr alloc); void (*Init)(void *p); @@ -174,6 +184,20 @@ typedef struct _IStateCoder } IStateCoder; +typedef struct +{ + UInt32 methodId; + UInt32 delta; + UInt32 ip; + UInt32 X86_State; + Byte delta_State[DELTA_STATE_SIZE]; +} CXzBcFilterStateBase; + +typedef SizeT (*Xz_Func_BcFilterStateBase_Filter)(CXzBcFilterStateBase *p, Byte *data, SizeT size); + +SRes Xz_StateCoder_Bc_SetFromMethod_Func(IStateCoder *p, UInt64 id, + Xz_Func_BcFilterStateBase_Filter func, ISzAllocPtr alloc); + #define MIXCODER_NUM_FILTERS_MAX 4 @@ -216,13 +240,13 @@ typedef enum typedef struct { EXzState state; - UInt32 pos; + unsigned pos; unsigned alignPos; unsigned indexPreSize; CXzStreamFlags streamFlags; - UInt32 blockHeaderSize; + unsigned blockHeaderSize; UInt64 packSize; UInt64 unpackSize; @@ -250,8 +274,8 @@ typedef struct size_t outBufSize; size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked - Byte shaDigest[SHA256_DIGEST_SIZE]; - Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; + UInt32 shaDigest32[SHA256_DIGEST_SIZE / 4]; + Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; // it must be aligned for 4-bytes } CXzUnpacker; /* alloc : aligned for cache line allocation is better */ @@ -277,7 +301,10 @@ void XzUnpacker_Free(CXzUnpacker *p); { XzUnpacker_Init() for() + { XzUnpacker_Code(); + } + XzUnpacker_IsStreamWasFinished() } Interface-2 : Direct output buffer: @@ -288,7 +315,10 @@ void XzUnpacker_Free(CXzUnpacker *p); XzUnpacker_Init() XzUnpacker_SetOutBufMode(); // to set output buffer and size for() + { XzUnpacker_Code(); // (dest = NULL) in XzUnpacker_Code() + } + XzUnpacker_IsStreamWasFinished() } Interface-3 : Direct output buffer : One call full decoding @@ -296,6 +326,7 @@ void XzUnpacker_Free(CXzUnpacker *p); It uses Interface-2 internally. { XzUnpacker_CodeFull() + XzUnpacker_IsStreamWasFinished() } */ @@ -309,8 +340,12 @@ void XzUnpacker_Free(CXzUnpacker *p); SZ_OK status: CODER_STATUS_NOT_FINISHED, - CODER_STATUS_NEEDS_MORE_INPUT - maybe there are more xz streams, - call XzUnpacker_IsStreamWasFinished to check that current stream was finished + CODER_STATUS_NEEDS_MORE_INPUT - the decoder can return it in two cases: + 1) it needs more input data to finish current xz stream + 2) xz stream was finished successfully. But the decoder supports multiple + concatented xz streams. So it expects more input data for new xz streams. + Call XzUnpacker_IsStreamWasFinished() to check that latest xz stream was finished successfully. + SZ_ERROR_MEM - Memory allocation error SZ_ERROR_DATA - Data error SZ_ERROR_UNSUPPORTED - Unsupported method or method properties @@ -335,12 +370,17 @@ SRes XzUnpacker_CodeFull(CXzUnpacker *p, Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, ECoderFinishMode finishMode, ECoderStatus *status); +/* +If you decode full xz stream(s), then you can call XzUnpacker_IsStreamWasFinished() +after successful XzUnpacker_CodeFull() or after last call of XzUnpacker_Code(). 
+*/ + BoolInt XzUnpacker_IsStreamWasFinished(const CXzUnpacker *p); /* -XzUnpacker_GetExtraSize() returns then number of uncofirmed bytes, +XzUnpacker_GetExtraSize() returns then number of unconfirmed bytes, if it's in (XZ_STATE_STREAM_HEADER) state or in (XZ_STATE_STREAM_PADDING) state. -These bytes can be some bytes after xz archive, or +These bytes can be some data after xz archive, or it can be start of new xz stream. Call XzUnpacker_GetExtraSize() after XzUnpacker_Code() function to detect real size of @@ -371,29 +411,57 @@ BoolInt XzUnpacker_IsBlockFinished(const CXzUnpacker *p); -/* ---------- Multi Threading Decoding ---------- */ + + + +/* ---- Single-Thread and Multi-Thread xz Decoding with Input/Output Streams ---- */ + +/* + if (CXzDecMtProps::numThreads > 1), the decoder can try to use + Multi-Threading. The decoder analyses xz block header, and if + there are pack size and unpack size values stored in xz block header, + the decoder reads compressed data of block to internal buffers, + and then it can start parallel decoding, if there are another blocks. + The decoder can switch back to Single-Thread decoding after some conditions. + + The sequence of calls for xz decoding with in/out Streams: + { + XzDecMt_Create() + XzDecMtProps_Init(XzDecMtProps) to set default values of properties + // then you can change some XzDecMtProps parameters with required values + // here you can set the number of threads and (memUseMax) - the maximum + Memory usage for multithreading decoding. + for() + { + XzDecMt_Decode() // one call per one file + } + XzDecMt_Destroy() + } +*/ typedef struct { - size_t inBufSize_ST; - size_t outStep_ST; - BoolInt ignoreErrors; + size_t inBufSize_ST; // size of input buffer for Single-Thread decoding + size_t outStep_ST; // size of output buffer for Single-Thread decoding + BoolInt ignoreErrors; // if set to 1, the decoder can ignore some errors and it skips broken parts of data. - #ifndef _7ZIP_ST - unsigned numThreads; - size_t inBufSize_MT; - size_t memUseMax; + #ifndef Z7_ST + unsigned numThreads; // the number of threads for Multi-Thread decoding. if (umThreads == 1) it will use Single-thread decoding + size_t inBufSize_MT; // size of small input data buffers for Multi-Thread decoding. Big number of such small buffers can be created + size_t memUseMax; // the limit of total memory usage for Multi-Thread decoding. + // it's recommended to set (memUseMax) manually to value that is smaller of total size of RAM in computer. #endif } CXzDecMtProps; void XzDecMtProps_Init(CXzDecMtProps *p); - -typedef void * CXzDecMtHandle; +typedef struct CXzDecMt CXzDecMt; +typedef CXzDecMt * CXzDecMtHandle; +// Z7_DECLARE_HANDLE(CXzDecMtHandle) /* - alloc : XzDecMt uses CAlignOffsetAlloc for addresses allocated by (alloc). + alloc : XzDecMt uses CAlignOffsetAlloc internally for addresses allocated by (alloc). allocMid : for big allocations, aligned allocation is better */ @@ -407,33 +475,46 @@ typedef struct Byte NumStreams_Defined; Byte NumBlocks_Defined; - Byte DataAfterEnd; + Byte DataAfterEnd; // there are some additional data after good xz streams, and that data is not new xz stream. Byte DecodingTruncated; // Decoding was Truncated, we need only partial output data - UInt64 InSize; // pack size processed + UInt64 InSize; // pack size processed. 
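The multi-threaded decoding sequence documented above maps onto very little caller code. A hedged sketch follows (DecodeXzFileMt_sketch is an invented wrapper, the thread count is arbitrary, and it assumes the global CRC/SHA tables were already initialized at startup):

static SRes DecodeXzFileMt_sketch(ISeqInStreamPtr inStream, ISeqOutStreamPtr outStream,
    ISzAllocPtr alloc, ISzAllocPtr allocMid)
{
  CXzDecMtProps props;
  CXzStatInfo stat;
  int isMT = 0;
  SRes res;
  CXzDecMtHandle dec = XzDecMt_Create(alloc, allocMid);
  if (!dec)
    return SZ_ERROR_MEM;
  XzDecMtProps_Init(&props);
#ifndef Z7_ST
  props.numThreads = 4;   /* > 1 requests the multi-threaded path */
#endif
  res = XzDecMt_Decode(dec, &props,
      NULL,               /* outDataSize: undefined */
      1,                  /* finishMode: stream(s) must be finished */
      outStream,
      inStream,
      &stat,
      &isMT,
      NULL);              /* no progress callback */
  XzDecMt_Destroy(dec);
  /* res equals stat.CombinedRes; stat.DataAfterEnd marks trailing data
     that followed otherwise valid xz streams */
  return res;
}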
That value doesn't include the data after + // end of xz stream, if that data was not correct UInt64 OutSize; UInt64 NumStreams; UInt64 NumBlocks; - SRes DecodeRes; - SRes ReadRes; - SRes ProgressRes; - SRes CombinedRes; - SRes CombinedRes_Type; + SRes DecodeRes; // the error code of xz streams data decoding + SRes ReadRes; // error code from ISeqInStream:Read() + SRes ProgressRes; // error code from ICompressProgress:Progress() + SRes CombinedRes; // Combined result error code that shows main rusult + // = S_OK, if there is no error. + // but check also (DataAfterEnd) that can show additional minor errors. + + SRes CombinedRes_Type; // = SZ_ERROR_READ, if error from ISeqInStream + // = SZ_ERROR_PROGRESS, if error from ICompressProgress + // = SZ_ERROR_WRITE, if error from ISeqOutStream + // = SZ_ERROR_* codes for decoding } CXzStatInfo; void XzStatInfo_Clear(CXzStatInfo *p); /* + XzDecMt_Decode() -SRes: - SZ_OK - OK +SRes: it's combined decoding result. It also is equal to stat->CombinedRes. + + SZ_OK - no error + check also output value in (stat->DataAfterEnd) + that can show additional possible error + SZ_ERROR_MEM - Memory allocation error SZ_ERROR_NO_ARCHIVE - is not xz archive SZ_ERROR_ARCHIVE - Headers error SZ_ERROR_DATA - Data Error + SZ_ERROR_UNSUPPORTED - Unsupported method or method properties SZ_ERROR_CRC - CRC Error SZ_ERROR_INPUT_EOF - it needs more input data SZ_ERROR_WRITE - ISeqOutStream error @@ -447,13 +528,14 @@ SRes XzDecMt_Decode(CXzDecMtHandle p, const CXzDecMtProps *props, const UInt64 *outDataSize, // NULL means undefined int finishMode, // 0 - partial unpacking is allowed, 1 - xz stream(s) must be finished - ISeqOutStream *outStream, + ISeqOutStreamPtr outStream, // Byte *outBuf, size_t *outBufSize, - ISeqInStream *inStream, + ISeqInStreamPtr inStream, // const Byte *inData, size_t inDataSize, - CXzStatInfo *stat, - int *isMT, // 0 means that ST (Single-Thread) version was used - ICompressProgress *progress); + CXzStatInfo *stat, // out: decoding results and statistics + int *isMT, // out: 0 means that ST (Single-Thread) version was used + // 1 means that MT (Multi-Thread) version was used + ICompressProgressPtr progress); EXTERN_C_END diff --git a/src/sdk/C/XzCrc64.c b/src/sdk/C/XzCrc64.c index b6d02cb..94fc1af 100644 --- a/src/sdk/C/XzCrc64.c +++ b/src/sdk/C/XzCrc64.c @@ -1,5 +1,5 @@ /* XzCrc64.c -- CRC64 calculation -2017-05-23 : Igor Pavlov : Public domain */ +2023-12-08 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -8,45 +8,76 @@ #define kCrc64Poly UINT64_CONST(0xC96C5795D7870F42) -#ifdef MY_CPU_LE - #define CRC64_NUM_TABLES 4 +// for debug only : define Z7_CRC64_DEBUG_BE to test big-endian code in little-endian cpu +// #define Z7_CRC64_DEBUG_BE +#ifdef Z7_CRC64_DEBUG_BE +#undef MY_CPU_LE +#define MY_CPU_BE +#endif + +#ifdef Z7_CRC64_NUM_TABLES + #define Z7_CRC64_NUM_TABLES_USE Z7_CRC64_NUM_TABLES #else - #define CRC64_NUM_TABLES 5 - #define CRC_UINT64_SWAP(v) \ - ((v >> 56) \ - | ((v >> 40) & ((UInt64)0xFF << 8)) \ - | ((v >> 24) & ((UInt64)0xFF << 16)) \ - | ((v >> 8) & ((UInt64)0xFF << 24)) \ - | ((v << 8) & ((UInt64)0xFF << 32)) \ - | ((v << 24) & ((UInt64)0xFF << 40)) \ - | ((v << 40) & ((UInt64)0xFF << 48)) \ - | ((v << 56))) - - UInt64 MY_FAST_CALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table); + #define Z7_CRC64_NUM_TABLES_USE 12 +#endif + +#if Z7_CRC64_NUM_TABLES_USE < 1 + #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES #endif + +#if Z7_CRC64_NUM_TABLES_USE != 1 + #ifndef MY_CPU_BE - UInt64 MY_FAST_CALL 
XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table); + #define FUNC_NAME_LE_2(s) XzCrc64UpdateT ## s + #define FUNC_NAME_LE_1(s) FUNC_NAME_LE_2(s) + #define FUNC_NAME_LE FUNC_NAME_LE_1(Z7_CRC64_NUM_TABLES_USE) + UInt64 Z7_FASTCALL FUNC_NAME_LE (UInt64 v, const void *data, size_t size, const UInt64 *table); +#endif +#ifndef MY_CPU_LE + #define FUNC_NAME_BE_2(s) XzCrc64UpdateBeT ## s + #define FUNC_NAME_BE_1(s) FUNC_NAME_BE_2(s) + #define FUNC_NAME_BE FUNC_NAME_BE_1(Z7_CRC64_NUM_TABLES_USE) + UInt64 Z7_FASTCALL FUNC_NAME_BE (UInt64 v, const void *data, size_t size, const UInt64 *table); #endif -typedef UInt64 (MY_FAST_CALL *CRC64_FUNC)(UInt64 v, const void *data, size_t size, const UInt64 *table); +#if defined(MY_CPU_LE) + #define FUNC_REF FUNC_NAME_LE +#elif defined(MY_CPU_BE) + #define FUNC_REF FUNC_NAME_BE +#else + #define FUNC_REF g_Crc64Update + static UInt64 (Z7_FASTCALL *FUNC_REF)(UInt64 v, const void *data, size_t size, const UInt64 *table); +#endif -static CRC64_FUNC g_Crc64Update; -UInt64 g_Crc64Table[256 * CRC64_NUM_TABLES]; +#endif + + +MY_ALIGN(64) +static UInt64 g_Crc64Table[256 * Z7_CRC64_NUM_TABLES_USE]; -UInt64 MY_FAST_CALL Crc64Update(UInt64 v, const void *data, size_t size) -{ - return g_Crc64Update(v, data, size, g_Crc64Table); -} -UInt64 MY_FAST_CALL Crc64Calc(const void *data, size_t size) +UInt64 Z7_FASTCALL Crc64Update(UInt64 v, const void *data, size_t size) { - return g_Crc64Update(CRC64_INIT_VAL, data, size, g_Crc64Table) ^ CRC64_INIT_VAL; +#if Z7_CRC64_NUM_TABLES_USE == 1 + #define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) + const UInt64 *table = g_Crc64Table; + const Byte *p = (const Byte *)data; + const Byte *lim = p + size; + for (; p != lim; p++) + v = CRC64_UPDATE_BYTE_2(v, *p); + return v; + #undef CRC64_UPDATE_BYTE_2 +#else + return FUNC_REF (v, data, size, g_Crc64Table); +#endif } -void MY_FAST_CALL Crc64GenerateTable() + +Z7_NO_INLINE +void Z7_FASTCALL Crc64GenerateTable(void) { - UInt32 i; + unsigned i; for (i = 0; i < 256; i++) { UInt64 r = i; @@ -55,32 +86,55 @@ void MY_FAST_CALL Crc64GenerateTable() r = (r >> 1) ^ (kCrc64Poly & ((UInt64)0 - (r & 1))); g_Crc64Table[i] = r; } - for (i = 256; i < 256 * CRC64_NUM_TABLES; i++) + +#if Z7_CRC64_NUM_TABLES_USE != 1 +#if 1 || 1 && defined(MY_CPU_X86) // low register count + for (i = 0; i < 256 * (Z7_CRC64_NUM_TABLES_USE - 1); i++) { - UInt64 r = g_Crc64Table[(size_t)i - 256]; - g_Crc64Table[i] = g_Crc64Table[r & 0xFF] ^ (r >> 8); + const UInt64 r0 = g_Crc64Table[(size_t)i]; + g_Crc64Table[(size_t)i + 256] = g_Crc64Table[(Byte)r0] ^ (r0 >> 8); } - - #ifdef MY_CPU_LE - - g_Crc64Update = XzCrc64UpdateT4; +#else + for (i = 0; i < 256 * (Z7_CRC64_NUM_TABLES_USE - 1); i += 2) + { + UInt64 r0 = g_Crc64Table[(size_t)(i) ]; + UInt64 r1 = g_Crc64Table[(size_t)(i) + 1]; + r0 = g_Crc64Table[(Byte)r0] ^ (r0 >> 8); + r1 = g_Crc64Table[(Byte)r1] ^ (r1 >> 8); + g_Crc64Table[(size_t)i + 256 ] = r0; + g_Crc64Table[(size_t)i + 256 + 1] = r1; + } +#endif - #else +#ifndef MY_CPU_LE { - #ifndef MY_CPU_BE +#ifndef MY_CPU_BE UInt32 k = 1; if (*(const Byte *)&k == 1) - g_Crc64Update = XzCrc64UpdateT4; + FUNC_REF = FUNC_NAME_LE; else - #endif +#endif { - for (i = 256 * CRC64_NUM_TABLES - 1; i >= 256; i--) +#ifndef MY_CPU_BE + FUNC_REF = FUNC_NAME_BE; +#endif + for (i = 0; i < 256 * Z7_CRC64_NUM_TABLES_USE; i++) { - UInt64 x = g_Crc64Table[(size_t)i - 256]; - g_Crc64Table[i] = CRC_UINT64_SWAP(x); + const UInt64 x = g_Crc64Table[i]; + g_Crc64Table[i] = Z7_BSWAP64(x); } - 
g_Crc64Update = XzCrc64UpdateT1_BeT4; } } - #endif +#endif // ndef MY_CPU_LE +#endif // Z7_CRC64_NUM_TABLES_USE != 1 } + +#undef kCrc64Poly +#undef Z7_CRC64_NUM_TABLES_USE +#undef FUNC_REF +#undef FUNC_NAME_LE_2 +#undef FUNC_NAME_LE_1 +#undef FUNC_NAME_LE +#undef FUNC_NAME_BE_2 +#undef FUNC_NAME_BE_1 +#undef FUNC_NAME_BE diff --git a/src/sdk/C/XzCrc64.h b/src/sdk/C/XzCrc64.h index 08dbc33..04f8153 100644 --- a/src/sdk/C/XzCrc64.h +++ b/src/sdk/C/XzCrc64.h @@ -1,8 +1,8 @@ /* XzCrc64.h -- CRC64 calculation -2013-01-18 : Igor Pavlov : Public domain */ +2023-12-08 : Igor Pavlov : Public domain */ -#ifndef __XZ_CRC64_H -#define __XZ_CRC64_H +#ifndef ZIP7_INC_XZ_CRC64_H +#define ZIP7_INC_XZ_CRC64_H #include @@ -10,16 +10,16 @@ EXTERN_C_BEGIN -extern UInt64 g_Crc64Table[]; +// extern UInt64 g_Crc64Table[]; -void MY_FAST_CALL Crc64GenerateTable(void); +void Z7_FASTCALL Crc64GenerateTable(void); #define CRC64_INIT_VAL UINT64_CONST(0xFFFFFFFFFFFFFFFF) #define CRC64_GET_DIGEST(crc) ((crc) ^ CRC64_INIT_VAL) -#define CRC64_UPDATE_BYTE(crc, b) (g_Crc64Table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +// #define CRC64_UPDATE_BYTE(crc, b) (g_Crc64Table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) -UInt64 MY_FAST_CALL Crc64Update(UInt64 crc, const void *data, size_t size); -UInt64 MY_FAST_CALL Crc64Calc(const void *data, size_t size); +UInt64 Z7_FASTCALL Crc64Update(UInt64 crc, const void *data, size_t size); +// UInt64 Z7_FASTCALL Crc64Calc(const void *data, size_t size); EXTERN_C_END diff --git a/src/sdk/C/XzCrc64Opt.c b/src/sdk/C/XzCrc64Opt.c index b2852de..6eea4a3 100644 --- a/src/sdk/C/XzCrc64Opt.c +++ b/src/sdk/C/XzCrc64Opt.c @@ -1,69 +1,261 @@ -/* XzCrc64Opt.c -- CRC64 calculation -2017-06-30 : Igor Pavlov : Public domain */ +/* XzCrc64Opt.c -- CRC64 calculation (optimized functions) +: Igor Pavlov : Public domain */ #include "Precomp.h" #include "CpuArch.h" +#if !defined(Z7_CRC64_NUM_TABLES) || Z7_CRC64_NUM_TABLES > 1 + +// for debug only : define Z7_CRC64_DEBUG_BE to test big-endian code in little-endian cpu +// #define Z7_CRC64_DEBUG_BE +#ifdef Z7_CRC64_DEBUG_BE +#undef MY_CPU_LE +#define MY_CPU_BE +#endif + +#if defined(MY_CPU_64BIT) +#define Z7_CRC64_USE_64BIT +#endif + +// the value Z7_CRC64_NUM_TABLES_USE must be defined to same value as in XzCrc64.c +#ifdef Z7_CRC64_NUM_TABLES +#define Z7_CRC64_NUM_TABLES_USE Z7_CRC64_NUM_TABLES +#else +#define Z7_CRC64_NUM_TABLES_USE 12 +#endif + +#if Z7_CRC64_NUM_TABLES_USE % 4 || \ + Z7_CRC64_NUM_TABLES_USE < 4 || \ + Z7_CRC64_NUM_TABLES_USE > 4 * 4 + #error Stop_Compiling_Bad_CRC64_NUM_TABLES +#endif + + #ifndef MY_CPU_BE -#define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +#define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) + +#if defined(Z7_CRC64_USE_64BIT) && (Z7_CRC64_NUM_TABLES_USE % 8 == 0) + +#define Q64LE(n, d) \ + ( (table + ((n) * 8 + 7) * 0x100)[((d) ) & 0xFF] \ + ^ (table + ((n) * 8 + 6) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 5) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 4) * 0x100)[((d) >> 3 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 3) * 0x100)[((d) >> 4 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 2) * 0x100)[((d) >> 5 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 1) * 0x100)[((d) >> 6 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 0) * 0x100)[((d) >> 7 * 8)] ) + +#define R64(a) *((const UInt64 *)(const void *)p + (a)) + +#else + +#define Q32LE(n, d) \ + ( (table + ((n) * 4 + 3) * 0x100)[((d) ) & 0xFF] \ + ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 
4 + 1) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 0) * 0x100)[((d) >> 3 * 8)] ) + +#define R32(a) *((const UInt32 *)(const void *)p + (a)) + +#endif + + +#define CRC64_FUNC_PRE_LE2(step) \ +UInt64 Z7_FASTCALL XzCrc64UpdateT ## step (UInt64 v, const void *data, size_t size, const UInt64 *table) + +#define CRC64_FUNC_PRE_LE(step) \ + CRC64_FUNC_PRE_LE2(step); \ + CRC64_FUNC_PRE_LE2(step) -UInt64 MY_FAST_CALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table) +CRC64_FUNC_PRE_LE(Z7_CRC64_NUM_TABLES_USE) { const Byte *p = (const Byte *)data; - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) + const Byte *lim; + for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC64_NUM_TABLES_USE & 4))) != 0; size--, p++) v = CRC64_UPDATE_BYTE_2(v, *p); - for (; size >= 4; size -= 4, p += 4) + lim = p + size; + if (size >= Z7_CRC64_NUM_TABLES_USE) { - UInt32 d = (UInt32)v ^ *(const UInt32 *)p; - v = (v >> 32) - ^ (table + 0x300)[((d ) & 0xFF)] - ^ (table + 0x200)[((d >> 8) & 0xFF)] - ^ (table + 0x100)[((d >> 16) & 0xFF)] - ^ (table + 0x000)[((d >> 24))]; + lim -= Z7_CRC64_NUM_TABLES_USE; + do + { +#if Z7_CRC64_NUM_TABLES_USE == 4 + const UInt32 d = (UInt32)v ^ R32(0); + v = (v >> 32) ^ Q32LE(0, d); +#elif Z7_CRC64_NUM_TABLES_USE == 8 +#ifdef Z7_CRC64_USE_64BIT + v ^= R64(0); + v = Q64LE(0, v); +#else + UInt32 v0, v1; + v0 = (UInt32)v ^ R32(0); + v1 = (UInt32)(v >> 32) ^ R32(1); + v = Q32LE(1, v0) ^ Q32LE(0, v1); +#endif +#elif Z7_CRC64_NUM_TABLES_USE == 12 + UInt32 w; + UInt32 v0, v1; + v0 = (UInt32)v ^ R32(0); + v1 = (UInt32)(v >> 32) ^ R32(1); + w = R32(2); + v = Q32LE(0, w); + v ^= Q32LE(2, v0) ^ Q32LE(1, v1); +#elif Z7_CRC64_NUM_TABLES_USE == 16 +#ifdef Z7_CRC64_USE_64BIT + UInt64 w; + UInt64 x; + w = R64(1); x = Q64LE(0, w); + v ^= R64(0); v = x ^ Q64LE(1, v); +#else + UInt32 v0, v1; + UInt32 r0, r1; + v0 = (UInt32)v ^ R32(0); + v1 = (UInt32)(v >> 32) ^ R32(1); + r0 = R32(2); + r1 = R32(3); + v = Q32LE(1, r0) ^ Q32LE(0, r1); + v ^= Q32LE(3, v0) ^ Q32LE(2, v1); +#endif +#else +#error Stop_Compiling_Bad_CRC64_NUM_TABLES +#endif + p += Z7_CRC64_NUM_TABLES_USE; + } + while (p <= lim); + lim += Z7_CRC64_NUM_TABLES_USE; } - for (; size > 0; size--, p++) + for (; p < lim; p++) v = CRC64_UPDATE_BYTE_2(v, *p); return v; } +#undef CRC64_UPDATE_BYTE_2 +#undef R32 +#undef R64 +#undef Q32LE +#undef Q64LE +#undef CRC64_FUNC_PRE_LE +#undef CRC64_FUNC_PRE_LE2 + #endif + + #ifndef MY_CPU_LE -#define CRC_UINT64_SWAP(v) \ - ((v >> 56) \ - | ((v >> 40) & ((UInt64)0xFF << 8)) \ - | ((v >> 24) & ((UInt64)0xFF << 16)) \ - | ((v >> 8) & ((UInt64)0xFF << 24)) \ - | ((v << 8) & ((UInt64)0xFF << 32)) \ - | ((v << 24) & ((UInt64)0xFF << 40)) \ - | ((v << 40) & ((UInt64)0xFF << 48)) \ - | ((v << 56))) +#define CRC64_UPDATE_BYTE_2_BE(crc, b) (table[((crc) >> 56) ^ (b)] ^ ((crc) << 8)) + +#if defined(Z7_CRC64_USE_64BIT) && (Z7_CRC64_NUM_TABLES_USE % 8 == 0) + +#define Q64BE(n, d) \ + ( (table + ((n) * 8 + 0) * 0x100)[(Byte)(d)] \ + ^ (table + ((n) * 8 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 3) * 0x100)[((d) >> 3 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 4) * 0x100)[((d) >> 4 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 5) * 0x100)[((d) >> 5 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 6) * 0x100)[((d) >> 6 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 7) * 0x100)[((d) >> 7 * 8)] ) + +#ifdef Z7_CRC64_DEBUG_BE + #define R64BE(a) GetBe64a((const UInt64 *)(const void *)p + (a)) +#else + #define R64BE(a) *((const UInt64 
*)(const void *)p + (a)) +#endif + +#else -#define CRC64_UPDATE_BYTE_2_BE(crc, b) (table[(Byte)((crc) >> 56) ^ (b)] ^ ((crc) << 8)) +#define Q32BE(n, d) \ + ( (table + ((n) * 4 + 0) * 0x100)[(Byte)(d)] \ + ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 3) * 0x100)[((d) >> 3 * 8)] ) -UInt64 MY_FAST_CALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table) +#ifdef Z7_CRC64_DEBUG_BE + #define R32BE(a) GetBe32a((const UInt32 *)(const void *)p + (a)) +#else + #define R32BE(a) *((const UInt32 *)(const void *)p + (a)) +#endif + +#endif + +#define CRC64_FUNC_PRE_BE2(step) \ +UInt64 Z7_FASTCALL XzCrc64UpdateBeT ## step (UInt64 v, const void *data, size_t size, const UInt64 *table) + +#define CRC64_FUNC_PRE_BE(step) \ + CRC64_FUNC_PRE_BE2(step); \ + CRC64_FUNC_PRE_BE2(step) + +CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE) { const Byte *p = (const Byte *)data; - table += 0x100; - v = CRC_UINT64_SWAP(v); - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) + const Byte *lim; + v = Z7_BSWAP64(v); + for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC64_NUM_TABLES_USE & 4))) != 0; size--, p++) v = CRC64_UPDATE_BYTE_2_BE(v, *p); - for (; size >= 4; size -= 4, p += 4) + lim = p + size; + if (size >= Z7_CRC64_NUM_TABLES_USE) { - UInt32 d = (UInt32)(v >> 32) ^ *(const UInt32 *)p; - v = (v << 32) - ^ (table + 0x000)[((d ) & 0xFF)] - ^ (table + 0x100)[((d >> 8) & 0xFF)] - ^ (table + 0x200)[((d >> 16) & 0xFF)] - ^ (table + 0x300)[((d >> 24))]; + lim -= Z7_CRC64_NUM_TABLES_USE; + do + { +#if Z7_CRC64_NUM_TABLES_USE == 4 + const UInt32 d = (UInt32)(v >> 32) ^ R32BE(0); + v = (v << 32) ^ Q32BE(0, d); +#elif Z7_CRC64_NUM_TABLES_USE == 12 + const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); + const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); + const UInt32 w = R32BE(2); + v = Q32BE(0, w); + v ^= Q32BE(2, d1) ^ Q32BE(1, d0); + +#elif Z7_CRC64_NUM_TABLES_USE == 8 + #ifdef Z7_CRC64_USE_64BIT + v ^= R64BE(0); + v = Q64BE(0, v); + #else + const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); + const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); + v = Q32BE(1, d1) ^ Q32BE(0, d0); + #endif +#elif Z7_CRC64_NUM_TABLES_USE == 16 + #ifdef Z7_CRC64_USE_64BIT + const UInt64 w = R64BE(1); + v ^= R64BE(0); + v = Q64BE(0, w) ^ Q64BE(1, v); + #else + const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); + const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); + const UInt32 w1 = R32BE(2); + const UInt32 w0 = R32BE(3); + v = Q32BE(1, w1) ^ Q32BE(0, w0); + v ^= Q32BE(3, d1) ^ Q32BE(2, d0); + #endif +#else +#error Stop_Compiling_Bad_CRC64_NUM_TABLES +#endif + p += Z7_CRC64_NUM_TABLES_USE; + } + while (p <= lim); + lim += Z7_CRC64_NUM_TABLES_USE; } - for (; size > 0; size--, p++) + for (; p < lim; p++) v = CRC64_UPDATE_BYTE_2_BE(v, *p); - return CRC_UINT64_SWAP(v); + return Z7_BSWAP64(v); } +#undef CRC64_UPDATE_BYTE_2_BE +#undef R32BE +#undef R64BE +#undef Q32BE +#undef Q64BE +#undef CRC64_FUNC_PRE_BE +#undef CRC64_FUNC_PRE_BE2 + +#endif +#undef Z7_CRC64_NUM_TABLES_USE #endif diff --git a/src/sdk/C/XzDec.c b/src/sdk/C/XzDec.c index 395e83f..2dac324 100644 --- a/src/sdk/C/XzDec.c +++ b/src/sdk/C/XzDec.c @@ -1,5 +1,5 @@ /* XzDec.c -- Xz Decode -2019-02-02 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -59,7 +59,7 @@ unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value) for (i = 0; i < limit;) { - Byte b = p[i]; + const unsigned b = p[i]; *value |= (UInt64)(b & 0x7F) << (7 * i++); if 
((b & 0x80) == 0) return (b == 0 && i != 1) ? 0 : i; @@ -67,7 +67,8 @@ unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value) return 0; } -/* ---------- BraState ---------- */ + +/* ---------- XzBcFilterState ---------- */ #define BRA_BUF_SIZE (1 << 14) @@ -76,55 +77,60 @@ typedef struct size_t bufPos; size_t bufConv; size_t bufTotal; + Byte *buf; // must be aligned for 4 bytes + Xz_Func_BcFilterStateBase_Filter filter_func; + // int encodeMode; + CXzBcFilterStateBase base; + // Byte buf[BRA_BUF_SIZE]; +} CXzBcFilterState; - int encodeMode; - - UInt32 methodId; - UInt32 delta; - UInt32 ip; - UInt32 x86State; - Byte deltaState[DELTA_STATE_SIZE]; - Byte buf[BRA_BUF_SIZE]; -} CBraState; - -static void BraState_Free(void *pp, ISzAllocPtr alloc) +static void XzBcFilterState_Free(void *pp, ISzAllocPtr alloc) { - ISzAlloc_Free(alloc, pp); + if (pp) + { + CXzBcFilterState *p = ((CXzBcFilterState *)pp); + ISzAlloc_Free(alloc, p->buf); + ISzAlloc_Free(alloc, pp); + } } -static SRes BraState_SetProps(void *pp, const Byte *props, size_t propSize, ISzAllocPtr alloc) + +static SRes XzBcFilterState_SetProps(void *pp, const Byte *props, size_t propSize, ISzAllocPtr alloc) { - CBraState *p = ((CBraState *)pp); - UNUSED_VAR(alloc); + CXzBcFilterStateBase *p = &((CXzBcFilterState *)pp)->base; + UNUSED_VAR(alloc) p->ip = 0; if (p->methodId == XZ_ID_Delta) { if (propSize != 1) return SZ_ERROR_UNSUPPORTED; - p->delta = (unsigned)props[0] + 1; + p->delta = (UInt32)props[0] + 1; } else { if (propSize == 4) { - UInt32 v = GetUi32(props); + const UInt32 v = GetUi32(props); switch (p->methodId) { case XZ_ID_PPC: case XZ_ID_ARM: case XZ_ID_SPARC: - if ((v & 3) != 0) + case XZ_ID_ARM64: + if (v & 3) return SZ_ERROR_UNSUPPORTED; break; case XZ_ID_ARMT: - if ((v & 1) != 0) + case XZ_ID_RISCV: + if (v & 1) return SZ_ERROR_UNSUPPORTED; break; case XZ_ID_IA64: - if ((v & 0xF) != 0) + if (v & 0xf) return SZ_ERROR_UNSUPPORTED; break; + default: break; } p->ip = v; } @@ -134,73 +140,91 @@ static SRes BraState_SetProps(void *pp, const Byte *props, size_t propSize, ISzA return SZ_OK; } -static void BraState_Init(void *pp) + +static void XzBcFilterState_Init(void *pp) { - CBraState *p = ((CBraState *)pp); + CXzBcFilterState *p = ((CXzBcFilterState *)pp); p->bufPos = p->bufConv = p->bufTotal = 0; - x86_Convert_Init(p->x86State); - if (p->methodId == XZ_ID_Delta) - Delta_Init(p->deltaState); + p->base.X86_State = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL; + if (p->base.methodId == XZ_ID_Delta) + Delta_Init(p->base.delta_State); } -#define CASE_BRA_CONV(isa) case XZ_ID_ ## isa: size = isa ## _Convert(data, size, p->ip, p->encodeMode); break; - -static SizeT BraState_Filter(void *pp, Byte *data, SizeT size) +static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Dec[] = +{ + Z7_BRANCH_CONV_DEC_2 (BranchConv_PPC), + Z7_BRANCH_CONV_DEC_2 (BranchConv_IA64), + Z7_BRANCH_CONV_DEC_2 (BranchConv_ARM), + Z7_BRANCH_CONV_DEC_2 (BranchConv_ARMT), + Z7_BRANCH_CONV_DEC_2 (BranchConv_SPARC), + Z7_BRANCH_CONV_DEC_2 (BranchConv_ARM64), + Z7_BRANCH_CONV_DEC_2 (BranchConv_RISCV) +}; + +static SizeT XzBcFilterStateBase_Filter_Dec(CXzBcFilterStateBase *p, Byte *data, SizeT size) { - CBraState *p = ((CBraState *)pp); switch (p->methodId) { case XZ_ID_Delta: - if (p->encodeMode) - Delta_Encode(p->deltaState, p->delta, data, size); - else - Delta_Decode(p->deltaState, p->delta, data, size); + Delta_Decode(p->delta_State, p->delta, data, size); break; case XZ_ID_X86: - size = x86_Convert(data, size, p->ip, &p->x86State, p->encodeMode); + size = 
(SizeT)(z7_BranchConvSt_X86_Dec(data, size, p->ip, &p->X86_State) - data); + break; + default: + if (p->methodId >= XZ_ID_PPC) + { + const UInt32 i = p->methodId - XZ_ID_PPC; + if (i < Z7_ARRAY_SIZE(g_Funcs_BranchConv_RISC_Dec)) + size = (SizeT)(g_Funcs_BranchConv_RISC_Dec[i](data, size, p->ip) - data); + } break; - CASE_BRA_CONV(PPC) - CASE_BRA_CONV(IA64) - CASE_BRA_CONV(ARM) - CASE_BRA_CONV(ARMT) - CASE_BRA_CONV(SPARC) } p->ip += (UInt32)size; return size; } -static SRes BraState_Code2(void *pp, +static SizeT XzBcFilterState_Filter(void *pp, Byte *data, SizeT size) +{ + CXzBcFilterState *p = ((CXzBcFilterState *)pp); + return p->filter_func(&p->base, data, size); +} + + +static SRes XzBcFilterState_Code2(void *pp, Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, int srcWasFinished, ECoderFinishMode finishMode, // int *wasFinished ECoderStatus *status) { - CBraState *p = ((CBraState *)pp); + CXzBcFilterState *p = ((CXzBcFilterState *)pp); SizeT destRem = *destLen; SizeT srcRem = *srcLen; - UNUSED_VAR(finishMode); + UNUSED_VAR(finishMode) *destLen = 0; *srcLen = 0; // *wasFinished = False; *status = CODER_STATUS_NOT_FINISHED; - while (destRem > 0) + while (destRem != 0) { - if (p->bufPos != p->bufConv) { size_t size = p->bufConv - p->bufPos; - if (size > destRem) - size = destRem; - memcpy(dest, p->buf + p->bufPos, size); - p->bufPos += size; - *destLen += size; - dest += size; - destRem -= size; - continue; + if (size) + { + if (size > destRem) + size = destRem; + memcpy(dest, p->buf + p->bufPos, size); + p->bufPos += size; + *destLen += size; + dest += size; + destRem -= size; + continue; + } } p->bufTotal -= p->bufPos; @@ -220,7 +244,7 @@ static SRes BraState_Code2(void *pp, if (p->bufTotal == 0) break; - p->bufConv = BraState_Filter(pp, p->buf, p->bufTotal); + p->bufConv = p->filter_func(&p->base, p->buf, p->bufTotal); if (p->bufConv == 0) { @@ -240,26 +264,37 @@ static SRes BraState_Code2(void *pp, } -SRes BraState_SetFromMethod(IStateCoder *p, UInt64 id, int encodeMode, ISzAllocPtr alloc) +#define XZ_IS_SUPPORTED_FILTER_ID(id) \ + ((id) >= XZ_ID_Delta && (id) <= XZ_ID_RISCV) + +SRes Xz_StateCoder_Bc_SetFromMethod_Func(IStateCoder *p, UInt64 id, + Xz_Func_BcFilterStateBase_Filter func, ISzAllocPtr alloc) { - CBraState *decoder; - if (id < XZ_ID_Delta || id > XZ_ID_SPARC) + CXzBcFilterState *decoder; + if (!XZ_IS_SUPPORTED_FILTER_ID(id)) return SZ_ERROR_UNSUPPORTED; - decoder = (CBraState *)p->p; + decoder = (CXzBcFilterState *)p->p; if (!decoder) { - decoder = (CBraState *)ISzAlloc_Alloc(alloc, sizeof(CBraState)); + decoder = (CXzBcFilterState *)ISzAlloc_Alloc(alloc, sizeof(CXzBcFilterState)); if (!decoder) return SZ_ERROR_MEM; + decoder->buf = ISzAlloc_Alloc(alloc, BRA_BUF_SIZE); + if (!decoder->buf) + { + ISzAlloc_Free(alloc, decoder); + return SZ_ERROR_MEM; + } p->p = decoder; - p->Free = BraState_Free; - p->SetProps = BraState_SetProps; - p->Init = BraState_Init; - p->Code2 = BraState_Code2; - p->Filter = BraState_Filter; + p->Free = XzBcFilterState_Free; + p->SetProps = XzBcFilterState_SetProps; + p->Init = XzBcFilterState_Init; + p->Code2 = XzBcFilterState_Code2; + p->Filter = XzBcFilterState_Filter; + decoder->filter_func = func; } - decoder->methodId = (UInt32)id; - decoder->encodeMode = encodeMode; + decoder->base.methodId = (UInt32)id; + // decoder->encodeMode = encodeMode; return SZ_OK; } @@ -278,9 +313,9 @@ static void SbState_Free(void *pp, ISzAllocPtr alloc) static SRes SbState_SetProps(void *pp, const Byte *props, size_t propSize, ISzAllocPtr alloc) { - 
UNUSED_VAR(pp); - UNUSED_VAR(props); - UNUSED_VAR(alloc); + UNUSED_VAR(pp) + UNUSED_VAR(props) + UNUSED_VAR(alloc) return (propSize == 0) ? SZ_OK : SZ_ERROR_UNSUPPORTED; } @@ -296,7 +331,7 @@ static SRes SbState_Code2(void *pp, Byte *dest, SizeT *destLen, const Byte *src, { CSbDec *p = (CSbDec *)pp; SRes res; - UNUSED_VAR(srcWasFinished); + UNUSED_VAR(srcWasFinished) p->dest = dest; p->destLen = *destLen; p->src = src; @@ -388,7 +423,7 @@ static SRes Lzma2State_Code2(void *pp, Byte *dest, SizeT *destLen, const Byte *s ELzmaStatus status2; /* ELzmaFinishMode fm = (finishMode == LZMA_FINISH_ANY) ? LZMA_FINISH_ANY : LZMA_FINISH_END; */ SRes res; - UNUSED_VAR(srcWasFinished); + UNUSED_VAR(srcWasFinished) if (spec->outBufMode) { SizeT dicPos = spec->decoder.decoder.dicPos; @@ -419,7 +454,7 @@ static SRes Lzma2State_SetFromMethod(IStateCoder *p, Byte *outBuf, size_t outBuf p->Init = Lzma2State_Init; p->Code2 = Lzma2State_Code2; p->Filter = NULL; - Lzma2Dec_Construct(&spec->decoder); + Lzma2Dec_CONSTRUCT(&spec->decoder) } spec->outBufMode = False; if (outBuf) @@ -509,26 +544,24 @@ static SRes MixCoder_SetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 met { IStateCoder *sc = &p->coders[coderIndex]; p->ids[coderIndex] = methodId; - switch (methodId) - { - case XZ_ID_LZMA2: return Lzma2State_SetFromMethod(sc, outBuf, outBufSize, p->alloc); - #ifdef USE_SUBBLOCK - case XZ_ID_Subblock: return SbState_SetFromMethod(sc, p->alloc); - #endif - } + if (methodId == XZ_ID_LZMA2) + return Lzma2State_SetFromMethod(sc, outBuf, outBufSize, p->alloc); +#ifdef USE_SUBBLOCK + if (methodId == XZ_ID_Subblock) + return SbState_SetFromMethod(sc, p->alloc); +#endif if (coderIndex == 0) return SZ_ERROR_UNSUPPORTED; - return BraState_SetFromMethod(sc, methodId, 0, p->alloc); + return Xz_StateCoder_Bc_SetFromMethod_Func(sc, methodId, + XzBcFilterStateBase_Filter_Dec, p->alloc); } static SRes MixCoder_ResetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 methodId, Byte *outBuf, size_t outBufSize) { IStateCoder *sc = &p->coders[coderIndex]; - switch (methodId) - { - case XZ_ID_LZMA2: return Lzma2State_ResetOutBuf(sc, outBuf, outBufSize); - } + if (methodId == XZ_ID_LZMA2) + return Lzma2State_ResetOutBuf(sc, outBuf, outBufSize); return SZ_ERROR_UNSUPPORTED; } @@ -567,7 +600,7 @@ static SRes MixCoder_Code(CMixCoder *p, SizeT destLen2, srcLen2; int wasFinished; - PRF_STR("------- MixCoder Single ----------"); + PRF_STR("------- MixCoder Single ----------") srcLen2 = srcLenOrig; destLen2 = destLenOrig; @@ -614,14 +647,14 @@ static SRes MixCoder_Code(CMixCoder *p, processed = coder->Filter(coder->p, p->outBuf, processed); if (wasFinished || (destFinish && p->outWritten == destLenOrig)) processed = p->outWritten; - PRF_STR_INT("filter", i); + PRF_STR_INT("filter", i) } *destLen = processed; } return res; } - PRF_STR("standard mix"); + PRF_STR("standard mix") if (p->numCoders != 1) { @@ -763,21 +796,21 @@ SRes Xz_ParseHeader(CXzStreamFlags *p, const Byte *buf) static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf) { - return indexSize == (((UInt64)GetUi32(buf + 4) + 1) << 2) - && GetUi32(buf) == CrcCalc(buf + 4, 6) - && flags == GetBe16(buf + 8) - && buf[10] == XZ_FOOTER_SIG_0 - && buf[11] == XZ_FOOTER_SIG_1; + return indexSize == (((UInt64)GetUi32a(buf + 4) + 1) << 2) + && GetUi32a(buf) == CrcCalc(buf + 4, 6) + && flags == GetBe16a(buf + 8) + && GetUi16a(buf + 10) == (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8)); } #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ - { unsigned s = 
Xz_ReadVarInt(buf + pos, size - pos, res); \ - if (s == 0) return SZ_ERROR_ARCHIVE; pos += s; } + { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ + if (s == 0) return SZ_ERROR_ARCHIVE; \ + pos += s; } static BoolInt XzBlock_AreSupportedFilters(const CXzBlock *p) { - unsigned numFilters = XzBlock_GetNumFilters(p) - 1; + const unsigned numFilters = XzBlock_GetNumFilters(p) - 1; unsigned i; { const CXzFilter *f = &p->filters[numFilters]; @@ -793,8 +826,7 @@ static BoolInt XzBlock_AreSupportedFilters(const CXzBlock *p) if (f->propsSize != 1) return False; } - else if (f->id < XZ_ID_Delta - || f->id > XZ_ID_SPARC + else if (!XZ_IS_SUPPORTED_FILTER_ID(f->id) || (f->propsSize != 0 && f->propsSize != 4)) return False; } @@ -819,22 +851,24 @@ SRes XzBlock_Parse(CXzBlock *p, const Byte *header) p->packSize = (UInt64)(Int64)-1; if (XzBlock_HasPackSize(p)) { - READ_VARINT_AND_CHECK(header, pos, headerSize, &p->packSize); + READ_VARINT_AND_CHECK(header, pos, headerSize, &p->packSize) if (p->packSize == 0 || p->packSize + headerSize >= (UInt64)1 << 63) return SZ_ERROR_ARCHIVE; } p->unpackSize = (UInt64)(Int64)-1; if (XzBlock_HasUnpackSize(p)) - READ_VARINT_AND_CHECK(header, pos, headerSize, &p->unpackSize); + { + READ_VARINT_AND_CHECK(header, pos, headerSize, &p->unpackSize) + } numFilters = XzBlock_GetNumFilters(p); for (i = 0; i < numFilters; i++) { CXzFilter *filter = p->filters + i; UInt64 size; - READ_VARINT_AND_CHECK(header, pos, headerSize, &filter->id); - READ_VARINT_AND_CHECK(header, pos, headerSize, &size); + READ_VARINT_AND_CHECK(header, pos, headerSize, &filter->id) + READ_VARINT_AND_CHECK(header, pos, headerSize, &size) if (size > headerSize - pos || size > XZ_FILTER_PROPS_SIZE_MAX) return SZ_ERROR_ARCHIVE; filter->propsSize = (UInt32)size; @@ -892,20 +926,20 @@ static SRes XzDecMix_Init(CMixCoder *p, const CXzBlock *block, Byte *outBuf, siz MixCoder_Free(p); for (i = 0; i < numFilters; i++) { - RINOK(MixCoder_SetFromMethod(p, i, block->filters[numFilters - 1 - i].id, outBuf, outBufSize)); + RINOK(MixCoder_SetFromMethod(p, i, block->filters[numFilters - 1 - i].id, outBuf, outBufSize)) } p->numCoders = numFilters; } else { - RINOK(MixCoder_ResetFromMethod(p, 0, block->filters[numFilters - 1].id, outBuf, outBufSize)); + RINOK(MixCoder_ResetFromMethod(p, 0, block->filters[numFilters - 1].id, outBuf, outBufSize)) } for (i = 0; i < numFilters; i++) { const CXzFilter *f = &block->filters[numFilters - 1 - i]; IStateCoder *sc = &p->coders[i]; - RINOK(sc->SetProps(sc->p, f->props, f->propsSize, p->alloc)); + RINOK(sc->SetProps(sc->p, f->props, f->propsSize, p->alloc)) } MixCoder_Init(p); @@ -999,7 +1033,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, SRes res; ECoderFinishMode finishMode2 = finishMode; - BoolInt srcFinished2 = srcFinished; + BoolInt srcFinished2 = (BoolInt)srcFinished; BoolInt destFinish = False; if (p->block.packSize != (UInt64)(Int64)-1) @@ -1038,7 +1072,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, (p->outBuf ? NULL : dest), &destLen2, destFinish, src, &srcLen2, srcFinished2, finishMode2); - + *status = p->decoder.status; XzCheck_Update(&p->check, (p->outBuf ? 
p->outBuf + p->outDataWritten : dest), destLen2); if (!p->outBuf) @@ -1052,14 +1086,14 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, (*destLen) += destLen2; p->unpackSize += destLen2; - RINOK(res); + RINOK(res) if (*status != CODER_STATUS_FINISHED_WITH_MARK) { if (p->block.packSize == p->packSize && *status == CODER_STATUS_NEEDS_MORE_INPUT) { - PRF_STR("CODER_STATUS_NEEDS_MORE_INPUT"); + PRF_STR("CODER_STATUS_NEEDS_MORE_INPUT") *status = CODER_STATUS_NOT_SPECIFIED; return SZ_ERROR_DATA; } @@ -1076,7 +1110,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, if ((p->block.packSize != (UInt64)(Int64)-1 && p->block.packSize != p->packSize) || (p->block.unpackSize != (UInt64)(Int64)-1 && p->block.unpackSize != p->unpackSize)) { - PRF_STR("ERROR: block.size mismatch"); + PRF_STR("ERROR: block.size mismatch") return SZ_ERROR_DATA; } } @@ -1092,7 +1126,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, return SZ_OK; } - switch (p->state) + switch ((int)p->state) { case XZ_STATE_STREAM_HEADER: { @@ -1107,7 +1141,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } else { - RINOK(Xz_ParseHeader(&p->streamFlags, p->buf)); + RINOK(Xz_ParseHeader(&p->streamFlags, p->buf)) p->numStartedStreams++; p->indexSize = 0; p->numBlocks = 0; @@ -1131,21 +1165,21 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks); p->indexPos = p->indexPreSize; p->indexSize += p->indexPreSize; - Sha256_Final(&p->sha, p->shaDigest); + Sha256_Final(&p->sha, (Byte *)(void *)p->shaDigest32); Sha256_Init(&p->sha); p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize); p->state = XZ_STATE_STREAM_INDEX; break; } - p->blockHeaderSize = ((UInt32)p->buf[0] << 2) + 4; + p->blockHeaderSize = ((unsigned)p->buf[0] << 2) + 4; break; } if (p->pos != p->blockHeaderSize) { - UInt32 cur = p->blockHeaderSize - p->pos; + unsigned cur = p->blockHeaderSize - p->pos; if (cur > srcRem) - cur = (UInt32)srcRem; + cur = (unsigned)srcRem; memcpy(p->buf + p->pos, src, cur); p->pos += cur; (*srcLen) += cur; @@ -1153,7 +1187,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } else { - RINOK(XzBlock_Parse(&p->block, p->buf)); + RINOK(XzBlock_Parse(&p->block, p->buf)) if (!XzBlock_AreSupportedFilters(&p->block)) return SZ_ERROR_UNSUPPORTED; p->numTotalBlocks++; @@ -1166,7 +1200,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, p->headerParsedOk = True; return SZ_OK; } - RINOK(XzDecMix_Init(&p->decoder, &p->block, p->outBuf, p->outBufSize)); + RINOK(XzDecMix_Init(&p->decoder, &p->block, p->outBuf, p->outBufSize)) } break; } @@ -1187,8 +1221,8 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } else { - UInt32 checkSize = XzFlags_GetCheckSize(p->streamFlags); - UInt32 cur = checkSize - p->pos; + const unsigned checkSize = XzFlags_GetCheckSize(p->streamFlags); + unsigned cur = checkSize - p->pos; if (cur != 0) { if (srcRem == 0) @@ -1197,7 +1231,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, return SZ_OK; } if (cur > srcRem) - cur = (UInt32)srcRem; + cur = (unsigned)srcRem; memcpy(p->buf + p->pos, src, cur); p->pos += cur; (*srcLen) += cur; @@ -1206,10 +1240,10 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, break; } { - Byte digest[XZ_CHECK_SIZE_MAX]; + UInt32 digest32[XZ_CHECK_SIZE_MAX / 4]; p->state = XZ_STATE_BLOCK_HEADER; p->pos = 0; - if (XzCheck_Final(&p->check, digest) && memcmp(digest, p->buf, 
checkSize) != 0) + if (XzCheck_Final(&p->check, (void *)digest32) && memcmp(digest32, p->buf, checkSize) != 0) return SZ_ERROR_CRC; if (p->decodeOnlyOneBlock) { @@ -1254,12 +1288,12 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } else { - Byte digest[SHA256_DIGEST_SIZE]; + UInt32 digest32[SHA256_DIGEST_SIZE / 4]; p->state = XZ_STATE_STREAM_INDEX_CRC; p->indexSize += 4; p->pos = 0; - Sha256_Final(&p->sha, digest); - if (memcmp(digest, p->shaDigest, SHA256_DIGEST_SIZE) != 0) + Sha256_Final(&p->sha, (void *)digest32); + if (memcmp(digest32, p->shaDigest32, SHA256_DIGEST_SIZE) != 0) return SZ_ERROR_CRC; } } @@ -1275,9 +1309,10 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } else { + const Byte *ptr = p->buf; p->state = XZ_STATE_STREAM_FOOTER; p->pos = 0; - if (CRC_GET_DIGEST(p->crc) != GetUi32(p->buf)) + if (CRC_GET_DIGEST(p->crc) != GetUi32a(ptr)) return SZ_ERROR_CRC; } break; @@ -1285,9 +1320,9 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, case XZ_STATE_STREAM_FOOTER: { - UInt32 cur = XZ_STREAM_FOOTER_SIZE - p->pos; + unsigned cur = XZ_STREAM_FOOTER_SIZE - p->pos; if (cur > srcRem) - cur = (UInt32)srcRem; + cur = (unsigned)srcRem; memcpy(p->buf + p->pos, src, cur); p->pos += cur; (*srcLen) += cur; @@ -1307,7 +1342,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, { if (*src != 0) { - if (((UInt32)p->padSize & 3) != 0) + if ((unsigned)p->padSize & 3) return SZ_ERROR_NO_ARCHIVE; p->pos = 0; p->state = XZ_STATE_STREAM_HEADER; @@ -1322,6 +1357,8 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } case XZ_STATE_BLOCK: break; /* to disable GCC warning */ + + default: return SZ_ERROR_FAIL; } } /* @@ -1386,7 +1423,7 @@ UInt64 XzUnpacker_GetExtraSize(const CXzUnpacker *p) -#ifndef _7ZIP_ST +#ifndef Z7_ST #include "MtDec.h" #endif @@ -1397,7 +1434,7 @@ void XzDecMtProps_Init(CXzDecMtProps *p) p->outStep_ST = 1 << 20; p->ignoreErrors = False; - #ifndef _7ZIP_ST + #ifndef Z7_ST p->numThreads = 1; p->inBufSize_MT = 1 << 18; p->memUseMax = sizeof(size_t) << 28; @@ -1406,7 +1443,7 @@ void XzDecMtProps_Init(CXzDecMtProps *p) -#ifndef _7ZIP_ST +#ifndef Z7_ST /* ---------- CXzDecMtThread ---------- */ @@ -1445,7 +1482,7 @@ typedef struct /* ---------- CXzDecMt ---------- */ -typedef struct +struct CXzDecMt { CAlignOffsetAlloc alignOffsetAlloc; ISzAllocPtr allocMid; @@ -1453,10 +1490,9 @@ typedef struct CXzDecMtProps props; size_t unpackBlockMaxSize; - ISeqInStream *inStream; - ISeqOutStream *outStream; - ICompressProgress *progress; - // CXzStatInfo *stat; + ISeqInStreamPtr inStream; + ISeqOutStreamPtr outStream; + ICompressProgressPtr progress; BoolInt finishMode; BoolInt outSize_Defined; @@ -1479,7 +1515,7 @@ typedef struct ECoderStatus status; SRes codeRes; - #ifndef _7ZIP_ST + #ifndef Z7_ST BoolInt mainDecoderWasCalled; // int statErrorDefined; int finishedDecoderIndex; @@ -1492,8 +1528,9 @@ typedef struct UInt64 numBlocks; // UInt64 numBadBlocks; - SRes mainErrorCode; - + SRes mainErrorCode; // it's set to error code, if the size Code() output doesn't patch the size from Parsing stage + // it can be = SZ_ERROR_INPUT_EOF + // it can be = SZ_ERROR_DATA, in some another cases BoolInt isBlockHeaderState_Parse; BoolInt isBlockHeaderState_Write; UInt64 outProcessed_Parse; @@ -1501,10 +1538,9 @@ typedef struct BoolInt mtc_WasConstructed; CMtDec mtc; - CXzDecMtThread coders[MTDEC__THREADS_MAX]; + CXzDecMtThread coders[MTDEC_THREADS_MAX]; #endif - -} CXzDecMt; +}; @@ -1532,11 +1568,11 @@ CXzDecMtHandle 
XzDecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid) XzDecMtProps_Init(&p->props); - #ifndef _7ZIP_ST + #ifndef Z7_ST p->mtc_WasConstructed = False; { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CXzDecMtThread *coder = &p->coders[i]; coder->dec_created = False; @@ -1546,16 +1582,16 @@ CXzDecMtHandle XzDecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid) } #endif - return p; + return (CXzDecMtHandle)p; } -#ifndef _7ZIP_ST +#ifndef Z7_ST static void XzDecMt_FreeOutBufs(CXzDecMt *p) { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CXzDecMtThread *coder = &p->coders[i]; if (coder->outBuf) @@ -1592,13 +1628,15 @@ static void XzDecMt_FreeSt(CXzDecMt *p) } -void XzDecMt_Destroy(CXzDecMtHandle pp) +// #define GET_CXzDecMt_p CXzDecMt *p = pp; + +void XzDecMt_Destroy(CXzDecMtHandle p) { - CXzDecMt *p = (CXzDecMt *)pp; + // GET_CXzDecMt_p XzDecMt_FreeSt(p); - #ifndef _7ZIP_ST + #ifndef Z7_ST if (p->mtc_WasConstructed) { @@ -1607,7 +1645,7 @@ void XzDecMt_Destroy(CXzDecMtHandle pp) } { unsigned i; - for (i = 0; i < MTDEC__THREADS_MAX; i++) + for (i = 0; i < MTDEC_THREADS_MAX; i++) { CXzDecMtThread *t = &p->coders[i]; if (t->dec_created) @@ -1622,12 +1660,12 @@ void XzDecMt_Destroy(CXzDecMtHandle pp) #endif - ISzAlloc_Free(p->alignOffsetAlloc.baseAlloc, pp); + ISzAlloc_Free(p->alignOffsetAlloc.baseAlloc, p); } -#ifndef _7ZIP_ST +#ifndef Z7_ST static void XzDecMt_Callback_Parse(void *obj, unsigned coderIndex, CMtDecCallbackInfo *cc) { @@ -1693,7 +1731,7 @@ static void XzDecMt_Callback_Parse(void *obj, unsigned coderIndex, CMtDecCallbac coder->dec.parseMode = True; coder->dec.headerParsedOk = False; - PRF_STR_INT("Parse", srcSize2); + PRF_STR_INT("Parse", srcSize2) res = XzUnpacker_Code(&coder->dec, NULL, &destSize, @@ -1736,10 +1774,10 @@ static void XzDecMt_Callback_Parse(void *obj, unsigned coderIndex, CMtDecCallbac } } { - UInt64 packSize = block->packSize; - UInt64 packSizeAligned = packSize + ((0 - (unsigned)packSize) & 3); - UInt32 checkSize = XzFlags_GetCheckSize(coder->dec.streamFlags); - UInt64 blockPackSum = coder->inPreSize + packSizeAligned + checkSize; + const UInt64 packSize = block->packSize; + const UInt64 packSizeAligned = packSize + ((0 - (unsigned)packSize) & 3); + const unsigned checkSize = XzFlags_GetCheckSize(coder->dec.streamFlags); + const UInt64 blockPackSum = coder->inPreSize + packSizeAligned + checkSize; // if (blockPackSum <= me->props.inBlockMax) // unpackBlockMaxSize { @@ -1877,7 +1915,7 @@ static SRes XzDecMt_Callback_PreCode(void *pp, unsigned coderIndex) { // if (res == SZ_ERROR_MEM) return res; if (me->props.ignoreErrors && res != SZ_ERROR_MEM) - return S_OK; + return SZ_OK; return res; } } @@ -1898,15 +1936,18 @@ static SRes XzDecMt_Callback_Code(void *pp, unsigned coderIndex, *outCodePos = coder->outCodeSize; *stop = True; + if (srcSize > coder->inPreSize - coder->inCodeSize) + return SZ_ERROR_FAIL; + if (coder->inCodeSize < coder->inPreHeaderSize) { - UInt64 rem = coder->inPreHeaderSize - coder->inCodeSize; - size_t step = srcSize; - if (step > rem) - step = (size_t)rem; + size_t step = coder->inPreHeaderSize - coder->inCodeSize; + if (step > srcSize) + step = srcSize; src += step; srcSize -= step; coder->inCodeSize += step; + *inCodePos = coder->inCodeSize; if (coder->inCodeSize < coder->inPreHeaderSize) { *stop = False; @@ -1956,7 +1997,7 @@ static SRes XzDecMt_Callback_Code(void *pp, unsigned coderIndex, { *inCodePos = coder->inPreSize; *outCodePos = 
coder->outPreSize; - return S_OK; + return SZ_OK; } return coder->codeRes; } @@ -1966,7 +2007,7 @@ static SRes XzDecMt_Callback_Code(void *pp, unsigned coderIndex, static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, BoolInt needWriteToStream, - const Byte *src, size_t srcSize, + const Byte *src, size_t srcSize, BoolInt isCross, // int srcFinished, BoolInt *needContinue, BoolInt *canRecode) @@ -1985,7 +2026,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, if (!coder->dec.headerParsedOk || !coder->outBuf) { if (me->finishedDecoderIndex < 0) - me->finishedDecoderIndex = coderIndex; + me->finishedDecoderIndex = (int)coderIndex; return SZ_OK; } @@ -2065,7 +2106,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, } data += cur; size -= cur; - // PRF_STR_INT("Written size =", size); + // PRF_STR_INT("Written size =", size) if (size == 0) break; res = MtProgress_ProgressAdd(&me->mtc.mtProgress, 0, 0); @@ -2077,16 +2118,16 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, if (coder->codeRes != SZ_OK) if (!me->props.ignoreErrors) { - me->finishedDecoderIndex = coderIndex; + me->finishedDecoderIndex = (int)coderIndex; return res; } - RINOK(res); + RINOK(res) if (coder->inPreSize != coder->inCodeSize || coder->blockPackTotal != coder->inCodeSize) { - me->finishedDecoderIndex = coderIndex; + me->finishedDecoderIndex = (int)coderIndex; return SZ_OK; } @@ -2100,13 +2141,13 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, // (coder->state == MTDEC_PARSE_END) means that there are no other working threads // so we can use mtc variables without lock - PRF_STR_INT("Write MTDEC_PARSE_END", me->mtc.inProcessed); + PRF_STR_INT("Write MTDEC_PARSE_END", me->mtc.inProcessed) me->mtc.mtProgress.totalInSize = me->mtc.inProcessed; { CXzUnpacker *dec = &me->dec; - PRF_STR_INT("PostSingle", srcSize); + PRF_STR_INT("PostSingle", srcSize) { size_t srcProcessed = srcSize; @@ -2125,22 +2166,41 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, return SZ_OK; } + /* + We have processed all xz-blocks of stream, + And xz unpacker is at XZ_STATE_BLOCK_HEADER state, where + (src) is a pointer to xz-Index structure. + We finish reading of current xz-Stream, including Zero padding after xz-Stream. + We exit, if we reach extra byte (first byte of new-Stream or another data). + But we don't update input stream pointer for that new extra byte. + If extra byte is not correct first byte of xz-signature, + we have SZ_ERROR_NO_ARCHIVE error here. 
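A side note on the zero padding mentioned in this comment: xz rounds both block data and stream padding up to a 4-byte boundary, and the two arithmetic forms used in this diff (the (0 - packSize) & 3 expression in the MT parser and the XZ_GET_PAD_SIZE() macro in XzEnc.c) are equivalent. A tiny self-contained sketch, not part of the patch:

/* both forms return the number of zero bytes needed to reach 4-byte alignment,
   e.g. 13 -> 3 and 16 -> 0 */
static unsigned XzPad4_sketch(unsigned dataSize)
{
  const unsigned padA = (0u - dataSize) & 3;        /* form used in XzDec.c */
  const unsigned padB = (4 - (dataSize & 3)) & 3;   /* XZ_GET_PAD_SIZE() form */
  return (padA == padB) ? padA : (unsigned)-1;      /* always equal */
}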
+ */ + res = XzUnpacker_Code(dec, NULL, &outSizeCur, src, &srcProcessed, me->mtc.readWasFinished, // srcFinished CODER_FINISH_END, // CODER_FINISH_ANY, &status); + + // res = SZ_ERROR_ARCHIVE; // for failure test me->status = status; me->codeRes = res; + if (isCross) + me->mtc.crossStart += srcProcessed; + me->mtc.inProcessed += srcProcessed; me->mtc.mtProgress.totalInSize = me->mtc.inProcessed; + srcSize -= srcProcessed; + src += srcProcessed; + if (res != SZ_OK) { - return S_OK; + return SZ_OK; // return res; } @@ -2149,20 +2209,26 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, *needContinue = True; me->isBlockHeaderState_Parse = False; me->isBlockHeaderState_Write = False; + + if (!isCross) { Byte *crossBuf = MtDec_GetCrossBuff(&me->mtc); if (!crossBuf) return SZ_ERROR_MEM; - memcpy(crossBuf, src + srcProcessed, srcSize - srcProcessed); + if (srcSize != 0) + memcpy(crossBuf, src, srcSize); + me->mtc.crossStart = 0; + me->mtc.crossEnd = srcSize; } - me->mtc.crossStart = 0; - me->mtc.crossEnd = srcSize - srcProcessed; + + PRF_STR_INT("XZ_STATE_STREAM_HEADER crossEnd = ", (unsigned)me->mtc.crossEnd) + return SZ_OK; } - if (status != CODER_STATUS_NEEDS_MORE_INPUT) + if (status != CODER_STATUS_NEEDS_MORE_INPUT || srcSize != 0) { - return E_FAIL; + return SZ_ERROR_FAIL; } if (me->mtc.readWasFinished) @@ -2174,7 +2240,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, { size_t inPos; size_t inLim; - const Byte *inData; + // const Byte *inData; UInt64 inProgressPrev = me->mtc.inProcessed; // XzDecMt_Prepare_InBuf_ST(p); @@ -2184,9 +2250,8 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, inPos = 0; inLim = 0; - // outProcessed = 0; - inData = crossBuf; + // inData = crossBuf; for (;;) { @@ -2201,7 +2266,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, { inPos = 0; inLim = me->mtc.inBufSize; - me->mtc.readRes = ISeqInStream_Read(me->inStream, (void *)inData, &inLim); + me->mtc.readRes = ISeqInStream_Read(me->inStream, (void *)crossBuf, &inLim); me->mtc.readProcessed += inLim; if (inLim == 0 || me->mtc.readRes != SZ_OK) me->mtc.readWasFinished = True; @@ -2213,7 +2278,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, res = XzUnpacker_Code(dec, NULL, &outProcessed, - inData + inPos, &inProcessed, + crossBuf + inPos, &inProcessed, (inProcessed == 0), // srcFinished CODER_FINISH_END, &status); @@ -2225,7 +2290,7 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, if (res != SZ_OK) { - return S_OK; + return SZ_OK; // return res; } @@ -2240,14 +2305,14 @@ static SRes XzDecMt_Callback_Write(void *pp, unsigned coderIndex, } if (status != CODER_STATUS_NEEDS_MORE_INPUT) - return E_FAIL; + return SZ_ERROR_FAIL; if (me->mtc.progress) { UInt64 inDelta = me->mtc.inProcessed - inProgressPrev; if (inDelta >= (1 << 22)) { - RINOK(MtProgress_Progress_ST(&me->mtc.mtProgress)); + RINOK(MtProgress_Progress_ST(&me->mtc.mtProgress)) inProgressPrev = me->mtc.inProcessed; } } @@ -2276,13 +2341,6 @@ void XzStatInfo_Clear(CXzStatInfo *p) p->NumStreams_Defined = False; p->NumBlocks_Defined = False; - // p->IsArc = False; - // p->UnexpectedEnd = False; - // p->Unsupported = False; - // p->HeadersError = False; - // p->DataError = False; - // p->CrcError = False; - p->DataAfterEnd = False; p->DecodingTruncated = False; @@ -2296,9 +2354,19 @@ void XzStatInfo_Clear(CXzStatInfo *p) +/* + XzDecMt_Decode_ST() can return SZ_OK or the following errors + - SZ_ERROR_MEM for memory allocation error + - error from 
XzUnpacker_Code() function + - SZ_ERROR_WRITE for ISeqOutStream::Write(). stat->CombinedRes_Type = SZ_ERROR_WRITE in that case + - ICompressProgress::Progress() error, stat->CombinedRes_Type = SZ_ERROR_PROGRESS. + But XzDecMt_Decode_ST() doesn't return ISeqInStream::Read() errors. + ISeqInStream::Read() result is set to p->readRes. + also it can set stat->CombinedRes_Type to SZ_ERROR_WRITE or SZ_ERROR_PROGRESS. +*/ static SRes XzDecMt_Decode_ST(CXzDecMt *p - #ifndef _7ZIP_ST + #ifndef Z7_ST , BoolInt tMode #endif , CXzStatInfo *stat) @@ -2310,11 +2378,11 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p CXzUnpacker *dec; - #ifndef _7ZIP_ST + #ifndef Z7_ST if (tMode) { XzDecMt_FreeOutBufs(p); - tMode = MtDec_PrepareRead(&p->mtc); + tMode = (BoolInt)MtDec_PrepareRead(&p->mtc); } #endif @@ -2367,7 +2435,7 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p if (inPos == inLim) { - #ifndef _7ZIP_ST + #ifndef Z7_ST if (tMode) { inData = MtDec_Read(&p->mtc, &inLim); @@ -2384,7 +2452,7 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p inPos = 0; inLim = p->inBufSize; inData = p->inBuf; - p->readRes = ISeqInStream_Read(p->inStream, (void *)inData, &inLim); + p->readRes = ISeqInStream_Read(p->inStream, (void *)p->inBuf, &inLim); p->readProcessed += inLim; if (inLim == 0 || p->readRes != SZ_OK) p->readWasFinished = True; @@ -2426,8 +2494,8 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p if (finished || outProcessed >= outSize) if (outPos != 0) { - size_t written = ISeqOutStream_Write(p->outStream, p->outBuf, outPos); - p->outProcessed += written; + const size_t written = ISeqOutStream_Write(p->outStream, p->outBuf, outPos); + // p->outProcessed += written; // 21.01: BUG fixed if (written != outPos) { stat->CombinedRes_Type = SZ_ERROR_WRITE; @@ -2438,9 +2506,8 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p if (p->progress && res == SZ_OK) { - UInt64 inDelta = p->inProcessed - inPrev; - UInt64 outDelta = p->outProcessed - outPrev; - if (inDelta >= (1 << 22) || outDelta >= (1 << 22)) + if (p->inProcessed - inPrev >= (1 << 22) || + p->outProcessed - outPrev >= (1 << 22)) { res = ICompressProgress_Progress(p->progress, p->inProcessed, p->outProcessed); if (res != SZ_OK) @@ -2455,14 +2522,31 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p } if (finished) - return res; + { + // p->codeRes is preliminary error from XzUnpacker_Code. + // and it can be corrected later as final result + // so we return SZ_OK here instead of (res); + return SZ_OK; + // return res; + } } } -static SRes XzStatInfo_SetStat(const CXzUnpacker *dec, + + +/* +XzStatInfo_SetStat() transforms + CXzUnpacker return code and status to combined CXzStatInfo results. 
+ it can convert SZ_OK to SZ_ERROR_INPUT_EOF + it can convert SZ_ERROR_NO_ARCHIVE to SZ_OK and (DataAfterEnd = 1) +*/ + +static void XzStatInfo_SetStat(const CXzUnpacker *dec, int finishMode, - UInt64 readProcessed, UInt64 inProcessed, - SRes res, ECoderStatus status, + // UInt64 readProcessed, + UInt64 inProcessed, + SRes res, // it's result from CXzUnpacker unpacker + ECoderStatus status, BoolInt decodingTruncated, CXzStatInfo *stat) { @@ -2484,12 +2568,20 @@ static SRes XzStatInfo_SetStat(const CXzUnpacker *dec, if (status == CODER_STATUS_NEEDS_MORE_INPUT) { // CODER_STATUS_NEEDS_MORE_INPUT is expected status for correct xz streams + // any extra data is part of correct data extraSize = 0; + // if xz stream was not finished, then we need more data if (!XzUnpacker_IsStreamWasFinished(dec)) res = SZ_ERROR_INPUT_EOF; } - else if (!decodingTruncated || finishMode) // (status == CODER_STATUS_NOT_FINISHED) - res = SZ_ERROR_DATA; + else + { + // CODER_STATUS_FINISHED_WITH_MARK is not possible for multi stream xz decoding + // so he we have (status == CODER_STATUS_NOT_FINISHED) + // if (status != CODER_STATUS_FINISHED_WITH_MARK) + if (!decodingTruncated || finishMode) + res = SZ_ERROR_DATA; + } } else if (res == SZ_ERROR_NO_ARCHIVE) { @@ -2497,37 +2589,42 @@ static SRes XzStatInfo_SetStat(const CXzUnpacker *dec, SZ_ERROR_NO_ARCHIVE is possible for 2 states: XZ_STATE_STREAM_HEADER - if bad signature or bad CRC XZ_STATE_STREAM_PADDING - if non-zero padding data - extraSize / inProcessed don't include "bad" byte + extraSize and inProcessed don't include "bad" byte */ - if (inProcessed != extraSize) // if good streams before error - if (extraSize != 0 || readProcessed != inProcessed) + // if (inProcessed == extraSize), there was no any good xz stream header, and we keep error + if (inProcessed != extraSize) // if there were good xz streams before error + { + // if (extraSize != 0 || readProcessed != inProcessed) { + // he we suppose that all xz streams were finsihed OK, and we have + // some extra data after all streams stat->DataAfterEnd = True; - // there is some good xz stream before. 
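To make the result-combination rules described here concrete, a caller-side interpretation might look like the following sketch (XzStat_Describe_sketch is not an SDK function; the mapping simply follows the field comments on CXzStatInfo above):

static const char *XzStat_Describe_sketch(const CXzStatInfo *stat)
{
  if (stat->CombinedRes != SZ_OK)
  {
    if (stat->CombinedRes_Type == SZ_ERROR_READ)     return "input read error";
    if (stat->CombinedRes_Type == SZ_ERROR_WRITE)    return "output write error";
    if (stat->CombinedRes_Type == SZ_ERROR_PROGRESS) return "aborted by progress callback";
    return "xz decoding error";
  }
  if (stat->DataAfterEnd)
    return "ok, but unexpected data follows the last xz stream";
  return "ok";
}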
So we set SZ_OK res = SZ_OK; } + } } - stat->DecodeRes = res; + if (stat->DecodeRes == SZ_OK) + stat->DecodeRes = res; stat->InSize -= extraSize; - return res; } -SRes XzDecMt_Decode(CXzDecMtHandle pp, + +SRes XzDecMt_Decode(CXzDecMtHandle p, const CXzDecMtProps *props, const UInt64 *outDataSize, int finishMode, - ISeqOutStream *outStream, + ISeqOutStreamPtr outStream, // Byte *outBuf, size_t *outBufSize, - ISeqInStream *inStream, + ISeqInStreamPtr inStream, // const Byte *inData, size_t inDataSize, CXzStatInfo *stat, int *isMT, - ICompressProgress *progress) + ICompressProgressPtr progress) { - CXzDecMt *p = (CXzDecMt *)pp; - #ifndef _7ZIP_ST + // GET_CXzDecMt_p + #ifndef Z7_ST BoolInt tMode; #endif @@ -2548,7 +2645,7 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, p->outSize = *outDataSize; } - p->finishMode = finishMode; + p->finishMode = (BoolInt)finishMode; // p->outSize = 457; p->outSize_Defined = True; p->finishMode = False; // for test @@ -2557,8 +2654,9 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, p->inProcessed = 0; p->readProcessed = 0; p->readWasFinished = False; + p->readRes = SZ_OK; - p->codeRes = 0; + p->codeRes = SZ_OK; p->status = CODER_STATUS_NOT_SPECIFIED; XzUnpacker_Init(&p->dec); @@ -2577,7 +2675,7 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, */ - #ifndef _7ZIP_ST + #ifndef Z7_ST p->isBlockHeaderState_Parse = False; p->isBlockHeaderState_Write = False; @@ -2589,8 +2687,9 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, if (p->props.numThreads > 1) { - IMtDecCallback vt; - + IMtDecCallback2 vt; + BoolInt needContinue; + SRes res; // we just free ST buffers here // but we still keep state variables, that was set in XzUnpacker_Init() XzDecMt_FreeSt(p); @@ -2628,45 +2727,45 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, vt.Code = XzDecMt_Callback_Code; vt.Write = XzDecMt_Callback_Write; - { - BoolInt needContinue; - - SRes res = MtDec_Code(&p->mtc); - - stat->InSize = p->mtc.inProcessed; - p->inProcessed = p->mtc.inProcessed; - p->readRes = p->mtc.readRes; - p->readWasFinished = p->mtc.readWasFinished; - p->readProcessed = p->mtc.readProcessed; + res = MtDec_Code(&p->mtc); - tMode = True; - needContinue = False; - if (res == SZ_OK) + stat->InSize = p->mtc.inProcessed; + + p->inProcessed = p->mtc.inProcessed; + p->readRes = p->mtc.readRes; + p->readWasFinished = p->mtc.readWasFinished; + p->readProcessed = p->mtc.readProcessed; + + tMode = True; + needContinue = False; + + if (res == SZ_OK) + { + if (p->mtc.mtProgress.res != SZ_OK) { - if (p->mtc.mtProgress.res != SZ_OK) - { - res = p->mtc.mtProgress.res; - stat->ProgressRes = res; - stat->CombinedRes_Type = SZ_ERROR_PROGRESS; - } - else - needContinue = p->mtc.needContinue; + res = p->mtc.mtProgress.res; + stat->ProgressRes = res; + stat->CombinedRes_Type = SZ_ERROR_PROGRESS; } - - if (!needContinue) + else + needContinue = p->mtc.needContinue; + } + + if (!needContinue) + { { SRes codeRes; BoolInt truncated = False; ECoderStatus status; - CXzUnpacker *dec; + const CXzUnpacker *dec; stat->OutSize = p->outProcessed; if (p->finishedDecoderIndex >= 0) { - CXzDecMtThread *coder = &p->coders[(unsigned)p->finishedDecoderIndex]; + const CXzDecMtThread *coder = &p->coders[(unsigned)p->finishedDecoderIndex]; codeRes = coder->codeRes; dec = &coder->dec; status = coder->status; @@ -2679,41 +2778,46 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, truncated = p->parsing_Truncated; } else - return E_FAIL; + return SZ_ERROR_FAIL; + + if (p->mainErrorCode != SZ_OK) + stat->DecodeRes = p->mainErrorCode; XzStatInfo_SetStat(dec, p->finishMode, - p->mtc.readProcessed, 
p->mtc.inProcessed, + // p->mtc.readProcessed, + p->mtc.inProcessed, codeRes, status, truncated, stat); + } + + if (res == SZ_OK) + { + stat->ReadRes = p->mtc.readRes; - if (res == SZ_OK) + if (p->writeRes != SZ_OK) { - if (p->writeRes != SZ_OK) - { - res = p->writeRes; - stat->CombinedRes_Type = SZ_ERROR_WRITE; - } - else if (p->mtc.readRes != SZ_OK && p->mtc.inProcessed == p->mtc.readProcessed) - { - res = p->mtc.readRes; - stat->ReadRes = res; - stat->CombinedRes_Type = SZ_ERROR_READ; - } - else if (p->mainErrorCode != SZ_OK) - { - res = p->mainErrorCode; - } + res = p->writeRes; + stat->CombinedRes_Type = SZ_ERROR_WRITE; } - - stat->CombinedRes = res; - if (stat->CombinedRes_Type == SZ_OK) - stat->CombinedRes_Type = res; - return res; + else if (p->mtc.readRes != SZ_OK + // && p->mtc.inProcessed == p->mtc.readProcessed + && stat->DecodeRes == SZ_ERROR_INPUT_EOF) + { + res = p->mtc.readRes; + stat->CombinedRes_Type = SZ_ERROR_READ; + } + else if (stat->DecodeRes != SZ_OK) + res = stat->DecodeRes; } - - PRF_STR("----- decoding ST -----"); + + stat->CombinedRes = res; + if (stat->CombinedRes_Type == SZ_OK) + stat->CombinedRes_Type = res; + return res; } + + PRF_STR("----- decoding ST -----") } #endif @@ -2723,39 +2827,41 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, { SRes res = XzDecMt_Decode_ST(p - #ifndef _7ZIP_ST + #ifndef Z7_ST , tMode #endif , stat ); + #ifndef Z7_ST + // we must set error code from MT decoding at first + if (p->mainErrorCode != SZ_OK) + stat->DecodeRes = p->mainErrorCode; + #endif + XzStatInfo_SetStat(&p->dec, p->finishMode, - p->readProcessed, p->inProcessed, + // p->readProcessed, + p->inProcessed, p->codeRes, p->status, False, // truncated stat); + stat->ReadRes = p->readRes; + if (res == SZ_OK) { - /* - if (p->writeRes != SZ_OK) - { - res = p->writeRes; - stat->CombinedRes_Type = SZ_ERROR_WRITE; - } - else - */ - if (p->readRes != SZ_OK && p->inProcessed == p->readProcessed) + if (p->readRes != SZ_OK + // && p->inProcessed == p->readProcessed + && stat->DecodeRes == SZ_ERROR_INPUT_EOF) { + // we set read error as combined error, only if that error was the reason + // of decoding problem res = p->readRes; - stat->ReadRes = res; stat->CombinedRes_Type = SZ_ERROR_READ; } - #ifndef _7ZIP_ST - else if (p->mainErrorCode != SZ_OK) - res = p->mainErrorCode; - #endif + else if (stat->DecodeRes != SZ_OK) + res = stat->DecodeRes; } stat->CombinedRes = res; @@ -2764,3 +2870,7 @@ SRes XzDecMt_Decode(CXzDecMtHandle pp, return res; } } + +#undef PRF +#undef PRF_STR +#undef PRF_STR_INT_2 diff --git a/src/sdk/C/XzEnc.c b/src/sdk/C/XzEnc.c index d0a8b44..e40f0c8 100644 --- a/src/sdk/C/XzEnc.c +++ b/src/sdk/C/XzEnc.c @@ -1,5 +1,5 @@ /* XzEnc.c -- Xz Encode -2019-02-02 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -18,42 +18,43 @@ #include "XzEnc.h" -// #define _7ZIP_ST +// #define Z7_ST -#ifndef _7ZIP_ST +#ifndef Z7_ST #include "MtCoder.h" #else -#define MTCODER__THREADS_MAX 1 -#define MTCODER__BLOCKS_MAX 1 +#define MTCODER_THREADS_MAX 1 +#define MTCODER_BLOCKS_MAX 1 #endif #define XZ_GET_PAD_SIZE(dataSize) ((4 - ((unsigned)(dataSize) & 3)) & 3) -/* max pack size for LZMA2 block + check-64bytrs: */ -#define XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize) ((unpackSize) + ((unpackSize) >> 10) + 16 + 64) +#define XZ_CHECK_SIZE_MAX 64 +/* max pack size for LZMA2 block + pad4 + check_size: */ +#define XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize) ((unpackSize) + ((unpackSize) >> 10) + 16 + XZ_CHECK_SIZE_MAX) #define 
XZ_GET_ESTIMATED_BLOCK_TOTAL_PACK_SIZE(unpackSize) (XZ_BLOCK_HEADER_SIZE_MAX + XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize)) -#define XzBlock_ClearFlags(p) (p)->flags = 0; -#define XzBlock_SetNumFilters(p, n) (p)->flags |= ((n) - 1); +// #define XzBlock_ClearFlags(p) (p)->flags = 0; +#define XzBlock_ClearFlags_SetNumFilters(p, n) (p)->flags = (Byte)((n) - 1); #define XzBlock_SetHasPackSize(p) (p)->flags |= XZ_BF_PACK_SIZE; #define XzBlock_SetHasUnpackSize(p) (p)->flags |= XZ_BF_UNPACK_SIZE; -static SRes WriteBytes(ISeqOutStream *s, const void *buf, size_t size) +static SRes WriteBytes(ISeqOutStreamPtr s, const void *buf, size_t size) { return (ISeqOutStream_Write(s, buf, size) == size) ? SZ_OK : SZ_ERROR_WRITE; } -static SRes WriteBytesUpdateCrc(ISeqOutStream *s, const void *buf, size_t size, UInt32 *crc) +static SRes WriteBytes_UpdateCrc(ISeqOutStreamPtr s, const void *buf, size_t size, UInt32 *crc) { *crc = CrcUpdate(*crc, buf, size); return WriteBytes(s, buf, size); } -static SRes Xz_WriteHeader(CXzStreamFlags f, ISeqOutStream *s) +static SRes Xz_WriteHeader(CXzStreamFlags f, ISeqOutStreamPtr s) { UInt32 crc; Byte header[XZ_STREAM_HEADER_SIZE]; @@ -61,12 +62,12 @@ static SRes Xz_WriteHeader(CXzStreamFlags f, ISeqOutStream *s) header[XZ_SIG_SIZE] = (Byte)(f >> 8); header[XZ_SIG_SIZE + 1] = (Byte)(f & 0xFF); crc = CrcCalc(header + XZ_SIG_SIZE, XZ_STREAM_FLAGS_SIZE); - SetUi32(header + XZ_SIG_SIZE + XZ_STREAM_FLAGS_SIZE, crc); + SetUi32(header + XZ_SIG_SIZE + XZ_STREAM_FLAGS_SIZE, crc) return WriteBytes(s, header, XZ_STREAM_HEADER_SIZE); } -static SRes XzBlock_WriteHeader(const CXzBlock *p, ISeqOutStream *s) +static SRes XzBlock_WriteHeader(const CXzBlock *p, ISeqOutStreamPtr s) { Byte header[XZ_BLOCK_HEADER_SIZE_MAX]; @@ -91,7 +92,7 @@ static SRes XzBlock_WriteHeader(const CXzBlock *p, ISeqOutStream *s) header[pos++] = 0; header[0] = (Byte)(pos >> 2); - SetUi32(header + pos, CrcCalc(header, pos)); + SetUi32(header + pos, CrcCalc(header, pos)) return WriteBytes(s, header, pos + 4); } @@ -182,7 +183,7 @@ static SRes XzEncIndex_AddIndexRecord(CXzEncIndex *p, UInt64 unpackSize, UInt64 size_t newSize = p->allocated * 2 + 16 * 2; if (newSize < p->size + pos) return SZ_ERROR_MEM; - RINOK(XzEncIndex_ReAlloc(p, newSize, alloc)); + RINOK(XzEncIndex_ReAlloc(p, newSize, alloc)) } memcpy(p->blocks + p->size, buf, pos); p->size += pos; @@ -191,7 +192,7 @@ static SRes XzEncIndex_AddIndexRecord(CXzEncIndex *p, UInt64 unpackSize, UInt64 } -static SRes XzEncIndex_WriteFooter(const CXzEncIndex *p, CXzStreamFlags flags, ISeqOutStream *s) +static SRes XzEncIndex_WriteFooter(const CXzEncIndex *p, CXzStreamFlags flags, ISeqOutStreamPtr s) { Byte buf[32]; UInt64 globalPos; @@ -200,8 +201,8 @@ static SRes XzEncIndex_WriteFooter(const CXzEncIndex *p, CXzStreamFlags flags, I globalPos = pos; buf[0] = 0; - RINOK(WriteBytesUpdateCrc(s, buf, pos, &crc)); - RINOK(WriteBytesUpdateCrc(s, p->blocks, p->size, &crc)); + RINOK(WriteBytes_UpdateCrc(s, buf, pos, &crc)) + RINOK(WriteBytes_UpdateCrc(s, p->blocks, p->size, &crc)) globalPos += p->size; pos = XZ_GET_PAD_SIZE(globalPos); @@ -211,12 +212,12 @@ static SRes XzEncIndex_WriteFooter(const CXzEncIndex *p, CXzStreamFlags flags, I globalPos += pos; crc = CrcUpdate(crc, buf + 4 - pos, pos); - SetUi32(buf + 4, CRC_GET_DIGEST(crc)); + SetUi32(buf + 4, CRC_GET_DIGEST(crc)) - SetUi32(buf + 8 + 4, (UInt32)(globalPos >> 2)); + SetUi32(buf + 8 + 4, (UInt32)(globalPos >> 2)) buf[8 + 8] = (Byte)(flags >> 8); buf[8 + 9] = (Byte)(flags & 0xFF); - SetUi32(buf + 8, CrcCalc(buf + 8 + 4, 6)); + 
SetUi32(buf + 8, CrcCalc(buf + 8 + 4, 6)) buf[8 + 10] = XZ_FOOTER_SIG_0; buf[8 + 11] = XZ_FOOTER_SIG_1; @@ -230,7 +231,7 @@ static SRes XzEncIndex_WriteFooter(const CXzEncIndex *p, CXzStreamFlags flags, I typedef struct { ISeqInStream vt; - ISeqInStream *realStream; + ISeqInStreamPtr realStream; const Byte *data; UInt64 limit; UInt64 processed; @@ -251,9 +252,9 @@ static void SeqCheckInStream_GetDigest(CSeqCheckInStream *p, Byte *digest) XzCheck_Final(&p->check, digest); } -static SRes SeqCheckInStream_Read(const ISeqInStream *pp, void *data, size_t *size) +static SRes SeqCheckInStream_Read(ISeqInStreamPtr pp, void *data, size_t *size) { - CSeqCheckInStream *p = CONTAINER_FROM_VTBL(pp, CSeqCheckInStream, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CSeqCheckInStream) size_t size2 = *size; SRes res = SZ_OK; @@ -285,15 +286,15 @@ static SRes SeqCheckInStream_Read(const ISeqInStream *pp, void *data, size_t *si typedef struct { ISeqOutStream vt; - ISeqOutStream *realStream; + ISeqOutStreamPtr realStream; Byte *outBuf; size_t outBufLimit; UInt64 processed; } CSeqSizeOutStream; -static size_t SeqSizeOutStream_Write(const ISeqOutStream *pp, const void *data, size_t size) +static size_t SeqSizeOutStream_Write(ISeqOutStreamPtr pp, const void *data, size_t size) { - CSeqSizeOutStream *p = CONTAINER_FROM_VTBL(pp, CSeqSizeOutStream, vt); + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CSeqSizeOutStream) if (p->realStream) size = ISeqOutStream_Write(p->realStream, data, size); else @@ -313,8 +314,8 @@ static size_t SeqSizeOutStream_Write(const ISeqOutStream *pp, const void *data, typedef struct { - ISeqInStream p; - ISeqInStream *realStream; + ISeqInStream vt; + ISeqInStreamPtr realStream; IStateCoder StateCoder; Byte *buf; size_t curPos; @@ -323,7 +324,40 @@ typedef struct } CSeqInFilter; -SRes BraState_SetFromMethod(IStateCoder *p, UInt64 id, int encodeMode, ISzAllocPtr alloc); +static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Enc[] = +{ + Z7_BRANCH_CONV_ENC_2 (BranchConv_PPC), + Z7_BRANCH_CONV_ENC_2 (BranchConv_IA64), + Z7_BRANCH_CONV_ENC_2 (BranchConv_ARM), + Z7_BRANCH_CONV_ENC_2 (BranchConv_ARMT), + Z7_BRANCH_CONV_ENC_2 (BranchConv_SPARC), + Z7_BRANCH_CONV_ENC_2 (BranchConv_ARM64), + Z7_BRANCH_CONV_ENC_2 (BranchConv_RISCV) +}; + +static SizeT XzBcFilterStateBase_Filter_Enc(CXzBcFilterStateBase *p, Byte *data, SizeT size) +{ + switch (p->methodId) + { + case XZ_ID_Delta: + Delta_Encode(p->delta_State, p->delta, data, size); + break; + case XZ_ID_X86: + size = (SizeT)(z7_BranchConvSt_X86_Enc(data, size, p->ip, &p->X86_State) - data); + break; + default: + if (p->methodId >= XZ_ID_PPC) + { + const UInt32 i = p->methodId - XZ_ID_PPC; + if (i < Z7_ARRAY_SIZE(g_Funcs_BranchConv_RISC_Enc)) + size = (SizeT)(g_Funcs_BranchConv_RISC_Enc[i](data, size, p->ip) - data); + } + break; + } + p->ip += (UInt32)size; + return size; +} + static SRes SeqInFilter_Init(CSeqInFilter *p, const CXzFilter *props, ISzAllocPtr alloc) { @@ -335,17 +369,17 @@ static SRes SeqInFilter_Init(CSeqInFilter *p, const CXzFilter *props, ISzAllocPt } p->curPos = p->endPos = 0; p->srcWasFinished = 0; - RINOK(BraState_SetFromMethod(&p->StateCoder, props->id, 1, alloc)); - RINOK(p->StateCoder.SetProps(p->StateCoder.p, props->props, props->propsSize, alloc)); + RINOK(Xz_StateCoder_Bc_SetFromMethod_Func(&p->StateCoder, props->id, XzBcFilterStateBase_Filter_Enc, alloc)) + RINOK(p->StateCoder.SetProps(p->StateCoder.p, props->props, props->propsSize, alloc)) p->StateCoder.Init(p->StateCoder.p); return SZ_OK; } -static SRes 
SeqInFilter_Read(const ISeqInStream *pp, void *data, size_t *size) +static SRes SeqInFilter_Read(ISeqInStreamPtr pp, void *data, size_t *size) { - CSeqInFilter *p = CONTAINER_FROM_VTBL(pp, CSeqInFilter, p); - size_t sizeOriginal = *size; + Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CSeqInFilter) + const size_t sizeOriginal = *size; if (sizeOriginal == 0) return SZ_OK; *size = 0; @@ -356,7 +390,7 @@ static SRes SeqInFilter_Read(const ISeqInStream *pp, void *data, size_t *size) { p->curPos = 0; p->endPos = FILTER_BUF_SIZE; - RINOK(ISeqInStream_Read(p->realStream, p->buf, &p->endPos)); + RINOK(ISeqInStream_Read(p->realStream, p->buf, &p->endPos)) if (p->endPos == 0) p->srcWasFinished = 1; } @@ -377,13 +411,15 @@ static SRes SeqInFilter_Read(const ISeqInStream *pp, void *data, size_t *size) } } +Z7_FORCE_INLINE static void SeqInFilter_Construct(CSeqInFilter *p) { p->buf = NULL; p->StateCoder.p = NULL; - p->p.Read = SeqInFilter_Read; + p->vt.Read = SeqInFilter_Read; } +Z7_FORCE_INLINE static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc) { if (p->StateCoder.p) @@ -406,13 +442,13 @@ static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc) typedef struct { ISeqInStream vt; - ISeqInStream *inStream; + ISeqInStreamPtr inStream; CSbEnc enc; } CSbEncInStream; -static SRes SbEncInStream_Read(const ISeqInStream *pp, void *data, size_t *size) +static SRes SbEncInStream_Read(ISeqInStreamPtr pp, void *data, size_t *size) { - CSbEncInStream *p = CONTAINER_FROM_VTBL(pp, CSbEncInStream, vt); + CSbEncInStream *p = Z7_CONTAINER_FROM_VTBL(pp, CSbEncInStream, vt); size_t sizeOriginal = *size; if (sizeOriginal == 0) return SZ_OK; @@ -422,7 +458,7 @@ static SRes SbEncInStream_Read(const ISeqInStream *pp, void *data, size_t *size) if (p->enc.needRead && !p->enc.readWasFinished) { size_t processed = p->enc.needReadSizeMax; - RINOK(p->inStream->Read(p->inStream, p->enc.buf + p->enc.readPos, &processed)); + RINOK(p->inStream->Read(p->inStream, p->enc.buf + p->enc.readPos, &processed)) p->enc.readPos += processed; if (processed == 0) { @@ -433,7 +469,7 @@ static SRes SbEncInStream_Read(const ISeqInStream *pp, void *data, size_t *size) } *size = sizeOriginal; - RINOK(SbEnc_Read(&p->enc, data, size)); + RINOK(SbEnc_Read(&p->enc, data, size)) if (*size != 0 || !p->enc.needRead) return SZ_OK; } @@ -473,7 +509,8 @@ void XzFilterProps_Init(CXzFilterProps *p) void XzProps_Init(CXzProps *p) { p->checkId = XZ_CHECK_CRC32; - p->blockSize = XZ_PROPS__BLOCK_SIZE__AUTO; + p->numThreadGroups = 0; + p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO; p->numBlockThreads_Reduced = -1; p->numBlockThreads_Max = -1; p->numTotalThreads = -1; @@ -502,8 +539,8 @@ static void XzEncProps_Normalize_Fixed(CXzProps *p) t2 = p->numBlockThreads_Max; t3 = p->numTotalThreads; - if (t2 > MTCODER__THREADS_MAX) - t2 = MTCODER__THREADS_MAX; + if (t2 > MTCODER_THREADS_MAX) + t2 = MTCODER_THREADS_MAX; if (t3 <= 0) { @@ -519,8 +556,8 @@ static void XzEncProps_Normalize_Fixed(CXzProps *p) t1 = 1; t2 = t3; } - if (t2 > MTCODER__THREADS_MAX) - t2 = MTCODER__THREADS_MAX; + if (t2 > MTCODER_THREADS_MAX) + t2 = MTCODER_THREADS_MAX; } else if (t1 <= 0) { @@ -552,7 +589,7 @@ static void XzEncProps_Normalize_Fixed(CXzProps *p) numBlocks++; if (numBlocks < (unsigned)t2) { - t2r = (unsigned)numBlocks; + t2r = (int)numBlocks; if (t2r == 0) t2r = 1; t3 = t1 * t2r; @@ -571,7 +608,7 @@ static void XzProps_Normalize(CXzProps *p) /* we normalize xzProps properties, but we normalize only some of CXzProps::lzma2Props properties. 
Lzma2Enc_SetProps() will normalize lzma2Props later. */ - if (p->blockSize == XZ_PROPS__BLOCK_SIZE__SOLID) + if (p->blockSize == XZ_PROPS_BLOCK_SIZE_SOLID) { p->lzma2Props.lzmaProps.reduceSize = p->reduceSize; p->numBlockThreads_Reduced = 1; @@ -583,15 +620,15 @@ static void XzProps_Normalize(CXzProps *p) else { CLzma2EncProps *lzma2 = &p->lzma2Props; - if (p->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO) + if (p->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO) { // xz-auto p->lzma2Props.lzmaProps.reduceSize = p->reduceSize; - if (lzma2->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID) + if (lzma2->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID) { // if (xz-auto && lzma2-solid) - we use solid for both - p->blockSize = XZ_PROPS__BLOCK_SIZE__SOLID; + p->blockSize = XZ_PROPS_BLOCK_SIZE_SOLID; p->numBlockThreads_Reduced = 1; p->numBlockThreads_Max = 1; if (p->lzma2Props.numTotalThreads <= 0) @@ -610,9 +647,9 @@ static void XzProps_Normalize(CXzProps *p) p->blockSize = tp.blockSize; // fixed or solid p->numBlockThreads_Reduced = tp.numBlockThreads_Reduced; p->numBlockThreads_Max = tp.numBlockThreads_Max; - if (lzma2->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO) - lzma2->blockSize = tp.blockSize; // fixed or solid, LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID - if (lzma2->lzmaProps.reduceSize > tp.blockSize && tp.blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID) + if (lzma2->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO) + lzma2->blockSize = tp.blockSize; // fixed or solid, LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID + if (lzma2->lzmaProps.reduceSize > tp.blockSize && tp.blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID) lzma2->lzmaProps.reduceSize = tp.blockSize; lzma2->numBlockThreads_Reduced = 1; lzma2->numBlockThreads_Max = 1; @@ -631,9 +668,9 @@ static void XzProps_Normalize(CXzProps *p) r = p->blockSize; lzma2->lzmaProps.reduceSize = r; } - if (lzma2->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO) - lzma2->blockSize = LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID; - else if (lzma2->blockSize > p->blockSize && lzma2->blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID) + if (lzma2->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO) + lzma2->blockSize = LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID; + else if (lzma2->blockSize > p->blockSize && lzma2->blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID) lzma2->blockSize = p->blockSize; XzEncProps_Normalize_Fixed(p); @@ -655,6 +692,7 @@ typedef struct } CLzma2WithFilters; +Z7_FORCE_INLINE static void Lzma2WithFilters_Construct(CLzma2WithFilters *p) { p->lzma2 = NULL; @@ -678,6 +716,7 @@ static SRes Lzma2WithFilters_Create(CLzma2WithFilters *p, ISzAllocPtr alloc, ISz } +Z7_FORCE_INLINE static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc) { #ifdef USE_SUBBLOCK @@ -704,17 +743,17 @@ typedef struct static SRes Xz_CompressBlock( CLzma2WithFilters *lzmaf, - ISeqOutStream *outStream, + ISeqOutStreamPtr outStream, Byte *outBufHeader, Byte *outBufData, size_t outBufDataLimit, - ISeqInStream *inStream, + ISeqInStreamPtr inStream, // UInt64 expectedSize, const Byte *inBuf, // used if (!inStream) size_t inBufSize, // used if (!inStream), it's block size, props->blockSize is ignored const CXzProps *props, - ICompressProgress *progress, + ICompressProgressPtr progress, int *inStreamFinished, /* only for inStream version */ CXzEncBlockInfo *blockSizes, ISzAllocPtr alloc, @@ -731,12 +770,12 @@ static SRes Xz_CompressBlock( *inStreamFinished = False; - RINOK(Lzma2WithFilters_Create(lzmaf, alloc, allocBig)); + RINOK(Lzma2WithFilters_Create(lzmaf, alloc, allocBig)) - 
RINOK(Lzma2Enc_SetProps(lzmaf->lzma2, &props->lzma2Props)); + RINOK(Lzma2Enc_SetProps(lzmaf->lzma2, &props->lzma2Props)) - XzBlock_ClearFlags(&block); - XzBlock_SetNumFilters(&block, 1 + (fp ? 1 : 0)); + // XzBlock_ClearFlags(&block) + XzBlock_ClearFlags_SetNumFilters(&block, 1 + (fp ? 1 : 0)) if (fp) { @@ -751,7 +790,8 @@ static SRes Xz_CompressBlock( } else if (fp->ipDefined) { - SetUi32(filter->props, fp->ip); + Byte *ptr = filter->props; + SetUi32(ptr, fp->ip) filter->propsSize = 4; } } @@ -776,13 +816,13 @@ static SRes Xz_CompressBlock( if (props->blockSize != (UInt64)(Int64)-1) if (expectedSize > props->blockSize) block.unpackSize = props->blockSize; - XzBlock_SetHasUnpackSize(&block); + XzBlock_SetHasUnpackSize(&block) } */ if (outStream) { - RINOK(XzBlock_WriteHeader(&block, &seqSizeOutStream.vt)); + RINOK(XzBlock_WriteHeader(&block, &seqSizeOutStream.vt)) } checkInStream.vt.Read = SeqCheckInStream_Read; @@ -800,13 +840,13 @@ static SRes Xz_CompressBlock( if (fp->id == XZ_ID_Subblock) { lzmaf->sb.inStream = &checkInStream.vt; - RINOK(SbEncInStream_Init(&lzmaf->sb)); + RINOK(SbEncInStream_Init(&lzmaf->sb)) } else #endif { lzmaf->filter.realStream = &checkInStream.vt; - RINOK(SeqInFilter_Init(&lzmaf->filter, filter, alloc)); + RINOK(SeqInFilter_Init(&lzmaf->filter, filter, alloc)) } } @@ -840,7 +880,7 @@ static SRes Xz_CompressBlock( #ifdef USE_SUBBLOCK (fp->id == XZ_ID_Subblock) ? &lzmaf->sb.vt: #endif - &lzmaf->filter.p) : + &lzmaf->filter.vt) : &checkInStream.vt) : NULL, useStream ? NULL : inBuf, @@ -851,13 +891,13 @@ static SRes Xz_CompressBlock( if (outBuf) seqSizeOutStream.processed += outSize; - RINOK(res); + RINOK(res) blockSizes->unpackSize = checkInStream.processed; } { - Byte buf[4 + 64]; - unsigned padSize = XZ_GET_PAD_SIZE(seqSizeOutStream.processed); - UInt64 packSize = seqSizeOutStream.processed; + Byte buf[4 + XZ_CHECK_SIZE_MAX]; + const unsigned padSize = XZ_GET_PAD_SIZE(seqSizeOutStream.processed); + const UInt64 packSize = seqSizeOutStream.processed; buf[0] = 0; buf[1] = 0; @@ -865,7 +905,8 @@ static SRes Xz_CompressBlock( buf[3] = 0; SeqCheckInStream_GetDigest(&checkInStream, buf + 4); - RINOK(WriteBytes(&seqSizeOutStream.vt, buf + (4 - padSize), padSize + XzFlags_GetCheckSize((CXzStreamFlags)props->checkId))); + RINOK(WriteBytes(&seqSizeOutStream.vt, buf + (4 - padSize), + padSize + XzFlags_GetCheckSize((CXzStreamFlags)props->checkId))) blockSizes->totalSize = seqSizeOutStream.processed - padSize; @@ -876,12 +917,12 @@ static SRes Xz_CompressBlock( seqSizeOutStream.processed = 0; block.unpackSize = blockSizes->unpackSize; - XzBlock_SetHasUnpackSize(&block); + XzBlock_SetHasUnpackSize(&block) block.packSize = packSize; - XzBlock_SetHasPackSize(&block); + XzBlock_SetHasPackSize(&block) - RINOK(XzBlock_WriteHeader(&block, &seqSizeOutStream.vt)); + RINOK(XzBlock_WriteHeader(&block, &seqSizeOutStream.vt)) blockSizes->headerSize = (size_t)seqSizeOutStream.processed; blockSizes->totalSize += seqSizeOutStream.processed; @@ -905,15 +946,15 @@ static SRes Xz_CompressBlock( typedef struct { ICompressProgress vt; - ICompressProgress *progress; + ICompressProgressPtr progress; UInt64 inOffset; UInt64 outOffset; } CCompressProgress_XzEncOffset; -static SRes CompressProgress_XzEncOffset_Progress(const ICompressProgress *pp, UInt64 inSize, UInt64 outSize) +static SRes CompressProgress_XzEncOffset_Progress(ICompressProgressPtr pp, UInt64 inSize, UInt64 outSize) { - const CCompressProgress_XzEncOffset *p = CONTAINER_FROM_VTBL(pp, CCompressProgress_XzEncOffset, vt); + const 
CCompressProgress_XzEncOffset *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CCompressProgress_XzEncOffset, vt); inSize += p->inOffset; outSize += p->outOffset; return ICompressProgress_Progress(p->progress, inSize, outSize); @@ -922,7 +963,7 @@ static SRes CompressProgress_XzEncOffset_Progress(const ICompressProgress *pp, U -typedef struct +struct CXzEnc { ISzAllocPtr alloc; ISzAllocPtr allocBig; @@ -932,20 +973,19 @@ typedef struct CXzEncIndex xzIndex; - CLzma2WithFilters lzmaf_Items[MTCODER__THREADS_MAX]; + CLzma2WithFilters lzmaf_Items[MTCODER_THREADS_MAX]; size_t outBufSize; /* size of allocated outBufs[i] */ - Byte *outBufs[MTCODER__BLOCKS_MAX]; + Byte *outBufs[MTCODER_BLOCKS_MAX]; - #ifndef _7ZIP_ST + #ifndef Z7_ST unsigned checkType; - ISeqOutStream *outStream; + ISeqOutStreamPtr outStream; BoolInt mtCoder_WasConstructed; CMtCoder mtCoder; - CXzEncBlockInfo EncBlocks[MTCODER__BLOCKS_MAX]; + CXzEncBlockInfo EncBlocks[MTCODER_BLOCKS_MAX]; #endif - -} CXzEnc; +}; static void XzEnc_Construct(CXzEnc *p) @@ -954,13 +994,13 @@ static void XzEnc_Construct(CXzEnc *p) XzEncIndex_Construct(&p->xzIndex); - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) Lzma2WithFilters_Construct(&p->lzmaf_Items[i]); - #ifndef _7ZIP_ST + #ifndef Z7_ST p->mtCoder_WasConstructed = False; { - for (i = 0; i < MTCODER__BLOCKS_MAX; i++) + for (i = 0; i < MTCODER_BLOCKS_MAX; i++) p->outBufs[i] = NULL; p->outBufSize = 0; } @@ -971,7 +1011,7 @@ static void XzEnc_Construct(CXzEnc *p) static void XzEnc_FreeOutBufs(CXzEnc *p) { unsigned i; - for (i = 0; i < MTCODER__BLOCKS_MAX; i++) + for (i = 0; i < MTCODER_BLOCKS_MAX; i++) if (p->outBufs[i]) { ISzAlloc_Free(p->alloc, p->outBufs[i]); @@ -987,10 +1027,10 @@ static void XzEnc_Free(CXzEnc *p, ISzAllocPtr alloc) XzEncIndex_Free(&p->xzIndex, alloc); - for (i = 0; i < MTCODER__THREADS_MAX; i++) + for (i = 0; i < MTCODER_THREADS_MAX; i++) Lzma2WithFilters_Free(&p->lzmaf_Items[i], alloc); - #ifndef _7ZIP_ST + #ifndef Z7_ST if (p->mtCoder_WasConstructed) { MtCoder_Destruct(&p->mtCoder); @@ -1012,37 +1052,38 @@ CXzEncHandle XzEnc_Create(ISzAllocPtr alloc, ISzAllocPtr allocBig) p->expectedDataSize = (UInt64)(Int64)-1; p->alloc = alloc; p->allocBig = allocBig; - return p; + return (CXzEncHandle)p; } +// #define GET_CXzEnc_p CXzEnc *p = (CXzEnc *)(void *)pp; -void XzEnc_Destroy(CXzEncHandle pp) +void XzEnc_Destroy(CXzEncHandle p) { - CXzEnc *p = (CXzEnc *)pp; + // GET_CXzEnc_p XzEnc_Free(p, p->alloc); ISzAlloc_Free(p->alloc, p); } -SRes XzEnc_SetProps(CXzEncHandle pp, const CXzProps *props) +SRes XzEnc_SetProps(CXzEncHandle p, const CXzProps *props) { - CXzEnc *p = (CXzEnc *)pp; + // GET_CXzEnc_p p->xzProps = *props; XzProps_Normalize(&p->xzProps); return SZ_OK; } -void XzEnc_SetDataSize(CXzEncHandle pp, UInt64 expectedDataSiize) +void XzEnc_SetDataSize(CXzEncHandle p, UInt64 expectedDataSiize) { - CXzEnc *p = (CXzEnc *)pp; + // GET_CXzEnc_p p->expectedDataSize = expectedDataSiize; } -#ifndef _7ZIP_ST +#ifndef Z7_ST static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBufIndex, const Byte *src, size_t srcSize, int finished) @@ -1050,18 +1091,19 @@ static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBuf CXzEnc *me = (CXzEnc *)pp; SRes res; CMtProgressThunk progressThunk; - - Byte *dest = me->outBufs[outBufIndex]; - + Byte *dest; UNUSED_VAR(finished) - { CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; bInfo->totalSize = 0; bInfo->unpackSize = 0; bInfo->headerSize = 0; + // v23.02: we don't compress 
empty blocks + // also we must ignore that empty block in XzEnc_MtCallback_Write() + if (srcSize == 0) + return SZ_OK; } - + dest = me->outBufs[outBufIndex]; if (!dest) { dest = (Byte *)ISzAlloc_Alloc(me->alloc, me->outBufSize); @@ -1072,7 +1114,7 @@ static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBuf MtProgressThunk_CreateVTable(&progressThunk); progressThunk.mtProgress = &me->mtCoder.mtProgress; - MtProgressThunk_Init(&progressThunk); + MtProgressThunk_INIT(&progressThunk) { CXzEncBlockInfo blockSizes; @@ -1107,27 +1149,29 @@ static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBuf static SRes XzEnc_MtCallback_Write(void *pp, unsigned outBufIndex) { CXzEnc *me = (CXzEnc *)pp; - const CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; - const Byte *data = me->outBufs[outBufIndex]; - - RINOK(WriteBytes(me->outStream, data, bInfo->headerSize)); - + // v23.02: we don't write empty blocks + // note: if (bInfo->unpackSize == 0) then there is no compressed data of block + if (bInfo->unpackSize == 0) + return SZ_OK; { - UInt64 totalPackFull = bInfo->totalSize + XZ_GET_PAD_SIZE(bInfo->totalSize); - RINOK(WriteBytes(me->outStream, data + XZ_BLOCK_HEADER_SIZE_MAX, (size_t)totalPackFull - bInfo->headerSize)); + const Byte *data = me->outBufs[outBufIndex]; + RINOK(WriteBytes(me->outStream, data, bInfo->headerSize)) + { + const UInt64 totalPackFull = bInfo->totalSize + XZ_GET_PAD_SIZE(bInfo->totalSize); + RINOK(WriteBytes(me->outStream, data + XZ_BLOCK_HEADER_SIZE_MAX, (size_t)totalPackFull - bInfo->headerSize)) + } + return XzEncIndex_AddIndexRecord(&me->xzIndex, bInfo->unpackSize, bInfo->totalSize, me->alloc); } - - return XzEncIndex_AddIndexRecord(&me->xzIndex, bInfo->unpackSize, bInfo->totalSize, me->alloc); } #endif -SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, ICompressProgress *progress) +SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, ICompressProgressPtr progress) { - CXzEnc *p = (CXzEnc *)pp; + // GET_CXzEnc_p const CXzProps *props = &p->xzProps; @@ -1136,7 +1180,7 @@ SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr UInt64 numBlocks = 1; UInt64 blockSize = props->blockSize; - if (blockSize != XZ_PROPS__BLOCK_SIZE__SOLID + if (blockSize != XZ_PROPS_BLOCK_SIZE_SOLID && props->reduceSize != (UInt64)(Int64)-1) { numBlocks = props->reduceSize / blockSize; @@ -1146,13 +1190,13 @@ SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr else blockSize = (UInt64)1 << 62; - RINOK(XzEncIndex_PreAlloc(&p->xzIndex, numBlocks, blockSize, XZ_GET_ESTIMATED_BLOCK_TOTAL_PACK_SIZE(blockSize), p->alloc)); + RINOK(XzEncIndex_PreAlloc(&p->xzIndex, numBlocks, blockSize, XZ_GET_ESTIMATED_BLOCK_TOTAL_PACK_SIZE(blockSize), p->alloc)) } - RINOK(Xz_WriteHeader((CXzStreamFlags)props->checkId, outStream)); + RINOK(Xz_WriteHeader((CXzStreamFlags)props->checkId, outStream)) - #ifndef _7ZIP_ST + #ifndef Z7_ST if (props->numBlockThreads_Reduced > 1) { IMtCoderCallback2 vt; @@ -1179,8 +1223,8 @@ SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr p->mtCoder.mtCallback = &vt; p->mtCoder.mtCallbackObject = p; - if ( props->blockSize == XZ_PROPS__BLOCK_SIZE__SOLID - || props->blockSize == XZ_PROPS__BLOCK_SIZE__AUTO) + if ( props->blockSize == XZ_PROPS_BLOCK_SIZE_SOLID + || props->blockSize == XZ_PROPS_BLOCK_SIZE_AUTO) return SZ_ERROR_FAIL; p->mtCoder.blockSize = (size_t)props->blockSize; @@ -1196,10 +1240,11 @@ 
SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr p->outBufSize = destBlockSize; } - p->mtCoder.numThreadsMax = props->numBlockThreads_Max; + p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max; + p->mtCoder.numThreadGroups = props->numThreadGroups; p->mtCoder.expectedDataSize = p->expectedDataSize; - RINOK(MtCoder_Code(&p->mtCoder)); + RINOK(MtCoder_Code(&p->mtCoder)) } else #endif @@ -1216,7 +1261,7 @@ SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr writeStartSizes = 0; - if (props->blockSize != XZ_PROPS__BLOCK_SIZE__SOLID) + if (props->blockSize != XZ_PROPS_BLOCK_SIZE_SOLID) { writeStartSizes = (props->forceWriteSizesInHeader > 0); @@ -1273,18 +1318,18 @@ SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr &inStreamFinished, &blockSizes, p->alloc, - p->allocBig)); + p->allocBig)) { UInt64 totalPackFull = blockSizes.totalSize + XZ_GET_PAD_SIZE(blockSizes.totalSize); if (writeStartSizes) { - RINOK(WriteBytes(outStream, p->outBufs[0], blockSizes.headerSize)); - RINOK(WriteBytes(outStream, bufData, (size_t)totalPackFull - blockSizes.headerSize)); + RINOK(WriteBytes(outStream, p->outBufs[0], blockSizes.headerSize)) + RINOK(WriteBytes(outStream, bufData, (size_t)totalPackFull - blockSizes.headerSize)) } - RINOK(XzEncIndex_AddIndexRecord(&p->xzIndex, blockSizes.unpackSize, blockSizes.totalSize, p->alloc)); + RINOK(XzEncIndex_AddIndexRecord(&p->xzIndex, blockSizes.unpackSize, blockSizes.totalSize, p->alloc)) progress2.inOffset += blockSizes.unpackSize; progress2.outOffset += totalPackFull; @@ -1301,8 +1346,8 @@ SRes XzEnc_Encode(CXzEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStr #include "Alloc.h" -SRes Xz_Encode(ISeqOutStream *outStream, ISeqInStream *inStream, - const CXzProps *props, ICompressProgress *progress) +SRes Xz_Encode(ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, + const CXzProps *props, ICompressProgressPtr progress) { SRes res; CXzEncHandle xz = XzEnc_Create(&g_Alloc, &g_BigAlloc); @@ -1316,7 +1361,7 @@ SRes Xz_Encode(ISeqOutStream *outStream, ISeqInStream *inStream, } -SRes Xz_EncodeEmpty(ISeqOutStream *outStream) +SRes Xz_EncodeEmpty(ISeqOutStreamPtr outStream) { SRes res; CXzEncIndex xzIndex; diff --git a/src/sdk/C/XzEnc.h b/src/sdk/C/XzEnc.h index 0c29e7e..ac6bbf7 100644 --- a/src/sdk/C/XzEnc.h +++ b/src/sdk/C/XzEnc.h @@ -1,8 +1,8 @@ /* XzEnc.h -- Xz Encode -2017-06-27 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ -#ifndef __XZ_ENC_H -#define __XZ_ENC_H +#ifndef ZIP7_INC_XZ_ENC_H +#define ZIP7_INC_XZ_ENC_H #include "Lzma2Enc.h" @@ -11,8 +11,8 @@ EXTERN_C_BEGIN -#define XZ_PROPS__BLOCK_SIZE__AUTO LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO -#define XZ_PROPS__BLOCK_SIZE__SOLID LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID +#define XZ_PROPS_BLOCK_SIZE_AUTO LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO +#define XZ_PROPS_BLOCK_SIZE_SOLID LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID typedef struct @@ -31,6 +31,7 @@ typedef struct CLzma2EncProps lzma2Props; CXzFilterProps filterProps; unsigned checkId; + unsigned numThreadGroups; // 0 : no groups UInt64 blockSize; int numBlockThreads_Reduced; int numBlockThreads_Max; @@ -41,19 +42,20 @@ typedef struct void XzProps_Init(CXzProps *p); - -typedef void * CXzEncHandle; +typedef struct CXzEnc CXzEnc; +typedef CXzEnc * CXzEncHandle; +// Z7_DECLARE_HANDLE(CXzEncHandle) CXzEncHandle XzEnc_Create(ISzAllocPtr alloc, ISzAllocPtr allocBig); void XzEnc_Destroy(CXzEncHandle p); SRes XzEnc_SetProps(CXzEncHandle p, const CXzProps *props); void 
XzEnc_SetDataSize(CXzEncHandle p, UInt64 expectedDataSiize); -SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStream *outStream, ISeqInStream *inStream, ICompressProgress *progress); +SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, ICompressProgressPtr progress); -SRes Xz_Encode(ISeqOutStream *outStream, ISeqInStream *inStream, - const CXzProps *props, ICompressProgress *progress); +SRes Xz_Encode(ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream, + const CXzProps *props, ICompressProgressPtr progress); -SRes Xz_EncodeEmpty(ISeqOutStream *outStream); +SRes Xz_EncodeEmpty(ISeqOutStreamPtr outStream); EXTERN_C_END diff --git a/src/sdk/C/XzIn.c b/src/sdk/C/XzIn.c index ff48e2d..ba31636 100644 --- a/src/sdk/C/XzIn.c +++ b/src/sdk/C/XzIn.c @@ -1,40 +1,44 @@ /* XzIn.c - Xz input -2018-07-04 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include #include "7zCrc.h" -#include "CpuArch.h" #include "Xz.h" +#include "CpuArch.h" -/* -#define XZ_FOOTER_SIG_CHECK(p) (memcmp((p), XZ_FOOTER_SIG, XZ_FOOTER_SIG_SIZE) == 0) -*/ -#define XZ_FOOTER_SIG_CHECK(p) ((p)[0] == XZ_FOOTER_SIG_0 && (p)[1] == XZ_FOOTER_SIG_1) - +#define XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(p) \ + (GetUi16a((const Byte *)(const void *)(p) + 10) == \ + (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8))) -SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStream *inStream) +SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream) { - Byte sig[XZ_STREAM_HEADER_SIZE]; - RINOK(SeqInStream_Read2(inStream, sig, XZ_STREAM_HEADER_SIZE, SZ_ERROR_NO_ARCHIVE)); - if (memcmp(sig, XZ_SIG, XZ_SIG_SIZE) != 0) + UInt32 data32[XZ_STREAM_HEADER_SIZE / 4]; + size_t processedSize = XZ_STREAM_HEADER_SIZE; + RINOK(SeqInStream_ReadMax(inStream, data32, &processedSize)) + if (processedSize != XZ_STREAM_HEADER_SIZE + || memcmp(data32, XZ_SIG, XZ_SIG_SIZE) != 0) return SZ_ERROR_NO_ARCHIVE; - return Xz_ParseHeader(p, sig); + return Xz_ParseHeader(p, (const Byte *)(const void *)data32); } -#define READ_VARINT_AND_CHECK(buf, pos, size, res) \ - { unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ - if (s == 0) return SZ_ERROR_ARCHIVE; pos += s; } +#define READ_VARINT_AND_CHECK(buf, size, res) \ +{ const unsigned s = Xz_ReadVarInt(buf, size, res); \ + if (s == 0) return SZ_ERROR_ARCHIVE; \ + size -= s; \ + buf += s; \ +} -SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStream *inStream, BoolInt *isIndex, UInt32 *headerSizeRes) +SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes) { + MY_ALIGN(4) Byte header[XZ_BLOCK_HEADER_SIZE_MAX]; unsigned headerSize; *headerSizeRes = 0; - RINOK(SeqInStream_ReadByte(inStream, &header[0])); - headerSize = (unsigned)header[0]; + RINOK(SeqInStream_ReadByte(inStream, &header[0])) + headerSize = header[0]; if (headerSize == 0) { *headerSizeRes = 1; @@ -44,20 +48,31 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStream *inStream, BoolInt *isIndex, U *isIndex = False; headerSize = (headerSize << 2) + 4; - *headerSizeRes = headerSize; - RINOK(SeqInStream_Read(inStream, header + 1, headerSize - 1)); + *headerSizeRes = (UInt32)headerSize; + { + size_t processedSize = headerSize - 1; + RINOK(SeqInStream_ReadMax(inStream, header + 1, &processedSize)) + if (processedSize != headerSize - 1) + return SZ_ERROR_INPUT_EOF; + } return XzBlock_Parse(p, header); } + #define ADD_SIZE_CHECK(size, val) \ - { UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } +{ const UInt64 newSize = size + 
(val); \ + if (newSize < size) return XZ_SIZE_OVERFLOW; \ + size = newSize; \ +} UInt64 Xz_GetUnpackSize(const CXzStream *p) { UInt64 size = 0; size_t i; for (i = 0; i < p->numBlocks; i++) - ADD_SIZE_CHECK(size, p->blocks[i].unpackSize); + { + ADD_SIZE_CHECK(size, p->blocks[i].unpackSize) + } return size; } @@ -66,171 +81,204 @@ UInt64 Xz_GetPackSize(const CXzStream *p) UInt64 size = 0; size_t i; for (i = 0; i < p->numBlocks; i++) - ADD_SIZE_CHECK(size, (p->blocks[i].totalSize + 3) & ~(UInt64)3); + { + ADD_SIZE_CHECK(size, (p->blocks[i].totalSize + 3) & ~(UInt64)3) + } return size; } -/* -SRes XzBlock_ReadFooter(CXzBlock *p, CXzStreamFlags f, ISeqInStream *inStream) -{ - return SeqInStream_Read(inStream, p->check, XzFlags_GetCheckSize(f)); -} -*/ -static SRes Xz_ReadIndex2(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc) +// input; +// CXzStream (p) is empty object. +// size != 0 +// (size & 3) == 0 +// (buf) is aligned for at least 4 bytes. +// output: +// p->numBlocks is number of allocated items in p->blocks +// p->blocks[*] values must be ignored, if function returns error. +static SRes Xz_ParseIndex(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc) { - size_t numBlocks, pos = 1; - UInt32 crc; - + size_t numBlocks; if (size < 5 || buf[0] != 0) return SZ_ERROR_ARCHIVE; - size -= 4; - crc = CrcCalc(buf, size); - if (crc != GetUi32(buf + size)) - return SZ_ERROR_ARCHIVE; - + { + const UInt32 crc = CrcCalc(buf, size); + if (crc != GetUi32a(buf + size)) + return SZ_ERROR_ARCHIVE; + } + buf++; + size--; { UInt64 numBlocks64; - READ_VARINT_AND_CHECK(buf, pos, size, &numBlocks64); - numBlocks = (size_t)numBlocks64; - if (numBlocks != numBlocks64 || numBlocks * 2 > size) + READ_VARINT_AND_CHECK(buf, size, &numBlocks64) + // (numBlocks64) is 63-bit value, so we can calculate (numBlocks64 * 2): + if (numBlocks64 * 2 > size) return SZ_ERROR_ARCHIVE; + if (numBlocks64 >= ((size_t)1 << (sizeof(size_t) * 8 - 1)) / sizeof(CXzBlockSizes)) + return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE + numBlocks = (size_t)numBlocks64; } - - Xz_Free(p, alloc); - if (numBlocks != 0) + // Xz_Free(p, alloc); // it's optional, because (p) is empty already + if (numBlocks) { - size_t i; - p->numBlocks = numBlocks; - p->blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks); - if (!p->blocks) + CXzBlockSizes *blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks); + if (!blocks) return SZ_ERROR_MEM; - for (i = 0; i < numBlocks; i++) + p->blocks = blocks; + p->numBlocks = numBlocks; + // the caller will call Xz_Free() in case of error + do { - CXzBlockSizes *block = &p->blocks[i]; - READ_VARINT_AND_CHECK(buf, pos, size, &block->totalSize); - READ_VARINT_AND_CHECK(buf, pos, size, &block->unpackSize); - if (block->totalSize == 0) + READ_VARINT_AND_CHECK(buf, size, &blocks->totalSize) + READ_VARINT_AND_CHECK(buf, size, &blocks->unpackSize) + if (blocks->totalSize == 0) return SZ_ERROR_ARCHIVE; + blocks++; } + while (--numBlocks); } - while ((pos & 3) != 0) - if (buf[pos++] != 0) + if (size >= 4) + return SZ_ERROR_ARCHIVE; + while (size) + if (buf[--size]) return SZ_ERROR_ARCHIVE; - return (pos == size) ? 
SZ_OK : SZ_ERROR_ARCHIVE; + return SZ_OK; } -static SRes Xz_ReadIndex(CXzStream *p, ILookInStream *stream, UInt64 indexSize, ISzAllocPtr alloc) + +/* +static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc) { SRes res; size_t size; Byte *buf; - if (indexSize > ((UInt32)1 << 31)) - return SZ_ERROR_UNSUPPORTED; + if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1))) + return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE size = (size_t)indexSize; - if (size != indexSize) - return SZ_ERROR_UNSUPPORTED; buf = (Byte *)ISzAlloc_Alloc(alloc, size); if (!buf) return SZ_ERROR_MEM; res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); if (res == SZ_OK) - res = Xz_ReadIndex2(p, buf, size, alloc); + res = Xz_ParseIndex(p, buf, size, alloc); ISzAlloc_Free(alloc, buf); return res; } +*/ -static SRes LookInStream_SeekRead_ForArc(ILookInStream *stream, UInt64 offset, void *buf, size_t size) +static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size) { - RINOK(LookInStream_SeekTo(stream, offset)); + RINOK(LookInStream_SeekTo(stream, offset)) return LookInStream_Read(stream, buf, size); /* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */ } -static SRes Xz_ReadBackward(CXzStream *p, ILookInStream *stream, Int64 *startOffset, ISzAllocPtr alloc) + +/* +in: + (*startOffset) is position in (stream) where xz_stream must be finished. +out: + if returns SZ_OK, then (*startOffset) is position in stream that shows start of xz_stream. +*/ +static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc) { - UInt64 indexSize; - Byte buf[XZ_STREAM_FOOTER_SIZE]; - UInt64 pos = *startOffset; + #define TEMP_BUF_SIZE (1 << 10) + UInt32 buf32[TEMP_BUF_SIZE / 4]; + UInt64 pos = (UInt64)*startOffset; - if ((pos & 3) != 0 || pos < XZ_STREAM_FOOTER_SIZE) + if ((pos & 3) || pos < XZ_STREAM_FOOTER_SIZE) return SZ_ERROR_NO_ARCHIVE; - pos -= XZ_STREAM_FOOTER_SIZE; - RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)); + RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE)) - if (!XZ_FOOTER_SIG_CHECK(buf + 10)) + if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32)) { - UInt32 total = 0; pos += XZ_STREAM_FOOTER_SIZE; - for (;;) { - size_t i; - #define TEMP_BUF_SIZE (1 << 10) - Byte temp[TEMP_BUF_SIZE]; - - i = (pos > TEMP_BUF_SIZE) ? TEMP_BUF_SIZE : (size_t)pos; + // pos != 0 + // (pos & 3) == 0 + size_t i = pos >= TEMP_BUF_SIZE ? TEMP_BUF_SIZE : (size_t)pos; pos -= i; - RINOK(LookInStream_SeekRead_ForArc(stream, pos, temp, i)); - total += (UInt32)i; - for (; i != 0; i--) - if (temp[i - 1] != 0) + RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, i)) + i /= 4; + do + if (buf32[i - 1] != 0) break; - if (i != 0) - { - if ((i & 3) != 0) - return SZ_ERROR_NO_ARCHIVE; - pos += i; - break; - } - if (pos < XZ_STREAM_FOOTER_SIZE || total > (1 << 16)) + while (--i); + + pos += i * 4; + #define XZ_STREAM_BACKWARD_READING_PAD_MAX (1 << 16) + // here we don't support rare case with big padding for xz stream. + // so we have padding limit for backward reading. + if ((UInt64)*startOffset - pos > XZ_STREAM_BACKWARD_READING_PAD_MAX) return SZ_ERROR_NO_ARCHIVE; + if (i) + break; } - + // we try to open xz stream after skipping zero padding. + // ((UInt64)*startOffset == pos) is possible here! 
if (pos < XZ_STREAM_FOOTER_SIZE) return SZ_ERROR_NO_ARCHIVE; pos -= XZ_STREAM_FOOTER_SIZE; - RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)); - if (!XZ_FOOTER_SIG_CHECK(buf + 10)) + RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE)) + if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32)) return SZ_ERROR_NO_ARCHIVE; } - p->flags = (CXzStreamFlags)GetBe16(buf + 8); - + p->flags = (CXzStreamFlags)GetBe16a(buf32 + 2); if (!XzFlags_IsSupported(p->flags)) return SZ_ERROR_UNSUPPORTED; - - if (GetUi32(buf) != CrcCalc(buf + 4, 6)) - return SZ_ERROR_ARCHIVE; - - indexSize = ((UInt64)GetUi32(buf + 4) + 1) << 2; - - if (pos < indexSize) - return SZ_ERROR_ARCHIVE; - - pos -= indexSize; - RINOK(LookInStream_SeekTo(stream, pos)); - RINOK(Xz_ReadIndex(p, stream, indexSize, alloc)); - { - UInt64 totalSize = Xz_GetPackSize(p); - if (totalSize == XZ_SIZE_OVERFLOW - || totalSize >= ((UInt64)1 << 63) - || pos < totalSize + XZ_STREAM_HEADER_SIZE) + /* to eliminate GCC 6.3 warning: + dereferencing type-punned pointer will break strict-aliasing rules */ + const UInt32 *buf_ptr = buf32; + if (GetUi32a(buf_ptr) != CrcCalc(buf32 + 1, 6)) return SZ_ERROR_ARCHIVE; - pos -= (totalSize + XZ_STREAM_HEADER_SIZE); - RINOK(LookInStream_SeekTo(stream, pos)); - *startOffset = pos; + } + { + const UInt64 indexSize = ((UInt64)GetUi32a(buf32 + 1) + 1) << 2; + if (pos < indexSize) + return SZ_ERROR_ARCHIVE; + pos -= indexSize; + // v25.00: relaxed indexSize check. We allow big index table. + // if (indexSize > ((UInt32)1 << 31)) + if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1))) + return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE + RINOK(LookInStream_SeekTo(stream, pos)) + // RINOK(Xz_ReadIndex(p, stream, indexSize, alloc)) + { + SRes res; + const size_t size = (size_t)indexSize; + // if (size != indexSize) return SZ_ERROR_UNSUPPORTED; + Byte *buf = (Byte *)ISzAlloc_Alloc(alloc, size); + if (!buf) + return SZ_ERROR_MEM; + res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); + if (res == SZ_OK) + res = Xz_ParseIndex(p, buf, size, alloc); + ISzAlloc_Free(alloc, buf); + RINOK(res) + } + } + { + UInt64 total = Xz_GetPackSize(p); + if (total == XZ_SIZE_OVERFLOW || total >= ((UInt64)1 << 63)) + return SZ_ERROR_ARCHIVE; + total += XZ_STREAM_HEADER_SIZE; + if (pos < total) + return SZ_ERROR_ARCHIVE; + pos -= total; + RINOK(LookInStream_SeekTo(stream, pos)) + *startOffset = (Int64)pos; } { CXzStreamFlags headerFlags; CSecToRead secToRead; SecToRead_CreateVTable(&secToRead); secToRead.realStream = stream; - - RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt)); + RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt)) return (p->flags == headerFlags) ? 
SZ_OK : SZ_ERROR_ARCHIVE; } } @@ -240,8 +288,7 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStream *stream, Int64 *startOff void Xzs_Construct(CXzs *p) { - p->num = p->numAllocated = 0; - p->streams = 0; + Xzs_CONSTRUCT(p) } void Xzs_Free(CXzs *p, ISzAllocPtr alloc) @@ -251,7 +298,7 @@ void Xzs_Free(CXzs *p, ISzAllocPtr alloc) Xz_Free(&p->streams[i], alloc); ISzAlloc_Free(alloc, p->streams); p->num = p->numAllocated = 0; - p->streams = 0; + p->streams = NULL; } UInt64 Xzs_GetNumBlocks(const CXzs *p) @@ -268,7 +315,9 @@ UInt64 Xzs_GetUnpackSize(const CXzs *p) UInt64 size = 0; size_t i; for (i = 0; i < p->num; i++) - ADD_SIZE_CHECK(size, Xz_GetUnpackSize(&p->streams[i])); + { + ADD_SIZE_CHECK(size, Xz_GetUnpackSize(&p->streams[i])) + } return size; } @@ -278,42 +327,59 @@ UInt64 Xzs_GetPackSize(const CXzs *p) UInt64 size = 0; size_t i; for (i = 0; i < p->num; i++) - ADD_SIZE_CHECK(size, Xz_GetTotalSize(&p->streams[i])); + { + ADD_SIZE_CHECK(size, Xz_GetTotalSize(&p->streams[i])) + } return size; } */ -SRes Xzs_ReadBackward(CXzs *p, ILookInStream *stream, Int64 *startOffset, ICompressProgress *progress, ISzAllocPtr alloc) +SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc) { Int64 endOffset = 0; - RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END)); + // it's supposed that CXzs object is empty here. + // if CXzs object is not empty, it will add new streams to that non-empty object. + // Xzs_Free(p, alloc); // it's optional call to empty CXzs object. + RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END)) *startOffset = endOffset; for (;;) { CXzStream st; SRes res; - Xz_Construct(&st); + Xz_CONSTRUCT(&st) res = Xz_ReadBackward(&st, stream, startOffset, alloc); - st.startOffset = *startOffset; - RINOK(res); + // if (res == SZ_OK), then (*startOffset) is start offset of new stream + // if (res != SZ_OK), then (*startOffset) is unchanged or it's expected start offset of stream with error + st.startOffset = (UInt64)*startOffset; + // we must store (st) object to array, or we must free (st) local object. + if (res != SZ_OK) + { + Xz_Free(&st, alloc); + return res; + } if (p->num == p->numAllocated) { - size_t newNum = p->num + p->num / 4 + 1; - Byte *data = (Byte *)ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream)); + const size_t newNum = p->num + p->num / 4 + 1; + void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream)); if (!data) + { + Xz_Free(&st, alloc); return SZ_ERROR_MEM; + } p->numAllocated = newNum; if (p->num != 0) memcpy(data, p->streams, p->num * sizeof(CXzStream)); ISzAlloc_Free(alloc, p->streams); p->streams = (CXzStream *)data; } + // we use direct copying of raw data from local variable (st) to object in array. 
+ // so we don't need to call Xz_Free(&st, alloc) after copying and after p->num++ p->streams[p->num++] = st; if (*startOffset == 0) - break; - RINOK(LookInStream_SeekTo(stream, *startOffset)); - if (progress && ICompressProgress_Progress(progress, endOffset - *startOffset, (UInt64)(Int64)-1) != SZ_OK) + return SZ_OK; + // seek operation is optional: + // RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset)) + if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK) return SZ_ERROR_PROGRESS; } - return SZ_OK; } diff --git a/src/sdk/DOC/7zC.txt b/src/sdk/DOC/7zC.txt index 939b720..4927678 100644 --- a/src/sdk/DOC/7zC.txt +++ b/src/sdk/DOC/7zC.txt @@ -1,187 +1,187 @@ -7z ANSI-C Decoder 9.35 ----------------------- - -7z ANSI-C provides 7z/LZMA decoding. -7z ANSI-C version is simplified version ported from C++ code. - -LZMA is default and general compression method of 7z format -in 7-Zip compression program (www.7-zip.org). LZMA provides high -compression ratio and very fast decompression. - - -LICENSE -------- - -7z ANSI-C Decoder is part of the LZMA SDK. -LZMA SDK is written and placed in the public domain by Igor Pavlov. - -Files ---------------------- - -7zDecode.* - Low level 7z decoding -7zExtract.* - High level 7z decoding -7zHeader.* - .7z format constants -7zIn.* - .7z archive opening -7zItem.* - .7z structures -7zMain.c - Test application - - -How To Use ----------- - -You can create .7z archive with 7z.exe, 7za.exe or 7zr.exe: - - 7z.exe a archive.7z *.htm -r -mx -m0fb=255 - -If you have big number of files in archive, and you need fast extracting, -you can use partly-solid archives: - - 7za.exe a archive.7z *.htm -ms=512K -r -mx -m0fb=255 -m0d=512K - -In that example 7-Zip will use 512KB solid blocks. So it needs to decompress only -512KB for extracting one file from such archive. - - -Limitations of current version of 7z ANSI-C Decoder ---------------------------------------------------- - - - It reads only "FileName", "Size", "LastWriteTime" and "CRC" information for each file in archive. - - It supports only LZMA and Copy (no compression) methods with BCJ or BCJ2 filters. - - It converts original UTF-16 Unicode file names to UTF-8 Unicode file names. - -These limitations will be fixed in future versions. - - -Using 7z ANSI-C Decoder Test application: ------------------------------------------ - -Usage: 7zDec - -: - e: Extract files from archive - l: List contents of archive - t: Test integrity of archive - -Example: - - 7zDec l archive.7z - -lists contents of archive.7z - - 7zDec e archive.7z - -extracts files from archive.7z to current folder. - - -How to use .7z Decoder ----------------------- - -Memory allocation -~~~~~~~~~~~~~~~~~ - -7z Decoder uses two memory pools: -1) Temporary pool -2) Main pool -Such scheme can allow you to avoid fragmentation of allocated blocks. - - -Steps for using 7z decoder --------------------------- - -Use code at 7zMain.c as example. - -1) Declare variables: - inStream /* implements ILookInStream interface */ - CSzArEx db; /* 7z archive database structure */ - ISzAlloc allocImp; /* memory functions for main pool */ - ISzAlloc allocTempImp; /* memory functions for temporary pool */ - -2) call CrcGenerateTable(); function to initialize CRC structures. - -3) call SzArEx_Init(&db); function to initialize db structures. - -4) call SzArEx_Open(&db, inStream, &allocMain, &allocTemp) to open archive - -This function opens archive "inStream" and reads headers to "db". 
-All items in "db" will be allocated with "allocMain" functions. -SzArEx_Open function allocates and frees temporary structures by "allocTemp" functions. - -5) List items or Extract items - - Listing code: - ~~~~~~~~~~~~~ - - Use SzArEx_GetFileNameUtf16 function. Look example code in C\Util\7z\7zMain.c file. - - - Extracting code: - ~~~~~~~~~~~~~~~~ - - SZ_RESULT SzAr_Extract( - CArchiveDatabaseEx *db, - ILookInStream *inStream, - UInt32 fileIndex, /* index of file */ - UInt32 *blockIndex, /* index of solid block */ - Byte **outBuffer, /* pointer to pointer to output buffer (allocated with allocMain) */ - size_t *outBufferSize, /* buffer size for output buffer */ - size_t *offset, /* offset of stream for required file in *outBuffer */ - size_t *outSizeProcessed, /* size of file in *outBuffer */ - ISzAlloc *allocMain, - ISzAlloc *allocTemp); - - If you need to decompress more than one file, you can send these values from previous call: - blockIndex, - outBuffer, - outBufferSize, - You can consider "outBuffer" as cache of solid block. If your archive is solid, - it will increase decompression speed. - - After decompressing you must free "outBuffer": - allocImp.Free(outBuffer); - -6) call SzArEx_Free(&db, allocImp.Free) to free allocated items in "db". - - - - -Memory requirements for .7z decoding ------------------------------------- - -Memory usage for Archive opening: - - Temporary pool: - - Memory for uncompressed .7z headers - - some other temporary blocks - - Main pool: - - Memory for database: - Estimated size of one file structures in solid archive: - - Size (4 or 8 Bytes) - - CRC32 (4 bytes) - - LastWriteTime (8 bytes) - - Some file information (4 bytes) - - File Name (variable length) + pointer + allocation structures - -Memory usage for archive Decompressing: - - Temporary pool: - - Memory for LZMA decompressing structures - - Main pool: - - Memory for decompressed solid block - - Memory for temprorary buffers, if BCJ2 fileter is used. Usually these - temprorary buffers can be about 15% of solid block size. - - -7z Decoder doesn't allocate memory for compressed blocks. -Instead of this, you must allocate buffer with desired -size before calling 7z Decoder. Use 7zMain.c as example. - - -Defines -------- - -_SZ_ALLOC_DEBUG - define it if you want to debug alloc/free operations to stderr. - - ---- - -http://www.7-zip.org -http://www.7-zip.org/sdk.html -http://www.7-zip.org/support.html +7z ANSI-C Decoder 9.35 +---------------------- + +7z ANSI-C provides 7z/LZMA decoding. +7z ANSI-C version is simplified version ported from C++ code. + +LZMA is default and general compression method of 7z format +in 7-Zip compression program (www.7-zip.org). LZMA provides high +compression ratio and very fast decompression. + + +LICENSE +------- + +7z ANSI-C Decoder is part of the LZMA SDK. +LZMA SDK is written and placed in the public domain by Igor Pavlov. + +Files +--------------------- + +7zDecode.* - Low level 7z decoding +7zExtract.* - High level 7z decoding +7zHeader.* - .7z format constants +7zIn.* - .7z archive opening +7zItem.* - .7z structures +7zMain.c - Test application + + +How To Use +---------- + +You can create .7z archive with 7z.exe, 7za.exe or 7zr.exe: + + 7z.exe a archive.7z *.htm -r -mx -m0fb=255 + +If you have big number of files in archive, and you need fast extracting, +you can use partly-solid archives: + + 7za.exe a archive.7z *.htm -ms=512K -r -mx -m0fb=255 -m0d=512K + +In that example 7-Zip will use 512KB solid blocks. 
So it needs to decompress only +512KB for extracting one file from such archive. + + +Limitations of current version of 7z ANSI-C Decoder +--------------------------------------------------- + + - It reads only "FileName", "Size", "LastWriteTime" and "CRC" information for each file in archive. + - It supports only LZMA and Copy (no compression) methods with BCJ or BCJ2 filters. + - It converts original UTF-16 Unicode file names to UTF-8 Unicode file names. + +These limitations will be fixed in future versions. + + +Using 7z ANSI-C Decoder Test application: +----------------------------------------- + +Usage: 7zDec + +: + e: Extract files from archive + l: List contents of archive + t: Test integrity of archive + +Example: + + 7zDec l archive.7z + +lists contents of archive.7z + + 7zDec e archive.7z + +extracts files from archive.7z to current folder. + + +How to use .7z Decoder +---------------------- + +Memory allocation +~~~~~~~~~~~~~~~~~ + +7z Decoder uses two memory pools: +1) Temporary pool +2) Main pool +Such scheme can allow you to avoid fragmentation of allocated blocks. + + +Steps for using 7z decoder +-------------------------- + +Use code at 7zMain.c as example. + +1) Declare variables: + inStream /* implements ILookInStream interface */ + CSzArEx db; /* 7z archive database structure */ + ISzAlloc allocImp; /* memory functions for main pool */ + ISzAlloc allocTempImp; /* memory functions for temporary pool */ + +2) call CrcGenerateTable(); function to initialize CRC structures. + +3) call SzArEx_Init(&db); function to initialize db structures. + +4) call SzArEx_Open(&db, inStream, &allocMain, &allocTemp) to open archive + +This function opens archive "inStream" and reads headers to "db". +All items in "db" will be allocated with "allocMain" functions. +SzArEx_Open function allocates and frees temporary structures by "allocTemp" functions. + +5) List items or Extract items + + Listing code: + ~~~~~~~~~~~~~ + + Use SzArEx_GetFileNameUtf16 function. Look example code in C\Util\7z\7zMain.c file. + + + Extracting code: + ~~~~~~~~~~~~~~~~ + + SZ_RESULT SzAr_Extract( + CArchiveDatabaseEx *db, + ILookInStream *inStream, + UInt32 fileIndex, /* index of file */ + UInt32 *blockIndex, /* index of solid block */ + Byte **outBuffer, /* pointer to pointer to output buffer (allocated with allocMain) */ + size_t *outBufferSize, /* buffer size for output buffer */ + size_t *offset, /* offset of stream for required file in *outBuffer */ + size_t *outSizeProcessed, /* size of file in *outBuffer */ + ISzAlloc *allocMain, + ISzAlloc *allocTemp); + + If you need to decompress more than one file, you can send these values from previous call: + blockIndex, + outBuffer, + outBufferSize, + You can consider "outBuffer" as cache of solid block. If your archive is solid, + it will increase decompression speed. + + After decompressing you must free "outBuffer": + allocImp.Free(outBuffer); + +6) call SzArEx_Free(&db, allocImp.Free) to free allocated items in "db". 
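Putting steps 1) through 6) together, the open / extract / free sequence fits in a few dozen lines of C. The sketch below is only an outline modeled on the SDK's 7zMain.c: it uses the SzArEx_* entry points named in the steps above (the current sources spell the extraction call SzArEx_Extract with ISzAllocPtr arguments, which differs slightly from the older SzAr_Extract prototype quoted above), and the stream setup (CLookToRead2, 256 KB read buffer) and error handling are simplified assumptions, so treat 7zMain.c as the authoritative example.

/* Open archive, extract first file, free everything (sketch; see 7zMain.c). */
#include "7z.h"
#include "7zAlloc.h"
#include "7zCrc.h"
#include "7zFile.h"

static int ExtractFirstFile(const char *path)
{
  CFileInStream archiveStream;
  CLookToRead2 lookStream;
  CSzArEx db;                                          /* 7z archive database */
  ISzAlloc allocImp = { SzAlloc, SzFree };             /* main pool */
  ISzAlloc allocTempImp = { SzAllocTemp, SzFreeTemp }; /* temporary pool */
  SRes res;

  if (InFile_Open(&archiveStream.file, path) != 0)
    return 1;
  FileInStream_CreateVTable(&archiveStream);
  LookToRead2_CreateVTable(&lookStream, False);
  lookStream.buf = (Byte *)ISzAlloc_Alloc(&allocImp, (size_t)1 << 18);
  lookStream.bufSize = (size_t)1 << 18;
  lookStream.realStream = &archiveStream.vt;
  lookStream.pos = lookStream.size = 0;     /* same effect as the LookToRead2 init macro */
  if (!lookStream.buf)
  {
    File_Close(&archiveStream.file);
    return 1;
  }

  CrcGenerateTable();                                  /* step 2 */
  SzArEx_Init(&db);                                    /* step 3 */
  res = SzArEx_Open(&db, &lookStream.vt, &allocImp, &allocTempImp);  /* step 4 */

  if (res == SZ_OK && db.NumFiles != 0)
  {
    /* step 5: extract file 0; keep blockIndex / outBuffer / outBufferSize
       across calls so the solid-block cache is reused for further files */
    UInt32 blockIndex = 0xFFFFFFFF;
    Byte *outBuffer = NULL;
    size_t outBufferSize = 0, offset = 0, outSizeProcessed = 0;
    res = SzArEx_Extract(&db, &lookStream.vt, 0,
        &blockIndex, &outBuffer, &outBufferSize,
        &offset, &outSizeProcessed, &allocImp, &allocTempImp);
    /* on success the file data is at outBuffer + offset, outSizeProcessed bytes */
    ISzAlloc_Free(&allocImp, outBuffer);
  }

  SzArEx_Free(&db, &allocImp);                         /* step 6 */
  ISzAlloc_Free(&allocImp, lookStream.buf);
  File_Close(&archiveStream.file);
  return (res == SZ_OK) ? 0 : 1;
}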
+ + + + +Memory requirements for .7z decoding +------------------------------------ + +Memory usage for Archive opening: + - Temporary pool: + - Memory for uncompressed .7z headers + - some other temporary blocks + - Main pool: + - Memory for database: + Estimated size of one file structures in solid archive: + - Size (4 or 8 Bytes) + - CRC32 (4 bytes) + - LastWriteTime (8 bytes) + - Some file information (4 bytes) + - File Name (variable length) + pointer + allocation structures + +Memory usage for archive Decompressing: + - Temporary pool: + - Memory for LZMA decompressing structures + - Main pool: + - Memory for decompressed solid block + - Memory for temprorary buffers, if BCJ2 fileter is used. Usually these + temprorary buffers can be about 15% of solid block size. + + +7z Decoder doesn't allocate memory for compressed blocks. +Instead of this, you must allocate buffer with desired +size before calling 7z Decoder. Use 7zMain.c as example. + + +Defines +------- + +_SZ_ALLOC_DEBUG - define it if you want to debug alloc/free operations to stderr. + + +--- + +http://www.7-zip.org +http://www.7-zip.org/sdk.html +http://www.7-zip.org/support.html diff --git a/src/sdk/DOC/7zFormat.txt b/src/sdk/DOC/7zFormat.txt index 74cdfa4..9239e93 100644 --- a/src/sdk/DOC/7zFormat.txt +++ b/src/sdk/DOC/7zFormat.txt @@ -1,469 +1,469 @@ -7z Format description (18.06) ----------------------------- - -This file contains description of 7z archive format. -7z archive can contain files compressed with any method. -See "Methods.txt" for description for defined compressing methods. - - -Format structure Overview -------------------------- - -Some fields can be optional. - -Archive structure -~~~~~~~~~~~~~~~~~ -SignatureHeader -[PackedStreams] -[PackedStreamsForHeaders] -[ - Header - or - { - Packed Header - HeaderInfo - } -] - - - -Header structure -~~~~~~~~~~~~~~~~ -{ - ArchiveProperties - AdditionalStreams - { - PackInfo - { - PackPos - NumPackStreams - Sizes[NumPackStreams] - CRCs[NumPackStreams] - } - CodersInfo - { - NumFolders - Folders[NumFolders] - { - NumCoders - CodersInfo[NumCoders] - { - ID - NumInStreams; - NumOutStreams; - PropertiesSize - Properties[PropertiesSize] - } - NumBindPairs - BindPairsInfo[NumBindPairs] - { - InIndex; - OutIndex; - } - PackedIndices - } - UnPackSize[Folders][Folders.NumOutstreams] - CRCs[NumFolders] - } - SubStreamsInfo - { - NumUnPackStreamsInFolders[NumFolders]; - UnPackSizes[] - CRCs[] - } - } - MainStreamsInfo - { - (Same as in AdditionalStreams) - } - FilesInfo - { - NumFiles - Properties[] - { - ID - Size - Data - } - } -} - -HeaderInfo structure -~~~~~~~~~~~~~~~~~~~~ -{ - (Same as in AdditionalStreams) -} - - - -Notes about Notation and encoding ---------------------------------- - -7z uses little endian encoding. - -7z archive format has optional headers that are marked as -[] -Header -[] - -REAL_UINT64 means real UINT64. - -UINT64 means real UINT64 encoded with the following scheme: - - Size of encoding sequence depends from first byte: - First_Byte Extra_Bytes Value - (binary) - 0xxxxxxx : ( xxxxxxx ) - 10xxxxxx BYTE y[1] : ( xxxxxx << (8 * 1)) + y - 110xxxxx BYTE y[2] : ( xxxxx << (8 * 2)) + y - ... 
- 1111110x BYTE y[6] : ( x << (8 * 6)) + y - 11111110 BYTE y[7] : y - 11111111 BYTE y[8] : y - - - -Property IDs ------------- - -0x00 = kEnd - -0x01 = kHeader - -0x02 = kArchiveProperties - -0x03 = kAdditionalStreamsInfo -0x04 = kMainStreamsInfo -0x05 = kFilesInfo - -0x06 = kPackInfo -0x07 = kUnPackInfo -0x08 = kSubStreamsInfo - -0x09 = kSize -0x0A = kCRC - -0x0B = kFolder - -0x0C = kCodersUnPackSize -0x0D = kNumUnPackStream - -0x0E = kEmptyStream -0x0F = kEmptyFile -0x10 = kAnti - -0x11 = kName -0x12 = kCTime -0x13 = kATime -0x14 = kMTime -0x15 = kWinAttributes -0x16 = kComment - -0x17 = kEncodedHeader - -0x18 = kStartPos -0x19 = kDummy - - -7z format headers ------------------ - -SignatureHeader -~~~~~~~~~~~~~~~ - BYTE kSignature[6] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C}; - - ArchiveVersion - { - BYTE Major; // now = 0 - BYTE Minor; // now = 4 - }; - - UINT32 StartHeaderCRC; - - StartHeader - { - REAL_UINT64 NextHeaderOffset - REAL_UINT64 NextHeaderSize - UINT32 NextHeaderCRC - } - - -........................... - - -ArchiveProperties -~~~~~~~~~~~~~~~~~ -BYTE NID::kArchiveProperties (0x02) -for (;;) -{ - BYTE PropertyType; - if (aType == 0) - break; - UINT64 PropertySize; - BYTE PropertyData[PropertySize]; -} - - -Digests (NumStreams) -~~~~~~~~~~~~~~~~~~~~~ - BYTE AllAreDefined - if (AllAreDefined == 0) - { - for(NumStreams) - BIT Defined - } - UINT32 CRCs[NumDefined] - - -PackInfo -~~~~~~~~~~~~ - BYTE NID::kPackInfo (0x06) - UINT64 PackPos - UINT64 NumPackStreams - - [] - BYTE NID::kSize (0x09) - UINT64 PackSizes[NumPackStreams] - [] - - [] - BYTE NID::kCRC (0x0A) - PackStreamDigests[NumPackStreams] - [] - - BYTE NID::kEnd - - -Folder -~~~~~~ - UINT64 NumCoders; - for (NumCoders) - { - BYTE - { - 0:3 CodecIdSize - 4: Is Complex Coder - 5: There Are Attributes - 6: Reserved - 7: There are more alternative methods. (Not used anymore, must be 0). 
- } - BYTE CodecId[CodecIdSize] - if (Is Complex Coder) - { - UINT64 NumInStreams; - UINT64 NumOutStreams; - } - if (There Are Attributes) - { - UINT64 PropertiesSize - BYTE Properties[PropertiesSize] - } - } - - NumBindPairs = NumOutStreamsTotal - 1; - - for (NumBindPairs) - { - UINT64 InIndex; - UINT64 OutIndex; - } - - NumPackedStreams = NumInStreamsTotal - NumBindPairs; - if (NumPackedStreams > 1) - for(NumPackedStreams) - { - UINT64 Index; - }; - - - - -Coders Info -~~~~~~~~~~~ - - BYTE NID::kUnPackInfo (0x07) - - - BYTE NID::kFolder (0x0B) - UINT64 NumFolders - BYTE External - switch(External) - { - case 0: - Folders[NumFolders] - case 1: - UINT64 DataStreamIndex - } - - - BYTE ID::kCodersUnPackSize (0x0C) - for(Folders) - for(Folder.NumOutStreams) - UINT64 UnPackSize; - - - [] - BYTE NID::kCRC (0x0A) - UnPackDigests[NumFolders] - [] - - - - BYTE NID::kEnd - - - -SubStreams Info -~~~~~~~~~~~~~~ - BYTE NID::kSubStreamsInfo; (0x08) - - [] - BYTE NID::kNumUnPackStream; (0x0D) - UINT64 NumUnPackStreamsInFolders[NumFolders]; - [] - - - [] - BYTE NID::kSize (0x09) - UINT64 UnPackSizes[] - [] - - - [] - BYTE NID::kCRC (0x0A) - Digests[Number of streams with unknown CRC] - [] - - - BYTE NID::kEnd - - -Streams Info -~~~~~~~~~~~~ - - [] - PackInfo - [] - - - [] - CodersInfo - [] - - - [] - SubStreamsInfo - [] - - BYTE NID::kEnd - - -FilesInfo -~~~~~~~~~ - BYTE NID::kFilesInfo; (0x05) - UINT64 NumFiles - - for (;;) - { - BYTE PropertyType; - if (aType == 0) - break; - - UINT64 Size; - - switch(PropertyType) - { - kEmptyStream: (0x0E) - for(NumFiles) - BIT IsEmptyStream - - kEmptyFile: (0x0F) - for(EmptyStreams) - BIT IsEmptyFile - - kAnti: (0x10) - for(EmptyStreams) - BIT IsAntiFile - - case kCTime: (0x12) - case kATime: (0x13) - case kMTime: (0x14) - BYTE AllAreDefined - if (AllAreDefined == 0) - { - for(NumFiles) - BIT TimeDefined - } - BYTE External; - if(External != 0) - UINT64 DataIndex - [] - for(Definded Items) - REAL_UINT64 Time - [] - - kNames: (0x11) - BYTE External; - if(External != 0) - UINT64 DataIndex - [] - for(Files) - { - wchar_t Names[NameSize]; - wchar_t 0; - } - [] - - kAttributes: (0x15) - BYTE AllAreDefined - if (AllAreDefined == 0) - { - for(NumFiles) - BIT AttributesAreDefined - } - BYTE External; - if(External != 0) - UINT64 DataIndex - [] - for(Definded Attributes) - UINT32 Attributes - [] - } - } - - -Header -~~~~~~ - BYTE NID::kHeader (0x01) - - [] - ArchiveProperties - [] - - [] - BYTE NID::kAdditionalStreamsInfo; (0x03) - StreamsInfo - [] - - [] - BYTE NID::kMainStreamsInfo; (0x04) - StreamsInfo - [] - - [] - FilesInfo - [] - - BYTE NID::kEnd - - -HeaderInfo -~~~~~~~~~~ - [] - BYTE NID::kEncodedHeader; (0x17) - StreamsInfo for Encoded Header - [] - - ---- -End of document +7z Format description (18.06) +---------------------------- + +This file contains description of 7z archive format. +7z archive can contain files compressed with any method. +See "Methods.txt" for description for defined compressing methods. + + +Format structure Overview +------------------------- + +Some fields can be optional. 
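The "Notes about Notation and encoding" section of this description (it appears identically in the removed copy above and again below) defines the variable-length UINT64 number encoding: the run of leading 1-bits in the first byte tells how many extra little-endian bytes follow, and the remaining low bits of the first byte become the highest part of the value. A minimal reader for that encoding is sketched below; the CInBuf structure and the Read7zNumber/Read7zByte names are illustrative only and are not part of the SDK headers, while Byte and UInt64 are the usual 7zTypes.h typedefs.

    /* Illustrative decoder for the variable-length UINT64 described in this file. */
    typedef struct
    {
      const Byte *data;
      size_t size;
      size_t pos;
    } CInBuf;

    static int Read7zByte(CInBuf *b, Byte *out)
    {
      if (b->pos >= b->size)
        return 0;                        /* 0 means "ran out of input" */
      *out = b->data[b->pos++];
      return 1;
    }

    static int Read7zNumber(CInBuf *b, UInt64 *value)
    {
      Byte firstByte, mask = 0x80;
      unsigned i;
      if (!Read7zByte(b, &firstByte))
        return 0;
      *value = 0;
      for (i = 0; i < 8; i++)
      {
        Byte extra;
        if ((firstByte & mask) == 0)
        {
          /* the bits of the first byte below the stop bit form the high part */
          *value |= (UInt64)(firstByte & (Byte)(mask - 1)) << (8 * i);
          break;
        }
        if (!Read7zByte(b, &extra))
          return 0;
        *value |= (UInt64)extra << (8 * i);   /* extra bytes are little endian */
        mask >>= 1;
      }
      return 1;
    }

For example, the two bytes 0x81 0x23 decode to (0x01 << 8) + 0x23 = 0x123, matching the "10xxxxxx BYTE y[1]" row of the table.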
+ +Archive structure +~~~~~~~~~~~~~~~~~ +SignatureHeader +[PackedStreams] +[PackedStreamsForHeaders] +[ + Header + or + { + Packed Header + HeaderInfo + } +] + + + +Header structure +~~~~~~~~~~~~~~~~ +{ + ArchiveProperties + AdditionalStreams + { + PackInfo + { + PackPos + NumPackStreams + Sizes[NumPackStreams] + CRCs[NumPackStreams] + } + CodersInfo + { + NumFolders + Folders[NumFolders] + { + NumCoders + CodersInfo[NumCoders] + { + ID + NumInStreams; + NumOutStreams; + PropertiesSize + Properties[PropertiesSize] + } + NumBindPairs + BindPairsInfo[NumBindPairs] + { + InIndex; + OutIndex; + } + PackedIndices + } + UnPackSize[Folders][Folders.NumOutstreams] + CRCs[NumFolders] + } + SubStreamsInfo + { + NumUnPackStreamsInFolders[NumFolders]; + UnPackSizes[] + CRCs[] + } + } + MainStreamsInfo + { + (Same as in AdditionalStreams) + } + FilesInfo + { + NumFiles + Properties[] + { + ID + Size + Data + } + } +} + +HeaderInfo structure +~~~~~~~~~~~~~~~~~~~~ +{ + (Same as in AdditionalStreams) +} + + + +Notes about Notation and encoding +--------------------------------- + +7z uses little endian encoding. + +7z archive format has optional headers that are marked as +[] +Header +[] + +REAL_UINT64 means real UINT64. + +UINT64 means real UINT64 encoded with the following scheme: + + Size of encoding sequence depends from first byte: + First_Byte Extra_Bytes Value + (binary) + 0xxxxxxx : ( xxxxxxx ) + 10xxxxxx BYTE y[1] : ( xxxxxx << (8 * 1)) + y + 110xxxxx BYTE y[2] : ( xxxxx << (8 * 2)) + y + ... + 1111110x BYTE y[6] : ( x << (8 * 6)) + y + 11111110 BYTE y[7] : y + 11111111 BYTE y[8] : y + + + +Property IDs +------------ + +0x00 = kEnd + +0x01 = kHeader + +0x02 = kArchiveProperties + +0x03 = kAdditionalStreamsInfo +0x04 = kMainStreamsInfo +0x05 = kFilesInfo + +0x06 = kPackInfo +0x07 = kUnPackInfo +0x08 = kSubStreamsInfo + +0x09 = kSize +0x0A = kCRC + +0x0B = kFolder + +0x0C = kCodersUnPackSize +0x0D = kNumUnPackStream + +0x0E = kEmptyStream +0x0F = kEmptyFile +0x10 = kAnti + +0x11 = kName +0x12 = kCTime +0x13 = kATime +0x14 = kMTime +0x15 = kWinAttributes +0x16 = kComment + +0x17 = kEncodedHeader + +0x18 = kStartPos +0x19 = kDummy + + +7z format headers +----------------- + +SignatureHeader +~~~~~~~~~~~~~~~ + BYTE kSignature[6] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C}; + + ArchiveVersion + { + BYTE Major; // now = 0 + BYTE Minor; // now = 4 + }; + + UINT32 StartHeaderCRC; + + StartHeader + { + REAL_UINT64 NextHeaderOffset + REAL_UINT64 NextHeaderSize + UINT32 NextHeaderCRC + } + + +........................... + + +ArchiveProperties +~~~~~~~~~~~~~~~~~ +BYTE NID::kArchiveProperties (0x02) +for (;;) +{ + BYTE PropertyType; + if (aType == 0) + break; + UINT64 PropertySize; + BYTE PropertyData[PropertySize]; +} + + +Digests (NumStreams) +~~~~~~~~~~~~~~~~~~~~~ + BYTE AllAreDefined + if (AllAreDefined == 0) + { + for(NumStreams) + BIT Defined + } + UINT32 CRCs[NumDefined] + + +PackInfo +~~~~~~~~~~~~ + BYTE NID::kPackInfo (0x06) + UINT64 PackPos + UINT64 NumPackStreams + + [] + BYTE NID::kSize (0x09) + UINT64 PackSizes[NumPackStreams] + [] + + [] + BYTE NID::kCRC (0x0A) + PackStreamDigests[NumPackStreams] + [] + + BYTE NID::kEnd + + +Folder +~~~~~~ + UINT64 NumCoders; + for (NumCoders) + { + BYTE + { + 0:3 CodecIdSize + 4: Is Complex Coder + 5: There Are Attributes + 6: Reserved + 7: There are more alternative methods. (Not used anymore, must be 0). 
+ } + BYTE CodecId[CodecIdSize] + if (Is Complex Coder) + { + UINT64 NumInStreams; + UINT64 NumOutStreams; + } + if (There Are Attributes) + { + UINT64 PropertiesSize + BYTE Properties[PropertiesSize] + } + } + + NumBindPairs = NumOutStreamsTotal - 1; + + for (NumBindPairs) + { + UINT64 InIndex; + UINT64 OutIndex; + } + + NumPackedStreams = NumInStreamsTotal - NumBindPairs; + if (NumPackedStreams > 1) + for(NumPackedStreams) + { + UINT64 Index; + }; + + + + +Coders Info +~~~~~~~~~~~ + + BYTE NID::kUnPackInfo (0x07) + + + BYTE NID::kFolder (0x0B) + UINT64 NumFolders + BYTE External + switch(External) + { + case 0: + Folders[NumFolders] + case 1: + UINT64 DataStreamIndex + } + + + BYTE ID::kCodersUnPackSize (0x0C) + for(Folders) + for(Folder.NumOutStreams) + UINT64 UnPackSize; + + + [] + BYTE NID::kCRC (0x0A) + UnPackDigests[NumFolders] + [] + + + + BYTE NID::kEnd + + + +SubStreams Info +~~~~~~~~~~~~~~ + BYTE NID::kSubStreamsInfo; (0x08) + + [] + BYTE NID::kNumUnPackStream; (0x0D) + UINT64 NumUnPackStreamsInFolders[NumFolders]; + [] + + + [] + BYTE NID::kSize (0x09) + UINT64 UnPackSizes[] + [] + + + [] + BYTE NID::kCRC (0x0A) + Digests[Number of streams with unknown CRC] + [] + + + BYTE NID::kEnd + + +Streams Info +~~~~~~~~~~~~ + + [] + PackInfo + [] + + + [] + CodersInfo + [] + + + [] + SubStreamsInfo + [] + + BYTE NID::kEnd + + +FilesInfo +~~~~~~~~~ + BYTE NID::kFilesInfo; (0x05) + UINT64 NumFiles + + for (;;) + { + BYTE PropertyType; + if (aType == 0) + break; + + UINT64 Size; + + switch(PropertyType) + { + kEmptyStream: (0x0E) + for(NumFiles) + BIT IsEmptyStream + + kEmptyFile: (0x0F) + for(EmptyStreams) + BIT IsEmptyFile + + kAnti: (0x10) + for(EmptyStreams) + BIT IsAntiFile + + case kCTime: (0x12) + case kATime: (0x13) + case kMTime: (0x14) + BYTE AllAreDefined + if (AllAreDefined == 0) + { + for(NumFiles) + BIT TimeDefined + } + BYTE External; + if(External != 0) + UINT64 DataIndex + [] + for(Definded Items) + REAL_UINT64 Time + [] + + kNames: (0x11) + BYTE External; + if(External != 0) + UINT64 DataIndex + [] + for(Files) + { + wchar_t Names[NameSize]; + wchar_t 0; + } + [] + + kAttributes: (0x15) + BYTE AllAreDefined + if (AllAreDefined == 0) + { + for(NumFiles) + BIT AttributesAreDefined + } + BYTE External; + if(External != 0) + UINT64 DataIndex + [] + for(Definded Attributes) + UINT32 Attributes + [] + } + } + + +Header +~~~~~~ + BYTE NID::kHeader (0x01) + + [] + ArchiveProperties + [] + + [] + BYTE NID::kAdditionalStreamsInfo; (0x03) + StreamsInfo + [] + + [] + BYTE NID::kMainStreamsInfo; (0x04) + StreamsInfo + [] + + [] + FilesInfo + [] + + BYTE NID::kEnd + + +HeaderInfo +~~~~~~~~~~ + [] + BYTE NID::kEncodedHeader; (0x17) + StreamsInfo for Encoded Header + [] + + +--- +End of document diff --git a/src/sdk/DOC/Methods.txt b/src/sdk/DOC/Methods.txt index d4a1b1d..b840b55 100644 --- a/src/sdk/DOC/Methods.txt +++ b/src/sdk/DOC/Methods.txt @@ -1,173 +1,177 @@ -7-Zip method IDs for 7z and xz archives ---------------------------------------- - -Version: 18.06 -Date: 2018-06-30 - -Each compression or crypto method in 7z is associated with unique binary value (ID). -The length of ID in bytes is arbitrary but it can not exceed 63 bits (8 bytes). - -xz and 7z formats use same ID map. - -If you want to add some new ID, you have two ways: - 1) Write request for allocating IDs to 7-Zip developers. - 2) Generate 8-bytes ID: - - 3F ZZ ZZ ZZ ZZ ZZ MM MM - - 3F - Prefix for random IDs (1 byte) - ZZ ZZ ZZ ZZ ZZ - Developer ID (5 bytes). Use real random bytes. 
- - MM MM - Method ID (2 bytes) - - You can notify 7-Zip developers about your Developer ID / Method ID. - - Note: Use new ID, if old codec can not decode data encoded with new version. - - -List of defined IDs -------------------- - -00 - Copy - -03 - Delta -04 - BCJ (x86) -05 - PPC (big-endian) -06 - IA64 -07 - ARM (little-endian) -08 - ARMT (little-endian) -09 - SPARC - -21 - LZMA2 - -02.. - Common - 03 [Swap] - - 2 Swap2 - - 4 Swap4 - -03.. - 7z - 01 - - 01 - LZMA - - 03 - [Branch Codecs] - 01 - [x86 Codecs] - 03 - BCJ - 1B - BCJ2 (4 packed streams) - 02 - - 05 - PPC (big-endian) - 03 - - 01 - Alpha - 04 - - 01 - IA64 - 05 - - 01 - ARM (little-endian) - 06 - - 05 - M68 (big-endian) - 07 - - 01 - ARMT (little-endian) - 08 - - 05 - SPARC - - 04 - - 01 - PPMD - - 7F - - 01 - experimental method. - - -04.. - Misc codecs - - 00 - Reserved - - 01 - [Zip] - 00 - Copy (not used. Use {00} instead) - 01 - Shrink - 06 - Implode - 08 - Deflate - 09 - Deflate64 - 0A - Imploding - 0C - BZip2 (not used. Use {040202} instead) - 0E - LZMA (LZMA-zip) - 5F - xz - 60 - Jpeg - 61 - WavPack - 62 - PPMd (PPMd-zip) - 63 - wzAES - - 02 - - 02 - BZip2 - - 03 - [Rar] - 01 - Rar1 - 02 - Rar2 - 03 - Rar3 - 05 - Rar5 - - 04 - [Arj] - 01 - Arj(1,2,3) - 02 - Arj4 - - 05 - [Z] - - 06 - [Lzh] - - 07 - Reserved for 7z - - 08 - [Cab] - - 09 - [NSIS] - 01 - DeflateNSIS - 02 - BZip2NSIS - - F7 - External codecs (that are not included to 7-Zip) - - 0x xx - reserved - - 10 xx - reserved (LZHAM) - 01 - LZHAM - - 11 xx - reserved (Tino Reichardt) - 01 - ZSTD - 02 - BROTLI - 04 - LZ4 - 05 - LZ5 - 06 - LIZARD - - 12 xx - reserverd (Denis Anisimov) - - 01 - WavPack2 - FE - eSplitter - FF - RawSplitter - - -06.. - Crypto - - F0 - Ciphers without hashing algo - - 01 - [AES] - 0x - AES-128 - 4x - AES-192 - 8x - AES-256 - Cx - AES - - x0 - ECB - x1 - CBC - x2 - CFB - x3 - OFB - x4 - CTR - - F1 - Combine Ciphers - - 01 - [Zip] - 01 - ZipCrypto (Main Zip crypto algo) - - 03 - [RAR] - 02 - - 03 - Rar29AES (AES-128 + modified SHA-1) - - 07 - [7z] - 01 - 7zAES (AES-256 + SHA-256) - - ---- -End of document +7-Zip method IDs for 7z and xz archives +--------------------------------------- + +Version: 24.02 +Date: 2024-03-22 + +Each compression or crypto method in 7z is associated with unique binary value (ID). +The length of ID in bytes is arbitrary but it can not exceed 63 bits (8 bytes). + +xz and 7z formats use same ID map. + +If you want to add some new ID, you have two ways: + 1) Write request for allocating IDs to 7-Zip developers. + 2) Generate 8-bytes ID: + + 3F ZZ ZZ ZZ ZZ ZZ MM MM + + 3F - Prefix for random IDs (1 byte) + ZZ ZZ ZZ ZZ ZZ - Developer ID (5 bytes). Use real random bytes. + + MM MM - Method ID (2 bytes) + + You can notify 7-Zip developers about your Developer ID / Method ID. + + Note: Use new ID, if old codec can not decode data encoded with new version. + + +List of defined IDs +------------------- + +00 - Copy + +03 - Delta +04 - BCJ (x86) +05 - PPC (big-endian) +06 - IA64 +07 - ARM (little-endian) +08 - ARMT (little-endian) +09 - SPARC +0A - ARM64 +0B - RISCV + +21 - LZMA2 + +02.. - Common + 03 [Swap] + - 2 Swap2 + - 4 Swap4 + +03.. - 7z + 01 - + 01 - LZMA + + 03 - [Branch Codecs] + 01 - [x86 Codecs] + 03 - BCJ + 1B - BCJ2 (4 packed streams) + 02 - + 05 - PPC (big-endian) + 03 - + 01 - Alpha + 04 - + 01 - IA64 + 05 - + 01 - ARM (little-endian) + 06 - + 05 - M68 (big-endian) + 07 - + 01 - ARMT (little-endian) + 08 - + 05 - SPARC + + 04 - + 01 - PPMD + + 7F - + 01 - experimental method. + + +04.. 
- Misc codecs + + 00 - Reserved + + 01 - [Zip] + 00 - Copy (not used. Use {00} instead) + 01 - Shrink + 06 - Implode + 08 - Deflate + 09 - Deflate64 + 0A - Imploding + 0C - BZip2 (not used. Use {040202} instead) + 0E - LZMA (LZMA-zip) + + 5D - ZSTD + 5F - xz + 60 - Jpeg + 61 - WavPack + 62 - PPMd (PPMd-zip) + 63 - wzAES + + 02 - + 02 - BZip2 + + 03 - [Rar] + 01 - Rar1 + 02 - Rar2 + 03 - Rar3 + 05 - Rar5 + + 04 - [Arj] + 01 - Arj(1,2,3) + 02 - Arj4 + + 05 - [Z] + + 06 - [Lzh] + + 07 - Reserved for 7z + + 08 - [Cab] + + 09 - [NSIS] + 01 - DeflateNSIS + 02 - BZip2NSIS + + F7 - External codecs (that are not included to 7-Zip) + + 0x xx - reserved + + 10 xx - reserved (LZHAM) + 01 - LZHAM + + 11 xx - reserved (Tino Reichardt) + 01 - ZSTD + 02 - BROTLI + 04 - LZ4 + 05 - LZ5 + 06 - LIZARD + + 12 xx - reserverd (Denis Anisimov) + + 01 - WavPack2 + FE - eSplitter + FF - RawSplitter + + +06.. - Crypto + + F0 - Ciphers without hashing algo + + 01 - [AES] + 0x - AES-128 + 4x - AES-192 + 8x - AES-256 + Cx - AES + + x0 - ECB + x1 - CBC + x2 - CFB + x3 - OFB + x4 - CTR + + F1 - Combine Ciphers + + 01 - [Zip] + 01 - ZipCrypto (Main Zip crypto algo) + + 03 - [RAR] + 02 - + 03 - Rar29AES (AES-128 + modified SHA-1) + + 07 - [7z] + 01 - 7zAES (AES-256 + SHA-256) + + +--- +End of document diff --git a/src/sdk/DOC/installer.txt b/src/sdk/DOC/installer.txt index b99d21d..70ad7dc 100644 --- a/src/sdk/DOC/installer.txt +++ b/src/sdk/DOC/installer.txt @@ -1,166 +1,166 @@ -7-Zip for installers 9.38 -------------------------- - -7-Zip is a file archiver for Windows NT/2000/2003/2008/XP/Vista/7/8/10. - -7-Zip for installers is part of LZMA SDK. -LZMA SDK is written and placed in the public domain by Igor Pavlov. - -It's allowed to join 7-Zip SFX module with another software. -It's allowed to change resources of 7-Zip's SFX modules. - - -HOW to use ------------ - -7zr.exe is reduced version of 7za.exe of 7-Zip. -7zr.exe supports only format with these codecs: LZMA, LZMA2, BCJ, BCJ2, ARM, Copy. - -Example of compressing command for installation packages: - -7zr a archive.7z files - -7zSD.sfx is SFX module for installers. 7zSD.sfx uses msvcrt.dll. - -SFX modules for installers allow to create installation program. -Such module extracts archive to temp folder and then runs specified program and removes -temp files after program finishing. Self-extract archive for installers must be created -as joining 3 files: SFX_Module, Installer_Config, 7z_Archive. -Installer_Config is optional file. You can use the following command to create installer -self-extract archive: - -copy /b 7zSD.sfx + config.txt + archive.7z archive.exe - -The smallest installation package size can be achieved, if installation files was -uncompressed before including to 7z archive. - --y switch for installer module (at runtime) specifies quiet mode for extracting. - -Installer Config file format -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Config file contains commands for Installer. File begins from string -;!@Install@!UTF-8! and ends with ;!@InstallEnd@!. File must be written -in UTF-8 encoding. File contains string pairs: - -ID_String="Value" - -ID_String Description - -Title Title for messages -BeginPrompt Begin Prompt message -Progress Value can be "yes" or "no". Default value is "yes". -RunProgram Command for executing. Default value is "setup.exe". - Substring %%T will be replaced with path to temporary - folder, where files were extracted -Directory Directory prefix for "RunProgram". 
Default value is ".\\" -ExecuteFile Name of file for executing -ExecuteParameters Parameters for "ExecuteFile" - - -You can omit any string pair. - -There are two ways to run program: RunProgram and ExecuteFile. -Use RunProgram, if you want to run some program from .7z archive. -Use ExecuteFile, if you want to open some document from .7z archive or -if you want to execute some command from Windows. - -If you use RunProgram and if you specify empty directory prefix: Directory="", -the system searches for the executable file in the following sequence: - -1. The directory from which the application (installer) loaded. -2. The temporary folder, where files were extracted. -3. The Windows system directory. - - -Config file Examples -~~~~~~~~~~~~~~~~~~~~ - -;!@Install@!UTF-8! -Title="7-Zip 4.00" -BeginPrompt="Do you want to install the 7-Zip 4.00?" -RunProgram="setup.exe" -;!@InstallEnd@! - - - -;!@Install@!UTF-8! -Title="7-Zip 4.00" -BeginPrompt="Do you want to install the 7-Zip 4.00?" -ExecuteFile="7zip.msi" -;!@InstallEnd@! - - - -;!@Install@!UTF-8! -Title="7-Zip 4.01 Update" -BeginPrompt="Do you want to install the 7-Zip 4.01 Update?" -ExecuteFile="msiexec.exe" -ExecuteParameters="/i 7zip.msi REINSTALL=ALL REINSTALLMODE=vomus" -;!@InstallEnd@! - - - -Small SFX modules for installers --------------------------------- - -7zS2.sfx - small SFX module (GUI version) -7zS2con.sfx - small SFX module (Console version) - -Small SFX modules support this codecs: LZMA, LZMA2, BCJ, BCJ2, ARM, COPY - -Small SFX module is similar to common SFX module for installers. -The difference (what's new in small version): - - Smaller size (30 KB vs 100 KB) - - C source code instead of Ñ++ - - No installer Configuration file - - No extracting progress window - - It decompresses solid 7z blocks (it can be whole 7z archive) to RAM. - So user that calls SFX installer must have free RAM of size of largest - solid 7z block (size of 7z archive at simplest case). - -How to use ----------- - -copy /b 7zS2.sfx + archive.7z sfx.exe - -When you run installer sfx module (sfx.exe) -1) It creates "7zNNNNNNNN" temp folder in system temp folder. -2) It extracts .7z archive to that folder -3) It executes one file from "7zNNNNNNNN" temp folder. -4) It removes "7zNNNNNNNN" temp folder - -You can send parameters to installer, and installer will transfer them to extracted .exe file. - -Small SFX uses 3 levels of priorities to select file to execute: - - 1) Files in root folder have higher priority than files in subfolders. - 2) File extension priorities (from high to low priority order): - bat, cmd, exe, inf, msi, cab (under Windows CE), html, htm - 3) File name priorities (from high to low priority order): - setup, install, run, start - -Windows CE (ARM) version of 7zS2.sfx is included to 7-Zip for Windows Mobile package. - - -Examples --------- - -1) To create compressed console 7-Zip: - -7zr a c.7z 7z.exe 7z.dll -mx -copy /b 7zS2con.sfx + c.7z 7zCompr.exe -7zCompr.exe b -md22 - - -2) To create compressed GUI 7-Zip: - -7zr a g.7z 7zg.exe 7z.dll -mx -copy /b 7zS2.sfx + g.7z 7zgCompr.exe -7zgCompr.exe b -md22 - - -3) To open some file: - -7zr a h.7z readme.txt -mx -copy /b 7zS2.sfx + h.7z 7zTxt.exe -7zTxt.exe +7-Zip for installers 9.38 +------------------------- + +7-Zip is a file archiver for Windows NT/2000/2003/2008/XP/Vista/7/8/10. + +7-Zip for installers is part of LZMA SDK. +LZMA SDK is written and placed in the public domain by Igor Pavlov. + +It's allowed to join 7-Zip SFX module with another software. 
+It's allowed to change resources of 7-Zip's SFX modules. + + +HOW to use +----------- + +7zr.exe is reduced version of 7za.exe of 7-Zip. +7zr.exe supports only format with these codecs: LZMA, LZMA2, BCJ, BCJ2, ARM, Copy. + +Example of compressing command for installation packages: + +7zr a archive.7z files + +7zSD.sfx is SFX module for installers. 7zSD.sfx uses msvcrt.dll. + +SFX modules for installers allow to create installation program. +Such module extracts archive to temp folder and then runs specified program and removes +temp files after program finishing. Self-extract archive for installers must be created +as joining 3 files: SFX_Module, Installer_Config, 7z_Archive. +Installer_Config is optional file. You can use the following command to create installer +self-extract archive: + +copy /b 7zSD.sfx + config.txt + archive.7z archive.exe + +The smallest installation package size can be achieved, if installation files was +uncompressed before including to 7z archive. + +-y switch for installer module (at runtime) specifies quiet mode for extracting. + +Installer Config file format +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Config file contains commands for Installer. File begins from string +;!@Install@!UTF-8! and ends with ;!@InstallEnd@!. File must be written +in UTF-8 encoding. File contains string pairs: + +ID_String="Value" + +ID_String Description + +Title Title for messages +BeginPrompt Begin Prompt message +Progress Value can be "yes" or "no". Default value is "yes". +RunProgram Command for executing. Default value is "setup.exe". + Substring %%T will be replaced with path to temporary + folder, where files were extracted +Directory Directory prefix for "RunProgram". Default value is ".\\" +ExecuteFile Name of file for executing +ExecuteParameters Parameters for "ExecuteFile" + + +You can omit any string pair. + +There are two ways to run program: RunProgram and ExecuteFile. +Use RunProgram, if you want to run some program from .7z archive. +Use ExecuteFile, if you want to open some document from .7z archive or +if you want to execute some command from Windows. + +If you use RunProgram and if you specify empty directory prefix: Directory="", +the system searches for the executable file in the following sequence: + +1. The directory from which the application (installer) loaded. +2. The temporary folder, where files were extracted. +3. The Windows system directory. + + +Config file Examples +~~~~~~~~~~~~~~~~~~~~ + +;!@Install@!UTF-8! +Title="7-Zip 4.00" +BeginPrompt="Do you want to install the 7-Zip 4.00?" +RunProgram="setup.exe" +;!@InstallEnd@! + + + +;!@Install@!UTF-8! +Title="7-Zip 4.00" +BeginPrompt="Do you want to install the 7-Zip 4.00?" +ExecuteFile="7zip.msi" +;!@InstallEnd@! + + + +;!@Install@!UTF-8! +Title="7-Zip 4.01 Update" +BeginPrompt="Do you want to install the 7-Zip 4.01 Update?" +ExecuteFile="msiexec.exe" +ExecuteParameters="/i 7zip.msi REINSTALL=ALL REINSTALLMODE=vomus" +;!@InstallEnd@! + + + +Small SFX modules for installers +-------------------------------- + +7zS2.sfx - small SFX module (GUI version) +7zS2con.sfx - small SFX module (Console version) + +Small SFX modules support this codecs: LZMA, LZMA2, BCJ, BCJ2, ARM, COPY + +Small SFX module is similar to common SFX module for installers. +The difference (what's new in small version): + - Smaller size (30 KB vs 100 KB) + - C source code instead of Ñ++ + - No installer Configuration file + - No extracting progress window + - It decompresses solid 7z blocks (it can be whole 7z archive) to RAM. 
+ So user that calls SFX installer must have free RAM of size of largest + solid 7z block (size of 7z archive at simplest case). + +How to use +---------- + +copy /b 7zS2.sfx + archive.7z sfx.exe + +When you run installer sfx module (sfx.exe) +1) It creates "7zNNNNNNNN" temp folder in system temp folder. +2) It extracts .7z archive to that folder +3) It executes one file from "7zNNNNNNNN" temp folder. +4) It removes "7zNNNNNNNN" temp folder + +You can send parameters to installer, and installer will transfer them to extracted .exe file. + +Small SFX uses 3 levels of priorities to select file to execute: + + 1) Files in root folder have higher priority than files in subfolders. + 2) File extension priorities (from high to low priority order): + bat, cmd, exe, inf, msi, cab (under Windows CE), html, htm + 3) File name priorities (from high to low priority order): + setup, install, run, start + +Windows CE (ARM) version of 7zS2.sfx is included to 7-Zip for Windows Mobile package. + + +Examples +-------- + +1) To create compressed console 7-Zip: + +7zr a c.7z 7z.exe 7z.dll -mx +copy /b 7zS2con.sfx + c.7z 7zCompr.exe +7zCompr.exe b -md22 + + +2) To create compressed GUI 7-Zip: + +7zr a g.7z 7zg.exe 7z.dll -mx +copy /b 7zS2.sfx + g.7z 7zgCompr.exe +7zgCompr.exe b -md22 + + +3) To open some file: + +7zr a h.7z readme.txt -mx +copy /b 7zS2.sfx + h.7z 7zTxt.exe +7zTxt.exe diff --git a/src/sdk/DOC/lzma-history.txt b/src/sdk/DOC/lzma-history.txt index 48ee748..96da8bf 100644 --- a/src/sdk/DOC/lzma-history.txt +++ b/src/sdk/DOC/lzma-history.txt @@ -1,446 +1,651 @@ -HISTORY of the LZMA SDK ------------------------ - -19.00 2019-02-21 -------------------------- -- Encryption strength for 7z archives was increased: - the size of random initialization vector was increased from 64-bit to 128-bit, - and the pseudo-random number generator was improved. -- The bug in 7zIn.c code was fixed. - - -18.06 2018-12-30 -------------------------- -- The speed for LZMA/LZMA2 compressing was increased by 3-10%, - and there are minor changes in compression ratio. -- Some bugs were fixed. -- The bug in 7-Zip 18.02-18.05 was fixed: - There was memory leak in multithreading xz decoder - XzDecMt_Decode(), - if xz stream contains only one block. -- The changes for MSVS compiler makefiles: - - the makefiles now use "PLATFORM" macroname with values (x64, x86, arm64) - instead of "CPU" macroname with values (AMD64, ARM64). - - the makefiles by default now use static version of the run-time library. - - -18.05 2018-04-30 -------------------------- -- The speed for LZMA/LZMA2 compressing was increased - by 8% for fastest/fast compression levels and - by 3% for normal/maximum compression levels. -- Previous versions of 7-Zip could work incorrectly in "Large memory pages" mode in - Windows 10 because of some BUG with "Large Pages" in Windows 10. - Now 7-Zip doesn't use "Large Pages" on Windows 10 up to revision 1709 (16299). -- The BUG was fixed in Lzma2Enc.c - Lzma2Enc_Encode2() function worked incorretly, - if (inStream == NULL) and the number of block threads is more than 1. - - -18.03 beta 2018-03-04 -------------------------- -- Asm\x86\LzmaDecOpt.asm: new optimized LZMA decoder written in asm - for x64 with about 30% higher speed than main version of LZMA decoder written in C. -- The speed for single-thread LZMA/LZMA2 decoder written in C was increased by 3%. -- 7-Zip now can use multi-threading for 7z/LZMA2 decoding, - if there are multiple independent data chunks in LZMA2 stream. 
-- 7-Zip now can use multi-threading for xz decoding, - if there are multiple blocks in xz stream. - - -18.01 2019-01-28 -------------------------- -- The BUG in 17.01 - 18.00 beta was fixed: - XzDec.c : random block unpacking and XzUnpacker_IsBlockFinished() - didn't work correctly for xz archives without checksum (CRC). - - -18.00 beta 2019-01-10 -------------------------- -- The BUG in xz encoder was fixed: - There was memory leak of 16 KB for each file compressed with - xz compression method, if additional filter was used. - - -17.01 beta 2017-08-28 -------------------------- -- Minor speed optimization for LZMA2 (xz and 7z) multi-threading compression. - 7-Zip now uses additional memory buffers for multi-block LZMA2 compression. - CPU utilization was slightly improved. -- 7-zip now creates multi-block xz archives by default. Block size can be - specified with -ms[Size]{m|g} switch. -- xz decoder now can unpack random block from multi-block xz archives. -- 7-Zip command line: @listfile now doesn't work after -- switch. - Use -i@listfile before -- switch instead. -- The BUGs were fixed: - 7-Zip 17.00 beta crashed for commands that write anti-item to 7z archive. - - -17.00 beta 2017-04-29 -------------------------- -- NewHandler.h / NewHandler.cpp: - now it redefines operator new() only for old MSVC compilers (_MSC_VER < 1900). -- C/7zTypes.h : the names of variables in interface structures were changed (vt). -- Some bugs were fixed. 7-Zip could crash in some cases. -- Some internal changes in code. - - -16.04 2016-10-04 -------------------------- -- The bug was fixed in DllSecur.c. - - -16.03 2016-09-28 -------------------------- -- SFX modules now use some protection against DLL preloading attack. -- Some bugs in 7z code were fixed. - - -16.02 2016-05-21 -------------------------- -- The BUG in 16.00 - 16.01 was fixed: - Split Handler (SplitHandler.cpp) returned incorrect - total size value (kpidSize) for split archives. - - -16.01 2016-05-19 -------------------------- -- Some internal changes to reduce the number of compiler warnings. - - -16.00 2016-05-10 -------------------------- -- Some bugs were fixed. - - -15.12 2015-11-19 -------------------------- -- The BUG in C version of 7z decoder was fixed: - 7zDec.c : SzDecodeLzma2() - 7z decoder could mistakenly report about decoding error for some 7z archives - that use LZMA2 compression method. - The probability to get that mistaken decoding error report was about - one error per 16384 solid blocks for solid blocks larger than 16 KB (compressed size). -- The BUG (in 9.26-15.11) in C version of 7z decoder was fixed: - 7zArcIn.c : SzReadHeader2() - 7z decoder worked incorrectly for 7z archives that contain - empty solid blocks, that can be placed to 7z archive, if some file is - unavailable for reading during archive creation. - - -15.09 beta 2015-10-16 -------------------------- -- The BUG in LZMA / LZMA2 encoding code was fixed. - The BUG in LzFind.c::MatchFinder_ReadBlock() function. - If input data size is larger than (4 GiB - dictionary_size), - the following code worked incorrectly: - - LZMA : LzmaEnc_MemEncode(), LzmaEncode() : LZMA encoding functions - for compressing from memory to memory. - That BUG is not related to LZMA encoder version that works via streams. - - LZMA2 : multi-threaded version of LZMA2 encoder worked incorrectly, if - default value of chunk size (CLzma2EncProps::blockSize) is changed - to value larger than (4 GiB - dictionary_size). 
- - -9.38 beta 2015-01-03 -------------------------- -- The BUG in 9.31-9.37 was fixed: - IArchiveGetRawProps interface was disabled for 7z archives. -- The BUG in 9.26-9.36 was fixed: - Some code in CPP\7zip\Archive\7z\ worked correctly only under Windows. - - -9.36 beta 2014-12-26 -------------------------- -- The BUG in command line version was fixed: - 7-Zip created temporary archive in current folder during update archive - operation, if -w{Path} switch was not specified. - The fixed 7-Zip creates temporary archive in folder that contains updated archive. -- The BUG in 9.33-9.35 was fixed: - 7-Zip silently ignored file reading errors during 7z or gz archive creation, - and the created archive contained only part of file that was read before error. - The fixed 7-Zip stops archive creation and it reports about error. - - -9.35 beta 2014-12-07 -------------------------- -- 7zr.exe now support AES encryption. -- SFX mudules were added to LZMA SDK -- Some bugs were fixed. - - -9.21 beta 2011-04-11 -------------------------- -- New class FString for file names at file systems. -- Speed optimization in CRC code for big-endian CPUs. -- The BUG in Lzma2Dec.c was fixed: - Lzma2Decode function didn't work. - - -9.18 beta 2010-11-02 -------------------------- -- New small SFX module for installers (SfxSetup). - - -9.12 beta 2010-03-24 -------------------------- -- The BUG in LZMA SDK 9.* was fixed: LZMA2 codec didn't work, - if more than 10 threads were used (or more than 20 threads in some modes). - - -9.11 beta 2010-03-15 -------------------------- -- PPMd compression method support - - -9.09 2009-12-12 -------------------------- -- The bug was fixed: - Utf16_To_Utf8 funstions in UTFConvert.cpp and 7zMain.c - incorrectly converted surrogate characters (the code >= 0x10000) to UTF-8. -- Some bugs were fixed - - -9.06 2009-08-17 -------------------------- -- Some changes in ANSI-C 7z Decoder interfaces. - - -9.04 2009-05-30 -------------------------- -- LZMA2 compression method support -- xz format support - - -4.65 2009-02-03 -------------------------- -- Some minor fixes - - -4.63 2008-12-31 -------------------------- -- Some minor fixes - - -4.61 beta 2008-11-23 -------------------------- -- The bug in ANSI-C LZMA Decoder was fixed: - If encoded stream was corrupted, decoder could access memory - outside of allocated range. -- Some changes in ANSI-C 7z Decoder interfaces. -- LZMA SDK is placed in the public domain. - - -4.60 beta 2008-08-19 -------------------------- -- Some minor fixes. - - -4.59 beta 2008-08-13 -------------------------- -- The bug was fixed: - LZMA Encoder in fast compression mode could access memory outside of - allocated range in some rare cases. - - -4.58 beta 2008-05-05 -------------------------- -- ANSI-C LZMA Decoder was rewritten for speed optimizations. -- ANSI-C LZMA Encoder was included to LZMA SDK. -- C++ LZMA code now is just wrapper over ANSI-C code. - - -4.57 2007-12-12 -------------------------- -- Speed optimizations in Ñ++ LZMA Decoder. -- Small changes for more compatibility with some C/C++ compilers. - - -4.49 beta 2007-07-05 -------------------------- -- .7z ANSI-C Decoder: - - now it supports BCJ and BCJ2 filters - - now it supports files larger than 4 GB. - - now it supports "Last Write Time" field for files. -- C++ code for .7z archives compressing/decompressing from 7-zip - was included to LZMA SDK. - - -4.43 2006-06-04 -------------------------- -- Small changes for more compatibility with some C/C++ compilers. 
- - -4.42 2006-05-15 -------------------------- -- Small changes in .h files in ANSI-C version. - - -4.39 beta 2006-04-14 -------------------------- -- The bug in versions 4.33b:4.38b was fixed: - C++ version of LZMA encoder could not correctly compress - files larger than 2 GB with HC4 match finder (-mfhc4). - - -4.37 beta 2005-04-06 -------------------------- -- Fixes in C++ code: code could no be compiled if _NO_EXCEPTIONS was defined. - - -4.35 beta 2005-03-02 -------------------------- -- The bug was fixed in C++ version of LZMA Decoder: - If encoded stream was corrupted, decoder could access memory - outside of allocated range. - - -4.34 beta 2006-02-27 -------------------------- -- Compressing speed and memory requirements for compressing were increased -- LZMA now can use only these match finders: HC4, BT2, BT3, BT4 - - -4.32 2005-12-09 -------------------------- -- Java version of LZMA SDK was included - - -4.30 2005-11-20 -------------------------- -- Compression ratio was improved in -a2 mode -- Speed optimizations for compressing in -a2 mode -- -fb switch now supports values up to 273 -- The bug in 7z_C (7zIn.c) was fixed: - It used Alloc/Free functions from different memory pools. - So if program used two memory pools, it worked incorrectly. -- 7z_C: .7z format supporting was improved -- LZMA# SDK (C#.NET version) was included - - -4.27 (Updated) 2005-09-21 -------------------------- -- Some GUIDs/interfaces in C++ were changed. - IStream.h: - ISequentialInStream::Read now works as old ReadPart - ISequentialOutStream::Write now works as old WritePart - - -4.27 2005-08-07 -------------------------- -- The bug in LzmaDecodeSize.c was fixed: - if _LZMA_IN_CB and _LZMA_OUT_READ were defined, - decompressing worked incorrectly. - - -4.26 2005-08-05 -------------------------- -- Fixes in 7z_C code and LzmaTest.c: - previous versions could work incorrectly, - if malloc(0) returns 0 - - -4.23 2005-06-29 -------------------------- -- Small fixes in C++ code - - -4.22 2005-06-10 -------------------------- -- Small fixes - - -4.21 2005-06-08 -------------------------- -- Interfaces for ANSI-C LZMA Decoder (LzmaDecode.c) were changed -- New additional version of ANSI-C LZMA Decoder with zlib-like interface: - - LzmaStateDecode.h - - LzmaStateDecode.c - - LzmaStateTest.c -- ANSI-C LZMA Decoder now can decompress files larger than 4 GB - - -4.17 2005-04-18 -------------------------- -- New example for RAM->RAM compressing/decompressing: - LZMA + BCJ (filter for x86 code): - - LzmaRam.h - - LzmaRam.cpp - - LzmaRamDecode.h - - LzmaRamDecode.c - - -f86 switch for lzma.exe - - -4.16 2005-03-29 -------------------------- -- The bug was fixed in LzmaDecode.c (ANSI-C LZMA Decoder): - If _LZMA_OUT_READ was defined, and if encoded stream was corrupted, - decoder could access memory outside of allocated range. -- Speed optimization of ANSI-C LZMA Decoder (now it's about 20% faster). - Old version of LZMA Decoder now is in file LzmaDecodeSize.c. - LzmaDecodeSize.c can provide slightly smaller code than LzmaDecode.c -- Small speed optimization in LZMA C++ code -- filter for SPARC's code was added -- Simplified version of .7z ANSI-C Decoder was included - - -4.06 2004-09-05 -------------------------- -- The bug in v4.05 was fixed: - LZMA-Encoder didn't release output stream in some cases. 
- - -4.05 2004-08-25 -------------------------- -- Source code of filters for x86, IA-64, ARM, ARM-Thumb - and PowerPC code was included to SDK -- Some internal minor changes - - -4.04 2004-07-28 -------------------------- -- More compatibility with some C++ compilers - - -4.03 2004-06-18 -------------------------- -- "Benchmark" command was added. It measures compressing - and decompressing speed and shows rating values. - Also it checks hardware errors. - - -4.02 2004-06-10 -------------------------- -- C++ LZMA Encoder/Decoder code now is more portable - and it can be compiled by GCC on Linux. - - -4.01 2004-02-15 -------------------------- -- Some detection of data corruption was enabled. - LzmaDecode.c / RangeDecoderReadByte - ..... - { - rd->ExtraBytes = 1; - return 0xFF; - } - - -4.00 2004-02-13 -------------------------- -- Original version of LZMA SDK - - - -HISTORY of the LZMA -------------------- - 2001-2008: Improvements to LZMA compressing/decompressing code, - keeping compatibility with original LZMA format - 1996-2001: Development of LZMA compression format - - Some milestones: - - 2001-08-30: LZMA compression was added to 7-Zip - 1999-01-02: First version of 7-Zip was released - - -End of document +HISTORY of the LZMA SDK +----------------------- + +25.01 2025-08-03 +------------------------- +- The code for handling symbolic links has been changed + to provide greater security when extracting files from archives. + Command line switch -snld20 can be used to bypass default security + checks when creating symbolic links. + + +25.00 2025-07-05 +------------------------- +- 7-Zip for Windows can now use more than 64 CPU threads for compression + to zip/7z/xz archives and for the 7-Zip benchmark. + If there are more than one processor group in Windows (on systems with more than + 64 cpu threads), 7-Zip distributes running CPU threads across different processor groups. +- fixed some bugs and vulnerabilities. + + +24.09 2024-11-29 +------------------------- +- The default dictionary size values for LZMA/LZMA2 compression methods were increased: + dictionary size compression level + v24.08 v24.09 v24.09 + 32-bit 64-bit + 8 MB 16 MB 16 MB -mx4 + 16 MB 32 MB 32 MB -mx5 : Normal + 32 MB 64 MB 64 MB -mx6 + 32 MB 64 MB 128 MB -mx7 : Maximum + 64 MB 64 MB 256 MB -mx8 + 64 MB 64 MB 256 MB -mx9 : Ultra + The default dictionary size values for 32-bit versions of LZMA/LZMA2 don't exceed 64 MB. +- If an archive update operation uses a temporary archive folder and + the archive is moved to the destination folder, 7-Zip shows the progress of moving + the archive file, as this operation can take a long time if the archive is large. +- Some bugs were fixed. + + +24.07 2024-06-19 +------------------------- +- Changes in files: + Asm/x86/Sha256Opt.asm + Now it uses "READONLY" flag for constant array segment. + It fixes an issue where ".rodata" section in 7-Zip for x86/x64 Linux had a "WRITE" attribute. + + +24.05 2024-05-14 +------------------------- +- New switch -myv={MMNN} to set decoder compatibility version for 7z archive creating. + {MMNN} is 4-digit number that represents the version of 7-Zip without a dot. + If -myv={MMNN} switch is specified, 7-Zip will only use compression methods that can + be decoded by the specified version {MMNN} of 7-Zip and newer versions. + If -myv={MMNN} switch is not specified, -myv=2300 is used, and 7-Zip will only + use compression methods that can be decoded by 7-Zip 23.00 and newer versions. 
+- New switch -myfa={FilterID} to allow 7-Zip to use the specified filter method for 7z archive creating. +- New switch -myfd={FilterID} to disallow 7-Zip to use the specified filter method for 7z archive creating. + + +24.03 2024-03-23 +------------------------- +- 7-Zip now can use new RISCV filter for compression to 7z and xz archives. + RISCV filter can increase compression ratio for data containing executable + files compiled for RISC-V architecture. +- The speed for LZMA and LZMA2 decompression in ARM64 version for Windows + was increased by 20%-60%. + It uses arm64 assembler code, and clang-cl is required for arm64 assembler code compiling. +- -slmu switch : to show timestamps as UTC instead of LOCAL TIME. +- -slsl switch : in console 7-Zip for Windows : to show file paths with + linux path separator slash '/' instead of backslash separator '\'. +- 7-Zip supports .sha256 files that use backslash path separator '\'. +- Some bugs were fixed. + + +24.01 2024-01-31 +------------------------- +- 7-Zip uses file C/Precomp.h that is included to all c and c++ files. + CPP/Common/Common.h also includes C/Precomp.h. + C/Precomp.h defines the following macros (if _WIN32 is defined): + Z7_LARGE_PAGES 1 + Z7_LONG_PATH 1 + Z7_WIN32_WINNT_MIN 0x0500 (or higher) + _WIN32_WINNT 0x0500 (or higher) + WINVER _WIN32_WINNT + UNICODE 1 + _UNICODE 1 + if _WIN32_WINNT is defined already, C/Precomp.h doesn't redefine it. + +- Speed optimizations for hash caclulation: CRC-32, CRC-64. +- The bug was fixed: 7-Zip for Linux could fail for multivolume creation in some cases. +- 7zr.exe for arm64 is included to LZMA SDK package. +- Some bugs were fixed. + + +23.01 2023-06-20 +------------------------- +- 7-Zip now can use new ARM64 filter for compression to 7z and xz archives. + ARM64 filter can increase compression ratio for data containing executable + files compiled for ARM64 (AArch64) architecture. + Also 7-Zip now parses executable files (that have exe and dll filename extensions) + before compressing, and it selects appropriate filter for each parsed file: + - BCJ or BCJ2 filter for x86 executable files, + - ARM64 filter for ARM64 executable files. + Previous versions by default used x86 filter BCJ or BCJ2 for all exe/dll files. +- Default section size for BCJ2 filter was changed from 64 MiB to 240 MiB. + It can increase compression ratio for executable files larger than 64 MiB. +- Some optimizations in filters code: BCJ, BCJ2, Swap* and opthers. +- If 7-Zip uses BCJ2 filter for big datasets compressing, it can use additional temp + files in system's TEMP folder. 7-Zip uses temp file for additional compressed + data stream, if size of such compressed stream is larger than predefined limit: + 16 MiB in 32-bit version, 4 GiB in 64-bit version. +- When new 7-Zip creates multivolume archive, 7-Zip keeps in open state + only volumes that still can be changed. Previous versions kept all volumes + in open state until the end of the archive creation. +- 7-Zip for Linux and macOS now can reduce the number of simultaneously open files, + when 7-Zip opens, extracts or creates multivolume archive. It allows to avoid + the failures for cases with big number of volumes, bacause there is a limitation + for number of open files allowed for a single program in Linux and macOS. +- Some bugs were fixed. +- Source code changes: +- All external macros for compiling C/C++ code of 7-Zip now have Z7_ prefix. +- 7-Zip COM interfaces now use new macros that allow to declare and implement COM interface. 
+- The code has been modified to compile with the maximum diagnostic warning level: + -Wall in MSVC and -Weverything in CLANG. + And some warning types are disabled in 2 files: + - C/Compiler.h for C/C++ code warnings. + - CPP/Common/Common.h for C++ code warnings. +- Linux/macOS versions of 7-Zip: IUnknown interface in new code doesn't use + virtual destructor that was used in previous 7-Zip and p7zip: + // virtual ~IUnknown() {} + So 7-Zip's dynamically linked shared libraries (codecs) are not compatible + between new 7-Zip for Linux/macOS and old 7-Zip (and p7zip). + + +21.07 2021-12-26 +------------------------- +- New switches: -spm and -im!{file_path} to exclude directories from processing + for specified paths that don't contain path separator character at the end of path. +- The sorting order of files in archives was slightly changed to be more consistent + for cases where the name of some directory is the same as the prefix part of the name + of another directory or file. + + +21.06 2021-11-24 +------------------------- +- Bug in LZMA encoder in file LzmaEnc.c was fixed: + LzmaEnc_MemEncode(), LzmaEncode() and LzmaCompress() could work incorrectly, + if size value for output buffer is smaller than size required for all compressed data. + LzmaEnc_Encode() could work incorrectly, + if callback ISeqOutStream::Write() doesn't write all compressed data. + NCompress::NLzma::CEncoder::Code() could work incorrectly, + if callback ISequentialOutStream::Write() returns error code. +- Bug in versions 21.00-21.05 was fixed: + 7-Zip didn't set attributes of directories during archive extracting. + + +21.04 beta 2021-11-02 +------------------------- +- 7-Zip now reduces the number of working CPU threads for compression, + if RAM size is not enough for compression with big LZMA2 dictionary. +- 7-Zip now can create and check "file.sha256" text files that contain the list + of file names and SHA-256 checksums in format compatible with sha256sum program. + + +21.03 beta 2021-07-20 +------------------------- +- The maximum dictionary size for LZMA/LZMA2 compressing was increased to 4 GB (3840 MiB). +- Minor speed optimizations in LZMA/LZMA2 compressing. + + +21.02 alpha 2021-05-06 +------------------------- +- The command line version of 7-Zip for macOS was released. +- The speed for LZMA and LZMA2 decompression in arm64 versions for macOS and Linux + was increased by 20%-60%. + + +21.01 alpha 2021-03-09 +------------------------- +- The command line version of 7-Zip for Linux was released. +- The improvements for speed of ARM64 version using hardware CPU instructions + for AES, CRC-32, SHA-1 and SHA-256. +- Some bugs were fixed. + + +20.02 alpha 2020-08-08 +------------------------- +- The default number of LZMA2 chunks per solid block in 7z archive was increased to 64. + It allows to increase the compression speed for big 7z archives, if there is a big number + of CPU cores and threads. +- The speed of PPMd compressing/decompressing was increased for 7z archives. +- The new -ssp switch. If the switch -ssp is specified, 7-Zip doesn't allow the system + to modify "Last Access Time" property of source files for archiving and hashing operations. +- Some bugs were fixed. + + +20.00 alpha 2020-02-06 +------------------------- +- 7-Zip now supports new optional match finders for LZMA/LZMA2 compression: bt5 and hc5, + that can work faster than bt4 and hc4 match finders for the data with big redundancy. 
+- The compression ratio was improved for Fast and Fastest compression levels with the + following default settings: + - Fastest level (-mx1) : hc5 match finder with 256 KB dictionary. + - Fast level (-mx3) : hc5 match finder with 4 MB dictionary. +- Minor speed optimizations in multithreaded LZMA/LZMA2 compression for Normal/Maximum/Ultra + compression levels. + + +19.00 2019-02-21 +------------------------- +- Encryption strength for 7z archives was increased: + the size of random initialization vector was increased from 64-bit to 128-bit, + and the pseudo-random number generator was improved. +- The bug in 7zIn.c code was fixed. + + +18.06 2018-12-30 +------------------------- +- The speed for LZMA/LZMA2 compressing was increased by 3-10%, + and there are minor changes in compression ratio. +- Some bugs were fixed. +- The bug in 7-Zip 18.02-18.05 was fixed: + There was memory leak in multithreading xz decoder - XzDecMt_Decode(), + if xz stream contains only one block. +- The changes for MSVS compiler makefiles: + - the makefiles now use "PLATFORM" macroname with values (x64, x86, arm64) + instead of "CPU" macroname with values (AMD64, ARM64). + - the makefiles by default now use static version of the run-time library. + + +18.05 2018-04-30 +------------------------- +- The speed for LZMA/LZMA2 compressing was increased + by 8% for fastest/fast compression levels and + by 3% for normal/maximum compression levels. +- Previous versions of 7-Zip could work incorrectly in "Large memory pages" mode in + Windows 10 because of some BUG with "Large Pages" in Windows 10. + Now 7-Zip doesn't use "Large Pages" on Windows 10 up to revision 1709 (16299). +- The BUG was fixed in Lzma2Enc.c + Lzma2Enc_Encode2() function worked incorretly, + if (inStream == NULL) and the number of block threads is more than 1. + + +18.03 beta 2018-03-04 +------------------------- +- Asm\x86\LzmaDecOpt.asm: new optimized LZMA decoder written in asm + for x64 with about 30% higher speed than main version of LZMA decoder written in C. +- The speed for single-thread LZMA/LZMA2 decoder written in C was increased by 3%. +- 7-Zip now can use multi-threading for 7z/LZMA2 decoding, + if there are multiple independent data chunks in LZMA2 stream. +- 7-Zip now can use multi-threading for xz decoding, + if there are multiple blocks in xz stream. + + +18.01 2019-01-28 +------------------------- +- The BUG in 17.01 - 18.00 beta was fixed: + XzDec.c : random block unpacking and XzUnpacker_IsBlockFinished() + didn't work correctly for xz archives without checksum (CRC). + + +18.00 beta 2019-01-10 +------------------------- +- The BUG in xz encoder was fixed: + There was memory leak of 16 KB for each file compressed with + xz compression method, if additional filter was used. + + +17.01 beta 2017-08-28 +------------------------- +- Minor speed optimization for LZMA2 (xz and 7z) multi-threading compression. + 7-Zip now uses additional memory buffers for multi-block LZMA2 compression. + CPU utilization was slightly improved. +- 7-zip now creates multi-block xz archives by default. Block size can be + specified with -ms[Size]{m|g} switch. +- xz decoder now can unpack random block from multi-block xz archives. +- 7-Zip command line: @listfile now doesn't work after -- switch. + Use -i@listfile before -- switch instead. +- The BUGs were fixed: + 7-Zip 17.00 beta crashed for commands that write anti-item to 7z archive. 
+ + +17.00 beta 2017-04-29 +------------------------- +- NewHandler.h / NewHandler.cpp: + now it redefines operator new() only for old MSVC compilers (_MSC_VER < 1900). +- C/7zTypes.h : the names of variables in interface structures were changed (vt). +- Some bugs were fixed. 7-Zip could crash in some cases. +- Some internal changes in code. + + +16.04 2016-10-04 +------------------------- +- The bug was fixed in DllSecur.c. + + +16.03 2016-09-28 +------------------------- +- SFX modules now use some protection against DLL preloading attack. +- Some bugs in 7z code were fixed. + + +16.02 2016-05-21 +------------------------- +- The BUG in 16.00 - 16.01 was fixed: + Split Handler (SplitHandler.cpp) returned incorrect + total size value (kpidSize) for split archives. + + +16.01 2016-05-19 +------------------------- +- Some internal changes to reduce the number of compiler warnings. + + +16.00 2016-05-10 +------------------------- +- Some bugs were fixed. + + +15.12 2015-11-19 +------------------------- +- The BUG in C version of 7z decoder was fixed: + 7zDec.c : SzDecodeLzma2() + 7z decoder could mistakenly report about decoding error for some 7z archives + that use LZMA2 compression method. + The probability to get that mistaken decoding error report was about + one error per 16384 solid blocks for solid blocks larger than 16 KB (compressed size). +- The BUG (in 9.26-15.11) in C version of 7z decoder was fixed: + 7zArcIn.c : SzReadHeader2() + 7z decoder worked incorrectly for 7z archives that contain + empty solid blocks, that can be placed to 7z archive, if some file is + unavailable for reading during archive creation. + + +15.09 beta 2015-10-16 +------------------------- +- The BUG in LZMA / LZMA2 encoding code was fixed. + The BUG in LzFind.c::MatchFinder_ReadBlock() function. + If input data size is larger than (4 GiB - dictionary_size), + the following code worked incorrectly: + - LZMA : LzmaEnc_MemEncode(), LzmaEncode() : LZMA encoding functions + for compressing from memory to memory. + That BUG is not related to LZMA encoder version that works via streams. + - LZMA2 : multi-threaded version of LZMA2 encoder worked incorrectly, if + default value of chunk size (CLzma2EncProps::blockSize) is changed + to value larger than (4 GiB - dictionary_size). + + +9.38 beta 2015-01-03 +------------------------- +- The BUG in 9.31-9.37 was fixed: + IArchiveGetRawProps interface was disabled for 7z archives. +- The BUG in 9.26-9.36 was fixed: + Some code in CPP\7zip\Archive\7z\ worked correctly only under Windows. + + +9.36 beta 2014-12-26 +------------------------- +- The BUG in command line version was fixed: + 7-Zip created temporary archive in current folder during update archive + operation, if -w{Path} switch was not specified. + The fixed 7-Zip creates temporary archive in folder that contains updated archive. +- The BUG in 9.33-9.35 was fixed: + 7-Zip silently ignored file reading errors during 7z or gz archive creation, + and the created archive contained only part of file that was read before error. + The fixed 7-Zip stops archive creation and it reports about error. + + +9.35 beta 2014-12-07 +------------------------- +- 7zr.exe now support AES encryption. +- SFX mudules were added to LZMA SDK +- Some bugs were fixed. + + +9.21 beta 2011-04-11 +------------------------- +- New class FString for file names at file systems. +- Speed optimization in CRC code for big-endian CPUs. +- The BUG in Lzma2Dec.c was fixed: + Lzma2Decode function didn't work. 
+
+
+9.18 beta      2010-11-02
+-------------------------
+- New small SFX module for installers (SfxSetup).
+
+
+9.12 beta      2010-03-24
+-------------------------
+- The BUG in LZMA SDK 9.* was fixed: LZMA2 codec didn't work,
+  if more than 10 threads were used (or more than 20 threads in some modes).
+
+
+9.11 beta      2010-03-15
+-------------------------
+- PPMd compression method support
+
+
+9.09           2009-12-12
+-------------------------
+- The bug was fixed:
+  Utf16_To_Utf8 functions in UTFConvert.cpp and 7zMain.c
+  incorrectly converted surrogate characters (the code >= 0x10000) to UTF-8.
+- Some bugs were fixed
+
+
+9.06           2009-08-17
+-------------------------
+- Some changes in ANSI-C 7z Decoder interfaces.
+
+
+9.04           2009-05-30
+-------------------------
+- LZMA2 compression method support
+- xz format support
+
+
+4.65           2009-02-03
+-------------------------
+- Some minor fixes
+
+
+4.63           2008-12-31
+-------------------------
+- Some minor fixes
+
+
+4.61 beta      2008-11-23
+-------------------------
+- The bug in ANSI-C LZMA Decoder was fixed:
+  If encoded stream was corrupted, decoder could access memory
+  outside of allocated range.
+- Some changes in ANSI-C 7z Decoder interfaces.
+- LZMA SDK is placed in the public domain.
+
+
+4.60 beta      2008-08-19
+-------------------------
+- Some minor fixes.
+
+
+4.59 beta      2008-08-13
+-------------------------
+- The bug was fixed:
+  LZMA Encoder in fast compression mode could access memory outside of
+  allocated range in some rare cases.
+
+
+4.58 beta      2008-05-05
+-------------------------
+- ANSI-C LZMA Decoder was rewritten for speed optimizations.
+- ANSI-C LZMA Encoder was included to LZMA SDK.
+- C++ LZMA code now is just wrapper over ANSI-C code.
+
+
+4.57           2007-12-12
+-------------------------
+- Speed optimizations in C++ LZMA Decoder.
+- Small changes for more compatibility with some C/C++ compilers.
+
+
+4.49 beta      2007-07-05
+-------------------------
+- .7z ANSI-C Decoder:
+  - now it supports BCJ and BCJ2 filters
+  - now it supports files larger than 4 GB.
+  - now it supports "Last Write Time" field for files.
+- C++ code for .7z archives compressing/decompressing from 7-zip
+  was included to LZMA SDK.
+
+
+4.43           2006-06-04
+-------------------------
+- Small changes for more compatibility with some C/C++ compilers.
+
+
+4.42           2006-05-15
+-------------------------
+- Small changes in .h files in ANSI-C version.
+
+
+4.39 beta      2006-04-14
+-------------------------
+- The bug in versions 4.33b:4.38b was fixed:
+  C++ version of LZMA encoder could not correctly compress
+  files larger than 2 GB with HC4 match finder (-mfhc4).
+
+
+4.37 beta      2005-04-06
+-------------------------
+- Fixes in C++ code: code could not be compiled if _NO_EXCEPTIONS was defined.
+
+
+4.35 beta      2005-03-02
+-------------------------
+- The bug was fixed in C++ version of LZMA Decoder:
+  If encoded stream was corrupted, decoder could access memory
+  outside of allocated range.
+
+
+4.34 beta      2006-02-27
+-------------------------
+- Compressing speed and memory requirements for compressing were increased
+- LZMA now can use only these match finders: HC4, BT2, BT3, BT4
+
+
+4.32           2005-12-09
+-------------------------
+- Java version of LZMA SDK was included
+
+
+4.30           2005-11-20
+-------------------------
+- Compression ratio was improved in -a2 mode
+- Speed optimizations for compressing in -a2 mode
+- -fb switch now supports values up to 273
+- The bug in 7z_C (7zIn.c) was fixed:
+  It used Alloc/Free functions from different memory pools.
+ So if program used two memory pools, it worked incorrectly. +- 7z_C: .7z format supporting was improved +- LZMA# SDK (C#.NET version) was included + + +4.27 (Updated) 2005-09-21 +------------------------- +- Some GUIDs/interfaces in C++ were changed. + IStream.h: + ISequentialInStream::Read now works as old ReadPart + ISequentialOutStream::Write now works as old WritePart + + +4.27 2005-08-07 +------------------------- +- The bug in LzmaDecodeSize.c was fixed: + if _LZMA_IN_CB and _LZMA_OUT_READ were defined, + decompressing worked incorrectly. + + +4.26 2005-08-05 +------------------------- +- Fixes in 7z_C code and LzmaTest.c: + previous versions could work incorrectly, + if malloc(0) returns 0 + + +4.23 2005-06-29 +------------------------- +- Small fixes in C++ code + + +4.22 2005-06-10 +------------------------- +- Small fixes + + +4.21 2005-06-08 +------------------------- +- Interfaces for ANSI-C LZMA Decoder (LzmaDecode.c) were changed +- New additional version of ANSI-C LZMA Decoder with zlib-like interface: + - LzmaStateDecode.h + - LzmaStateDecode.c + - LzmaStateTest.c +- ANSI-C LZMA Decoder now can decompress files larger than 4 GB + + +4.17 2005-04-18 +------------------------- +- New example for RAM->RAM compressing/decompressing: + LZMA + BCJ (filter for x86 code): + - LzmaRam.h + - LzmaRam.cpp + - LzmaRamDecode.h + - LzmaRamDecode.c + - -f86 switch for lzma.exe + + +4.16 2005-03-29 +------------------------- +- The bug was fixed in LzmaDecode.c (ANSI-C LZMA Decoder): + If _LZMA_OUT_READ was defined, and if encoded stream was corrupted, + decoder could access memory outside of allocated range. +- Speed optimization of ANSI-C LZMA Decoder (now it's about 20% faster). + Old version of LZMA Decoder now is in file LzmaDecodeSize.c. + LzmaDecodeSize.c can provide slightly smaller code than LzmaDecode.c +- Small speed optimization in LZMA C++ code +- filter for SPARC's code was added +- Simplified version of .7z ANSI-C Decoder was included + + +4.06 2004-09-05 +------------------------- +- The bug in v4.05 was fixed: + LZMA-Encoder didn't release output stream in some cases. + + +4.05 2004-08-25 +------------------------- +- Source code of filters for x86, IA-64, ARM, ARM-Thumb + and PowerPC code was included to SDK +- Some internal minor changes + + +4.04 2004-07-28 +------------------------- +- More compatibility with some C++ compilers + + +4.03 2004-06-18 +------------------------- +- "Benchmark" command was added. It measures compressing + and decompressing speed and shows rating values. + Also it checks hardware errors. + + +4.02 2004-06-10 +------------------------- +- C++ LZMA Encoder/Decoder code now is more portable + and it can be compiled by GCC on Linux. + + +4.01 2004-02-15 +------------------------- +- Some detection of data corruption was enabled. + LzmaDecode.c / RangeDecoderReadByte + ..... 
+ { + rd->ExtraBytes = 1; + return 0xFF; + } + + +4.00 2004-02-13 +------------------------- +- Original version of LZMA SDK + + + +HISTORY of the LZMA +------------------- + 2001-2008: Improvements to LZMA compressing/decompressing code, + keeping compatibility with original LZMA format + 1996-2001: Development of LZMA compression format + + Some milestones: + + 2001-08-30: LZMA compression was added to 7-Zip + 1999-01-02: First version of 7-Zip was released + + +End of document diff --git a/src/sdk/DOC/lzma-sdk.txt b/src/sdk/DOC/lzma-sdk.txt index b0e14a2..57279bf 100644 --- a/src/sdk/DOC/lzma-sdk.txt +++ b/src/sdk/DOC/lzma-sdk.txt @@ -1,357 +1,437 @@ -LZMA SDK 19.00 --------------- - -LZMA SDK provides the documentation, samples, header files, -libraries, and tools you need to develop applications that -use 7z / LZMA / LZMA2 / XZ compression. - -LZMA is an improved version of famous LZ77 compression algorithm. -It was improved in way of maximum increasing of compression ratio, -keeping high decompression speed and low memory requirements for -decompressing. - -LZMA2 is a LZMA based compression method. LZMA2 provides better -multithreading support for compression than LZMA and some other improvements. - -7z is a file format for data compression and file archiving. -7z is a main file format for 7-Zip compression program (www.7-zip.org). -7z format supports different compression methods: LZMA, LZMA2 and others. -7z also supports AES-256 based encryption. - -XZ is a file format for data compression that uses LZMA2 compression. -XZ format provides additional features: SHA/CRC check, filters for -improved compression ratio, splitting to blocks and streams, - - - -LICENSE -------- - -LZMA SDK is written and placed in the public domain by Igor Pavlov. - -Some code in LZMA SDK is based on public domain code from another developers: - 1) PPMd var.H (2001): Dmitry Shkarin - 2) SHA-256: Wei Dai (Crypto++ library) - -Anyone is free to copy, modify, publish, use, compile, sell, or distribute the -original LZMA SDK code, either in source code form or as a compiled binary, for -any purpose, commercial or non-commercial, and by any means. - -LZMA SDK code is compatible with open source licenses, for example, you can -include it to GNU GPL or GNU LGPL code. - - -LZMA SDK Contents ------------------ - - Source code: - - - C / C++ / C# / Java - LZMA compression and decompression - - C / C++ - LZMA2 compression and decompression - - C / C++ - XZ compression and decompression - - C - 7z decompression - - C++ - 7z compression and decompression - - C - small SFXs for installers (7z decompression) - - C++ - SFXs and SFXs for installers (7z decompression) - - Precomiled binaries: - - - console programs for lzma / 7z / xz compression and decompression - - SFX modules for installers. - - -UNIX/Linux version ------------------- -To compile C++ version of file->file LZMA encoding, go to directory -CPP/7zip/Bundles/LzmaCon -and call make to recompile it: - make -f makefile.gcc clean all - -In some UNIX/Linux versions you must compile LZMA with static libraries. 
-To compile with static libraries, you can use -LIB = -lm -static - -Also you can use p7zip (port of 7-Zip for POSIX systems like Unix or Linux): - - http://p7zip.sourceforge.net/ - - -Files ------ - -DOC/7zC.txt - 7z ANSI-C Decoder description -DOC/7zFormat.txt - 7z Format description -DOC/installer.txt - information about 7-Zip for installers -DOC/lzma.txt - LZMA compression description -DOC/lzma-sdk.txt - LZMA SDK description (this file) -DOC/lzma-history.txt - history of LZMA SDK -DOC/lzma-specification.txt - Specification of LZMA -DOC/Methods.txt - Compression method IDs for .7z - -bin/installer/ - example script to create installer that uses SFX module, - -bin/7zdec.exe - simplified 7z archive decoder -bin/7zr.exe - 7-Zip console program (reduced version) -bin/x64/7zr.exe - 7-Zip console program (reduced version) (x64 version) -bin/lzma.exe - file->file LZMA encoder/decoder for Windows -bin/7zS2.sfx - small SFX module for installers (GUI version) -bin/7zS2con.sfx - small SFX module for installers (Console version) -bin/7zSD.sfx - SFX module for installers. - - -7zDec.exe ---------- -7zDec.exe is simplified 7z archive decoder. -It supports only LZMA, LZMA2, and PPMd methods. -7zDec decodes whole solid block from 7z archive to RAM. -The RAM consumption can be high. - - - - -Source code structure ---------------------- - - -Asm/ - asm files (optimized code for CRC calculation and Intel-AES encryption) - -C/ - C files (compression / decompression and other) - Util/ - 7z - 7z decoder program (decoding 7z files) - Lzma - LZMA program (file->file LZMA encoder/decoder). - LzmaLib - LZMA library (.DLL for Windows) - SfxSetup - small SFX module for installers - -CPP/ -- CPP files - - Common - common files for C++ projects - Windows - common files for Windows related code - - 7zip - files related to 7-Zip - - Archive - files related to archiving - - Common - common files for archive handling - 7z - 7z C++ Encoder/Decoder - - Bundles - Modules that are bundles of other modules (files) - - Alone7z - 7zr.exe: Standalone 7-Zip console program (reduced version) - Format7zExtractR - 7zxr.dll: Reduced version of 7z DLL: extracting from 7z/LZMA/BCJ/BCJ2. 
- Format7zR - 7zr.dll: Reduced version of 7z DLL: extracting/compressing to 7z/LZMA/BCJ/BCJ2 - LzmaCon - lzma.exe: LZMA compression/decompression - LzmaSpec - example code for LZMA Specification - SFXCon - 7zCon.sfx: Console 7z SFX module - SFXSetup - 7zS.sfx: 7z SFX module for installers - SFXWin - 7z.sfx: GUI 7z SFX module - - Common - common files for 7-Zip - - Compress - files for compression/decompression - - Crypto - files for encryption / decompression - - UI - User Interface files - - Client7z - Test application for 7za.dll, 7zr.dll, 7zxr.dll - Common - Common UI files - Console - Code for console program (7z.exe) - Explorer - Some code from 7-Zip Shell extension - FileManager - Some GUI code from 7-Zip File Manager - GUI - Some GUI code from 7-Zip - - -CS/ - C# files - 7zip - Common - some common files for 7-Zip - Compress - files related to compression/decompression - LZ - files related to LZ (Lempel-Ziv) compression algorithm - LZMA - LZMA compression/decompression - LzmaAlone - file->file LZMA compression/decompression - RangeCoder - Range Coder (special code of compression/decompression) - -Java/ - Java files - SevenZip - Compression - files related to compression/decompression - LZ - files related to LZ (Lempel-Ziv) compression algorithm - LZMA - LZMA compression/decompression - RangeCoder - Range Coder (special code of compression/decompression) - - -Note: - Asm / C / C++ source code of LZMA SDK is part of 7-Zip's source code. - 7-Zip's source code can be downloaded from 7-Zip's SourceForge page: - - http://sourceforge.net/projects/sevenzip/ - - - -LZMA features -------------- - - Variable dictionary size (up to 1 GB) - - Estimated compressing speed: about 2 MB/s on 2 GHz CPU - - Estimated decompressing speed: - - 20-30 MB/s on modern 2 GHz cpu - - 1-2 MB/s on 200 MHz simple RISC cpu: (ARM, MIPS, PowerPC) - - Small memory requirements for decompressing (16 KB + DictionarySize) - - Small code size for decompressing: 5-8 KB - -LZMA decoder uses only integer operations and can be -implemented in any modern 32-bit CPU (or on 16-bit CPU with some conditions). - -Some critical operations that affect the speed of LZMA decompression: - 1) 32*16 bit integer multiply - 2) Mispredicted branches (penalty mostly depends from pipeline length) - 3) 32-bit shift and arithmetic operations - -The speed of LZMA decompressing mostly depends from CPU speed. -Memory speed has no big meaning. But if your CPU has small data cache, -overall weight of memory speed will slightly increase. - - -How To Use ----------- - -Using LZMA encoder/decoder executable --------------------------------------- - -Usage: LZMA inputFile outputFile [...] - - e: encode file - - d: decode file - - b: Benchmark. There are two tests: compressing and decompressing - with LZMA method. Benchmark shows rating in MIPS (million - instructions per second). Rating value is calculated from - measured speed and it is normalized with Intel's Core 2 results. - Also Benchmark checks possible hardware errors (RAM - errors in most cases). Benchmark uses these settings: - (-a1, -d21, -fb32, -mfbt4). You can change only -d parameter. - Also you can change the number of iterations. Example for 30 iterations: - LZMA b 30 - Default number of iterations is 10. - - - - - -a{N}: set compression mode 0 = fast, 1 = normal - default: 1 (normal) - - d{N}: Sets Dictionary size - [0, 30], default: 23 (8MB) - The maximum value for dictionary size is 1 GB = 2^30 bytes. - Dictionary size is calculated as DictionarySize = 2^N bytes. 
- For decompressing file compressed by LZMA method with dictionary - size D = 2^N you need about D bytes of memory (RAM). - - -fb{N}: set number of fast bytes - [5, 273], default: 128 - Usually big number gives a little bit better compression ratio - and slower compression process. - - -lc{N}: set number of literal context bits - [0, 8], default: 3 - Sometimes lc=4 gives gain for big files. - - -lp{N}: set number of literal pos bits - [0, 4], default: 0 - lp switch is intended for periodical data when period is - equal 2^N. For example, for 32-bit (4 bytes) - periodical data you can use lp=2. Often it's better to set lc0, - if you change lp switch. - - -pb{N}: set number of pos bits - [0, 4], default: 2 - pb switch is intended for periodical data - when period is equal 2^N. - - -mf{MF_ID}: set Match Finder. Default: bt4. - Algorithms from hc* group doesn't provide good compression - ratio, but they often works pretty fast in combination with - fast mode (-a0). - - Memory requirements depend from dictionary size - (parameter "d" in table below). - - MF_ID Memory Description - - bt2 d * 9.5 + 4MB Binary Tree with 2 bytes hashing. - bt3 d * 11.5 + 4MB Binary Tree with 3 bytes hashing. - bt4 d * 11.5 + 4MB Binary Tree with 4 bytes hashing. - hc4 d * 7.5 + 4MB Hash Chain with 4 bytes hashing. - - -eos: write End Of Stream marker. By default LZMA doesn't write - eos marker, since LZMA decoder knows uncompressed size - stored in .lzma file header. - - -si: Read data from stdin (it will write End Of Stream marker). - -so: Write data to stdout - - -Examples: - -1) LZMA e file.bin file.lzma -d16 -lc0 - -compresses file.bin to file.lzma with 64 KB dictionary (2^16=64K) -and 0 literal context bits. -lc0 allows to reduce memory requirements -for decompression. - - -2) LZMA e file.bin file.lzma -lc0 -lp2 - -compresses file.bin to file.lzma with settings suitable -for 32-bit periodical data (for example, ARM or MIPS code). - -3) LZMA d file.lzma file.bin - -decompresses file.lzma to file.bin. - - -Compression ratio hints ------------------------ - -Recommendations ---------------- - -To increase the compression ratio for LZMA compressing it's desirable -to have aligned data (if it's possible) and also it's desirable to locate -data in such order, where code is grouped in one place and data is -grouped in other place (it's better than such mixing: code, data, code, -data, ...). - - -Filters -------- -You can increase the compression ratio for some data types, using -special filters before compressing. For example, it's possible to -increase the compression ratio on 5-10% for code for those CPU ISAs: -x86, IA-64, ARM, ARM-Thumb, PowerPC, SPARC. - -You can find C source code of such filters in C/Bra*.* files - -You can check the compression ratio gain of these filters with such -7-Zip commands (example for ARM code): -No filter: - 7z a a1.7z a.bin -m0=lzma - -With filter for little-endian ARM code: - 7z a a2.7z a.bin -m0=arm -m1=lzma - -It works in such manner: -Compressing = Filter_encoding + LZMA_encoding -Decompressing = LZMA_decoding + Filter_decoding - -Compressing and decompressing speed of such filters is very high, -so it will not increase decompressing time too much. -Moreover, it reduces decompression time for LZMA_decoding, -since compression ratio with filtering is higher. - -These filters convert CALL (calling procedure) instructions -from relative offsets to absolute addresses, so such data becomes more -compressible. 
-
-For some ISAs (for example, for MIPS) it's impossible to get gain from such filter.
-
-
-
----
-
-http://www.7-zip.org
-http://www.7-zip.org/sdk.html
-http://www.7-zip.org/support.html
+LZMA SDK 25.01
+--------------
+
+LZMA SDK provides the documentation, samples, header files,
+libraries, and tools you need to develop applications that
+use 7z / LZMA / LZMA2 / XZ compression.
+
+LZMA is an improved version of famous LZ77 compression algorithm.
+It was improved in way of maximum increasing of compression ratio,
+keeping high decompression speed and low memory requirements for
+decompressing.
+
+LZMA2 is a LZMA based compression method. LZMA2 provides better
+multithreading support for compression than LZMA and some other improvements.
+
+7z is a file format for data compression and file archiving.
+7z is a main file format for 7-Zip compression program (www.7-zip.org).
+7z format supports different compression methods: LZMA, LZMA2 and others.
+7z also supports AES-256 based encryption.
+
+XZ is a file format for data compression that uses LZMA2 compression.
+XZ format provides additional features: SHA/CRC check, filters for
+improved compression ratio, splitting to blocks and streams,
+
+
+
+LICENSE
+-------
+
+LZMA SDK is written and placed in the public domain by Igor Pavlov.
+
+Some code in LZMA SDK is based on public domain code from another developers:
+  1) PPMd var.H (2001): Dmitry Shkarin
+  2) SHA-256: Wei Dai (Crypto++ library)
+
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute the
+original LZMA SDK code, either in source code form or as a compiled binary, for
+any purpose, commercial or non-commercial, and by any means.
+
+LZMA SDK code is compatible with open source licenses, for example, you can
+include it to GNU GPL or GNU LGPL code.
+
+
+LZMA SDK Contents
+-----------------
+
+  Source code:
+
+    - C / C++ / C# / Java   - LZMA compression and decompression
+    - C / C++               - LZMA2 compression and decompression
+    - C / C++               - XZ compression and decompression
+    - C                     - 7z decompression
+    - C++                   - 7z compression and decompression
+    - C                     - small SFXs for installers (7z decompression)
+    - C++                   - SFXs and SFXs for installers (7z decompression)
+
+  Precompiled binaries:
+
+    - console programs for lzma / 7z / xz compression and decompression
+    - SFX modules for installers.
+
+How to compile with makefile in Windows
+---------------------------------------
+
+Some macronames can be defined for compiling with makefile:
+
+PLATFORM
+  with possible values: x64, x86, arm64, arm, ia64
+
+OLD_COMPILER
+  for old VC compiler, like MSVC 6.0.
+
+MY_DYNAMIC_LINK
+  for dynamic linking to the run-time library (msvcrt.dll).
+  The default makefile option is static linking to the run-time library.
+
+To compile 7zr.exe file for x64 with Visual Studio 2022,
+use the following command sequence:
+
+  cd SRC\CPP\7zip\Bundles\Alone7z\
+  %comspec% /k "C:\Program Files\VS2022\VC\Auxiliary\Build\vcvars64.bat"
+  nmake
+
+You can use other "vcvars*.bat" files from the "VS2022\VC\Auxiliary\Build" directory
+to compile for other platforms:
+  vcvars64.bat
+  vcvarsamd64_arm64.bat
+  vcvarsamd64_x86.bat
+
+
+UNIX/Linux version
+------------------
+There are several options to compile 7-Zip with different compilers: gcc and clang.
+Also 7-Zip code contains two versions for some critical parts of code: in C and in Assembler.
+So if you compile the version with Assembler code, you will get a faster 7-Zip binary.
+
+7-Zip's assembler code uses the following syntax for different platforms:
+
+1) x86 and x86-64 (AMD64): MASM syntax.
+   There are 2 programs that support MASM syntax in Linux:
+   Asmc Macro Assembler and JWasm. But JWasm now doesn't support some
+   cpu instructions used in 7-Zip.
+   So you must install Asmc Macro Assembler in Linux, if you want to compile the fastest version
+   of 7-Zip x86 and x86-64:
+   https://github.com/nidud/asmc
+
+2) arm64: GNU assembler for ARM64 with preprocessor.
+   The syntax of that arm64 assembler code in 7-Zip is supported by GCC and CLANG for ARM64.
+
+There are different binaries that can be compiled from 7-Zip source.
+There are 2 main files in folder for compiling:
+  makefile      - that can be used for compiling Windows version of 7-Zip with nmake command
+  makefile.gcc  - that can be used for compiling Linux/macOS versions of 7-Zip with make command
+
+At first you must change the current folder to folder that contains `makefile.gcc`:
+
+  cd CPP/7zip/Bundles/Alone7z
+
+Then you can compile `makefile.gcc` with the command:
+
+  make -j -f makefile.gcc
+
+Also there are additional "*.mak" files in folder "CPP/7zip/" that can be used to compile
+7-Zip binaries with optimized code and optimizing options.
+
+To compile with GCC without assembler:
+  cd CPP/7zip/Bundles/Alone7z
+  make -j -f ../../cmpl_gcc.mak
+
+To compile with CLANG without assembler:
+  make -j -f ../../cmpl_clang.mak
+
+To compile 7-Zip for x86-64 with asmc assembler:
+  make -j -f ../../cmpl_gcc_x64.mak
+
+To compile 7-Zip for arm64 with assembler:
+  make -j -f ../../cmpl_gcc_arm64.mak
+
+To compile 7-Zip for arm64 for macOS:
+  make -j -f ../../cmpl_mac_arm64.mak
+
+Also you can change some compiler options in the mak files:
+  cmpl_gcc.mak
+  var_gcc.mak
+  warn_gcc.mak
+
+
+
+Also you can use p7zip (port of 7-Zip for POSIX systems like Unix or Linux):
+
+  http://p7zip.sourceforge.net/
+
+
+Files
+-----
+
+DOC/7zC.txt          - 7z ANSI-C Decoder description
+DOC/7zFormat.txt     - 7z Format description
+DOC/installer.txt    - information about 7-Zip for installers
+DOC/lzma.txt         - LZMA compression description
+DOC/lzma-sdk.txt     - LZMA SDK description (this file)
+DOC/lzma-history.txt - history of LZMA SDK
+DOC/lzma-specification.txt - Specification of LZMA
+DOC/Methods.txt      - Compression method IDs for .7z
+
+bin/installer/       - example script to create installer that uses SFX module,
+
+bin/7zdec.exe        - simplified 7z archive decoder (x86 32-bit version)
+bin/7zr.exe          - 7-Zip console program (reduced version) (x86 32-bit version)
+bin/x64/7zr.exe      - 7-Zip console program (reduced version) (x64 version)
+bin/x64/7zdec.exe    - simplified 7z archive decoder (x64 version)
+bin/arm64/7zr.exe    - 7-Zip console program (reduced version) (arm64 version)
+bin/arm64/7zdec.exe  - simplified 7z archive decoder (arm64 version)
+bin/lzma.exe         - file->file LZMA encoder/decoder for Windows
+bin/7zS2.sfx         - small SFX module for installers (GUI version)
+bin/7zS2con.sfx      - small SFX module for installers (Console version)
+bin/7zSD.sfx         - SFX module for installers.
+
+
+7zDec.exe
+---------
+7zDec.exe is simplified 7z archive decoder.
+It supports only LZMA, LZMA2, and PPMd methods.
+7zDec decodes whole solid block from 7z archive to RAM.
+The RAM consumption can be high.
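+
+For illustration only (the command names below are taken from the usage text that
+7zDec prints when run without arguments and are not part of the original section;
+the exact command set can differ between SDK versions), typical invocations look like:
+
+  7zDec l archive.7z     (list the contents of archive.7z)
+  7zDec t archive.7z     (test the archive)
+  7zDec e archive.7z     (extract files, ignoring directory names)
+  7zDec x archive.7z     (extract files with full paths)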
+ + + + +Source code structure +--------------------- + + +Asm/ - asm files (optimized code for CRC calculation and Intel-AES encryption) + +C/ - C files (compression / decompression and other) + Util/ + 7z - 7z decoder program (decoding 7z files) + Lzma - LZMA program (file->file LZMA encoder/decoder). + LzmaLib - LZMA library (.DLL for Windows) + SfxSetup - small SFX module for installers + +CPP/ -- CPP files + + Common - common files for C++ projects + Windows - common files for Windows related code + + 7zip - files related to 7-Zip + + Archive - files related to archiving + + Common - common files for archive handling + 7z - 7z C++ Encoder/Decoder + + Bundles - Modules that are bundles of other modules (files) + + Alone7z - 7zr.exe: Standalone 7-Zip console program (reduced version) + Format7zExtractR - 7zxr.dll: Reduced version of 7z DLL: extracting from 7z/LZMA/BCJ/BCJ2. + Format7zR - 7zr.dll: Reduced version of 7z DLL: extracting/compressing to 7z/LZMA/BCJ/BCJ2 + LzmaCon - lzma.exe: LZMA compression/decompression + LzmaSpec - example code for LZMA Specification + SFXCon - 7zCon.sfx: Console 7z SFX module + SFXSetup - 7zS.sfx: 7z SFX module for installers + SFXWin - 7z.sfx: GUI 7z SFX module + + Common - common files for 7-Zip + + Compress - files for compression/decompression + + Crypto - files for encryption / decompression + + UI - User Interface files + + Client7z - Test application for 7za.dll, 7zr.dll, 7zxr.dll + Common - Common UI files + Console - Code for console program (7z.exe) + Explorer - Some code from 7-Zip Shell extension + FileManager - Some GUI code from 7-Zip File Manager + GUI - Some GUI code from 7-Zip + + +CS/ - C# files + 7zip + Common - some common files for 7-Zip + Compress - files related to compression/decompression + LZ - files related to LZ (Lempel-Ziv) compression algorithm + LZMA - LZMA compression/decompression + LzmaAlone - file->file LZMA compression/decompression + RangeCoder - Range Coder (special code of compression/decompression) + +Java/ - Java files + SevenZip + Compression - files related to compression/decompression + LZ - files related to LZ (Lempel-Ziv) compression algorithm + LZMA - LZMA compression/decompression + RangeCoder - Range Coder (special code of compression/decompression) + + +Note: + Asm / C / C++ source code of LZMA SDK is part of 7-Zip's source code. + 7-Zip's source code can be downloaded from 7-Zip's SourceForge page: + + http://sourceforge.net/projects/sevenzip/ + + + +LZMA features +------------- + - Variable dictionary size (up to 4 GB) + - Estimated compressing speed: about 2 MB/s on 2 GHz CPU + - Estimated decompressing speed: + - 20-30 MB/s on modern 2 GHz cpu + - 1-2 MB/s on 200 MHz simple RISC cpu: (ARM, MIPS, PowerPC) + - Small memory requirements for decompressing (16 KB + DictionarySize) + - Small code size for decompressing: 5-8 KB + +LZMA decoder uses only integer operations and can be +implemented in any modern 32-bit CPU (or on 16-bit CPU with some conditions). + +Some critical operations that affect the speed of LZMA decompression: + 1) 32*16 bit integer multiply + 2) Mispredicted branches (penalty mostly depends from pipeline length) + 3) 32-bit shift and arithmetic operations + +The speed of LZMA decompressing mostly depends from CPU speed. +Memory speed has no big meaning. But if your CPU has small data cache, +overall weight of memory speed will slightly increase. 
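+
+As a minimal sketch of how the ANSI-C decoder is typically driven from C (this
+example is not part of the original SDK text; it assumes the one-call LzmaDecode()
+interface from C/LzmaDec.h and the default allocator g_Alloc from C/Alloc.h, so
+check those headers for the exact prototypes in your SDK version):
+
+  #include "LzmaDec.h"
+  #include "Alloc.h"   /* declares g_Alloc, the default malloc/free allocator */
+
+  /* Decode a raw LZMA stream that is already in memory.
+     props points to the 5-byte LZMA properties (lc/lp/pb byte + dictionary size),
+     and destLen is the expected uncompressed size. */
+  static int DecodeBuffer(Byte *dest, SizeT destLen,
+                          const Byte *src, SizeT srcLen,
+                          const Byte *props)
+  {
+    ELzmaStatus status;
+    SizeT outLen = destLen;
+    SizeT inLen = srcLen;
+    SRes res = LzmaDecode(dest, &outLen, src, &inLen,
+        props, LZMA_PROPS_SIZE, LZMA_FINISH_END, &status, &g_Alloc);
+    return (res == SZ_OK && outLen == destLen) ? 0 : 1;
+  }
+
+When decoding buffer to buffer like this, the output buffer itself serves as the
+dictionary, so the extra allocation is mainly the probability model arrays
+(roughly the 16 KB part of the memory figure listed above).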
+ + +How To Use +---------- + +Using LZMA encoder/decoder executable +-------------------------------------- + +Usage: LZMA inputFile outputFile [...] + + e: encode file + + d: decode file + + b: Benchmark. There are two tests: compressing and decompressing + with LZMA method. Benchmark shows rating in MIPS (million + instructions per second). Rating value is calculated from + measured speed and it is normalized with Intel's Core 2 results. + Also Benchmark checks possible hardware errors (RAM + errors in most cases). Benchmark uses these settings: + (-a1, -d21, -fb32, -mfbt4). You can change only -d parameter. + Also you can change the number of iterations. Example for 30 iterations: + LZMA b 30 + Default number of iterations is 10. + + + + + -a{N}: set compression mode 0 = fast, 1 = normal + default: 1 (normal) + + d{N}: Sets Dictionary size - [0, 31], default: N=24 (32 MB) + The maximum value for dictionary size is N=31 (2 GB). + Dictionary size is calculated as DictionarySize = 2^N bytes. + For decompressing file compressed by LZMA method with dictionary + size D = 2^N you need about D bytes of memory (RAM). + + -fb{N}: set number of fast bytes - [5, 273], default: 128 + Usually big number gives a little bit better compression ratio + and slower compression process. + + -lc{N}: set number of literal context bits - [0, 8], default: 3 + Sometimes lc=4 gives gain for big files. + + -lp{N}: set number of literal pos bits - [0, 4], default: 0 + lp switch is intended for periodical data when period is + equal 2^N. For example, for 32-bit (4 bytes) + periodical data you can use lp=2. Often it's better to set lc0, + if you change lp switch. + + -pb{N}: set number of pos bits - [0, 4], default: 2 + pb switch is intended for periodical data + when period is equal 2^N. + + -mf{MF_ID}: set Match Finder. Default: bt4. + Algorithms from hc* group doesn't provide good compression + ratio, but they often works pretty fast in combination with + fast mode (-a0). + + Memory requirements depend from dictionary size + (parameter "d" in table below). + + MF_ID Memory Description + + bt2 d * 9.5 + 4MB Binary Tree with 2 bytes hashing. + bt3 d * 11.5 + 4MB Binary Tree with 3 bytes hashing. + bt4 d * 11.5 + 4MB Binary Tree with 4 bytes hashing. + bt5 d * 11.5 + 4MB Binary Tree with 5 bytes hashing. + hc4 d * 7.5 + 4MB Hash Chain with 4 bytes hashing. + hc5 d * 7.5 + 4MB Hash Chain with 5 bytes hashing. + + -eos: write End Of Stream marker. By default LZMA doesn't write + eos marker, since LZMA decoder knows uncompressed size + stored in .lzma file header. + + -si: Read data from stdin (it will write End Of Stream marker). + -so: Write data to stdout + + +Examples: + +1) LZMA e file.bin file.lzma -d16 -lc0 + +compresses file.bin to file.lzma with 64 KB dictionary (2^16=64K) +and 0 literal context bits. -lc0 allows to reduce memory requirements +for decompression. + + +2) LZMA e file.bin file.lzma -lc0 -lp2 + +compresses file.bin to file.lzma with settings suitable +for 32-bit periodical data (for example, ARM or MIPS code). + +3) LZMA d file.lzma file.bin + +decompresses file.lzma to file.bin. + + +Compression ratio hints +----------------------- + +Recommendations +--------------- + +To increase the compression ratio for LZMA compressing it's desirable +to have aligned data (if it's possible) and also it's desirable to locate +data in such order, where code is grouped in one place and data is +grouped in other place (it's better than such mixing: code, data, code, +data, ...). 
+ + +Filters +------- +You can increase the compression ratio for some data types, using +special filters before compressing. For example, it's possible to +increase the compression ratio on 5-10% for code for those CPU ISAs: +x86, IA-64, ARM, ARM-Thumb, PowerPC, SPARC. + +You can find C source code of such filters in C/Bra*.* files + +You can check the compression ratio gain of these filters with such +7-Zip commands (example for ARM code): +No filter: + 7z a a1.7z a.bin -m0=lzma + +With filter for little-endian ARM code: + 7z a a2.7z a.bin -m0=arm -m1=lzma + +It works in such manner: +Compressing = Filter_encoding + LZMA_encoding +Decompressing = LZMA_decoding + Filter_decoding + +Compressing and decompressing speed of such filters is very high, +so it will not increase decompressing time too much. +Moreover, it reduces decompression time for LZMA_decoding, +since compression ratio with filtering is higher. + +These filters convert CALL (calling procedure) instructions +from relative offsets to absolute addresses, so such data becomes more +compressible. + +For some ISAs (for example, for MIPS) it's impossible to get gain from such filter. + + + +--- + +http://www.7-zip.org +http://www.7-zip.org/sdk.html +http://www.7-zip.org/support.html diff --git a/src/sdk/DOC/lzma-specification.txt b/src/sdk/DOC/lzma-specification.txt index ac0cce7..b6796df 100644 --- a/src/sdk/DOC/lzma-specification.txt +++ b/src/sdk/DOC/lzma-specification.txt @@ -1,1176 +1,1176 @@ -LZMA specification (DRAFT version) ----------------------------------- - -Author: Igor Pavlov -Date: 2015-06-14 - -This specification defines the format of LZMA compressed data and lzma file format. - -Notation --------- - -We use the syntax of C++ programming language. -We use the following types in C++ code: - unsigned - unsigned integer, at least 16 bits in size - int - signed integer, at least 16 bits in size - UInt64 - 64-bit unsigned integer - UInt32 - 32-bit unsigned integer - UInt16 - 16-bit unsigned integer - Byte - 8-bit unsigned integer - bool - boolean type with two possible values: false, true - - -lzma file format -================ - -The lzma file contains the raw LZMA stream and the header with related properties. - -The files in that format use ".lzma" extension. - -The lzma file format layout: - -Offset Size Description - - 0 1 LZMA model properties (lc, lp, pb) in encoded form - 1 4 Dictionary size (32-bit unsigned integer, little-endian) - 5 8 Uncompressed size (64-bit unsigned integer, little-endian) - 13 Compressed data (LZMA stream) - -LZMA properties: - - name Range Description - - lc [0, 8] the number of "literal context" bits - lp [0, 4] the number of "literal pos" bits - pb [0, 4] the number of "pos" bits -dictSize [0, 2^32 - 1] the dictionary size - -The following code encodes LZMA properties: - -void EncodeProperties(Byte *properties) -{ - properties[0] = (Byte)((pb * 5 + lp) * 9 + lc); - Set_UInt32_LittleEndian(properties + 1, dictSize); -} - -If the value of dictionary size in properties is smaller than (1 << 12), -the LZMA decoder must set the dictionary size variable to (1 << 12). 
- -#define LZMA_DIC_MIN (1 << 12) - - unsigned lc, pb, lp; - UInt32 dictSize; - UInt32 dictSizeInProperties; - - void DecodeProperties(const Byte *properties) - { - unsigned d = properties[0]; - if (d >= (9 * 5 * 5)) - throw "Incorrect LZMA properties"; - lc = d % 9; - d /= 9; - pb = d / 5; - lp = d % 5; - dictSizeInProperties = 0; - for (int i = 0; i < 4; i++) - dictSizeInProperties |= (UInt32)properties[i + 1] << (8 * i); - dictSize = dictSizeInProperties; - if (dictSize < LZMA_DIC_MIN) - dictSize = LZMA_DIC_MIN; - } - -If "Uncompressed size" field contains ones in all 64 bits, it means that -uncompressed size is unknown and there is the "end marker" in stream, -that indicates the end of decoding point. -In opposite case, if the value from "Uncompressed size" field is not -equal to ((2^64) - 1), the LZMA stream decoding must be finished after -specified number of bytes (Uncompressed size) is decoded. And if there -is the "end marker", the LZMA decoder must read that marker also. - - -The new scheme to encode LZMA properties ----------------------------------------- - -If LZMA compression is used for some another format, it's recommended to -use a new improved scheme to encode LZMA properties. That new scheme was -used in xz format that uses the LZMA2 compression algorithm. -The LZMA2 is a new compression algorithm that is based on the LZMA algorithm. - -The dictionary size in LZMA2 is encoded with just one byte and LZMA2 supports -only reduced set of dictionary sizes: - (2 << 11), (3 << 11), - (2 << 12), (3 << 12), - ... - (2 << 30), (3 << 30), - (2 << 31) - 1 - -The dictionary size can be extracted from encoded value with the following code: - - dictSize = (p == 40) ? 0xFFFFFFFF : (((UInt32)2 | ((p) & 1)) << ((p) / 2 + 11)); - -Also there is additional limitation (lc + lp <= 4) in LZMA2 for values of -"lc" and "lp" properties: - - if (lc + lp > 4) - throw "Unsupported properties: (lc + lp) > 4"; - -There are some advantages for LZMA decoder with such (lc + lp) value -limitation. It reduces the maximum size of tables allocated by decoder. -And it reduces the complexity of initialization procedure, that can be -important to keep high speed of decoding of big number of small LZMA streams. - -It's recommended to use that limitation (lc + lp <= 4) for any new format -that uses LZMA compression. Note that the combinations of "lc" and "lp" -parameters, where (lc + lp > 4), can provide significant improvement in -compression ratio only in some rare cases. - -The LZMA properties can be encoded into two bytes in new scheme: - -Offset Size Description - - 0 1 The dictionary size encoded with LZMA2 scheme - 1 1 LZMA model properties (lc, lp, pb) in encoded form - - -The RAM usage -============= - -The RAM usage for LZMA decoder is determined by the following parts: - -1) The Sliding Window (from 4 KiB to 4 GiB). -2) The probability model counter arrays (arrays of 16-bit variables). -3) Some additional state variables (about 10 variables of 32-bit integers). - - -The RAM usage for Sliding Window --------------------------------- - -There are two main scenarios of decoding: - -1) The decoding of full stream to one RAM buffer. - - If we decode full LZMA stream to one output buffer in RAM, the decoder - can use that output buffer as sliding window. So the decoder doesn't - need additional buffer allocated for sliding window. - -2) The decoding to some external storage. - - If we decode LZMA stream to external storage, the decoder must allocate - the buffer for sliding window. 
The size of that buffer must be equal - or larger than the value of dictionary size from properties of LZMA stream. - -In this specification we describe the code for decoding to some external -storage. The optimized version of code for decoding of full stream to one -output RAM buffer can require some minor changes in code. - - -The RAM usage for the probability model counters ------------------------------------------------- - -The size of the probability model counter arrays is calculated with the -following formula: - -size_of_prob_arrays = 1846 + 768 * (1 << (lp + lc)) - -Each probability model counter is 11-bit unsigned integer. -If we use 16-bit integer variables (2-byte integers) for these probability -model counters, the RAM usage required by probability model counter arrays -can be estimated with the following formula: - - RAM = 4 KiB + 1.5 KiB * (1 << (lp + lc)) - -For example, for default LZMA parameters (lp = 0 and lc = 3), the RAM usage is - - RAM_lc3_lp0 = 4 KiB + 1.5 KiB * 8 = 16 KiB - -The maximum RAM state usage is required for decoding the stream with lp = 4 -and lc = 8: - - RAM_lc8_lp4 = 4 KiB + 1.5 KiB * 4096 = 6148 KiB - -If the decoder uses LZMA2's limited property condition -(lc + lp <= 4), the RAM usage will be not larger than - - RAM_lc_lp_4 = 4 KiB + 1.5 KiB * 16 = 28 KiB - - -The RAM usage for encoder -------------------------- - -There are many variants for LZMA encoding code. -These variants have different values for memory consumption. -Note that memory consumption for LZMA Encoder can not be -smaller than memory consumption of LZMA Decoder for same stream. - -The RAM usage required by modern effective implementation of -LZMA Encoder can be estimated with the following formula: - - Encoder_RAM_Usage = 4 MiB + 11 * dictionarySize. - -But there are some modes of the encoder that require less memory. - - -LZMA Decoding -============= - -The LZMA compression algorithm uses LZ-based compression with Sliding Window -and Range Encoding as entropy coding method. - - -Sliding Window --------------- - -LZMA uses Sliding Window compression similar to LZ77 algorithm. - -LZMA stream must be decoded to the sequence that consists -of MATCHES and LITERALS: - - - a LITERAL is a 8-bit character (one byte). - The decoder just puts that LITERAL to the uncompressed stream. - - - a MATCH is a pair of two numbers (DISTANCE-LENGTH pair). - The decoder takes one byte exactly "DISTANCE" characters behind - current position in the uncompressed stream and puts it to - uncompressed stream. The decoder must repeat it "LENGTH" times. - -The "DISTANCE" can not be larger than dictionary size. -And the "DISTANCE" can not be larger than the number of bytes in -the uncompressed stream that were decoded before that match. - -In this specification we use cyclic buffer to implement Sliding Window -for LZMA decoder: - -class COutWindow -{ - Byte *Buf; - UInt32 Pos; - UInt32 Size; - bool IsFull; - -public: - unsigned TotalPos; - COutStream OutStream; - - COutWindow(): Buf(NULL) {} - ~COutWindow() { delete []Buf; } - - void Create(UInt32 dictSize) - { - Buf = new Byte[dictSize]; - Pos = 0; - Size = dictSize; - IsFull = false; - TotalPos = 0; - } - - void PutByte(Byte b) - { - TotalPos++; - Buf[Pos++] = b; - if (Pos == Size) - { - Pos = 0; - IsFull = true; - } - OutStream.WriteByte(b); - } - - Byte GetByte(UInt32 dist) const - { - return Buf[dist <= Pos ? 
Pos - dist : Size - dist + Pos]; - } - - void CopyMatch(UInt32 dist, unsigned len) - { - for (; len > 0; len--) - PutByte(GetByte(dist)); - } - - bool CheckDistance(UInt32 dist) const - { - return dist <= Pos || IsFull; - } - - bool IsEmpty() const - { - return Pos == 0 && !IsFull; - } -}; - - -In another implementation it's possible to use one buffer that contains -Sliding Window and the whole data stream after uncompressing. - - -Range Decoder -------------- - -LZMA algorithm uses Range Encoding (1) as entropy coding method. - -LZMA stream contains just one very big number in big-endian encoding. -LZMA decoder uses the Range Decoder to extract a sequence of binary -symbols from that big number. - -The state of the Range Decoder: - -struct CRangeDecoder -{ - UInt32 Range; - UInt32 Code; - InputStream *InStream; - - bool Corrupted; -} - -The notes about UInt32 type for the "Range" and "Code" variables: - - It's possible to use 64-bit (unsigned or signed) integer type - for the "Range" and the "Code" variables instead of 32-bit unsigned, - but some additional code must be used to truncate the values to - low 32-bits after some operations. - - If the programming language does not support 32-bit unsigned integer type - (like in case of JAVA language), it's possible to use 32-bit signed integer, - but some code must be changed. For example, it's required to change the code - that uses comparison operations for UInt32 variables in this specification. - -The Range Decoder can be in some states that can be treated as -"Corruption" in LZMA stream. The Range Decoder uses the variable "Corrupted": - - (Corrupted == false), if the Range Decoder has not detected any corruption. - (Corrupted == true), if the Range Decoder has detected some corruption. - -The reference LZMA Decoder ignores the value of the "Corrupted" variable. -So it continues to decode the stream, even if the corruption can be detected -in the Range Decoder. To provide the full compatibility with output of the -reference LZMA Decoder, another LZMA Decoder implementations must also -ignore the value of the "Corrupted" variable. - -The LZMA Encoder is required to create only such LZMA streams, that will not -lead the Range Decoder to states, where the "Corrupted" variable is set to true. - -The Range Decoder reads first 5 bytes from input stream to initialize -the state: - -bool CRangeDecoder::Init() -{ - Corrupted = false; - Range = 0xFFFFFFFF; - Code = 0; - - Byte b = InStream->ReadByte(); - - for (int i = 0; i < 4; i++) - Code = (Code << 8) | InStream->ReadByte(); - - if (b != 0 || Code == Range) - Corrupted = true; - return b == 0; -} - -The LZMA Encoder always writes ZERO in initial byte of compressed stream. -That scheme allows to simplify the code of the Range Encoder in the -LZMA Encoder. If initial byte is not equal to ZERO, the LZMA Decoder must -stop decoding and report error. - -After the last bit of data was decoded by Range Decoder, the value of the -"Code" variable must be equal to 0. The LZMA Decoder must check it by -calling the IsFinishedOK() function: - - bool IsFinishedOK() const { return Code == 0; } - -If there is corruption in data stream, there is big probability that -the "Code" value will be not equal to 0 in the Finish() function. So that -check in the IsFinishedOK() function provides very good feature for -corruption detection. - -The value of the "Range" variable before each bit decoding can not be smaller -than ((UInt32)1 << 24). The Normalize() function keeps the "Range" value in -described range. 
- -#define kTopValue ((UInt32)1 << 24) - -void CRangeDecoder::Normalize() -{ - if (Range < kTopValue) - { - Range <<= 8; - Code = (Code << 8) | InStream->ReadByte(); - } -} - -Notes: if the size of the "Code" variable is larger than 32 bits, it's -required to keep only low 32 bits of the "Code" variable after the change -in Normalize() function. - -If the LZMA Stream is not corrupted, the value of the "Code" variable is -always smaller than value of the "Range" variable. -But the Range Decoder ignores some types of corruptions, so the value of -the "Code" variable can be equal or larger than value of the "Range" variable -for some "Corrupted" archives. - - -LZMA uses Range Encoding only with binary symbols of two types: - 1) binary symbols with fixed and equal probabilities (direct bits) - 2) binary symbols with predicted probabilities - -The DecodeDirectBits() function decodes the sequence of direct bits: - -UInt32 CRangeDecoder::DecodeDirectBits(unsigned numBits) -{ - UInt32 res = 0; - do - { - Range >>= 1; - Code -= Range; - UInt32 t = 0 - ((UInt32)Code >> 31); - Code += Range & t; - - if (Code == Range) - Corrupted = true; - - Normalize(); - res <<= 1; - res += t + 1; - } - while (--numBits); - return res; -} - - -The Bit Decoding with Probability Model ---------------------------------------- - -The task of Bit Probability Model is to estimate probabilities of binary -symbols. And then it provides the Range Decoder with that information. -The better prediction provides better compression ratio. -The Bit Probability Model uses statistical data of previous decoded -symbols. - -That estimated probability is presented as 11-bit unsigned integer value -that represents the probability of symbol "0". - -#define kNumBitModelTotalBits 11 - -Mathematical probabilities can be presented with the following formulas: - probability(symbol_0) = prob / 2048. - probability(symbol_1) = 1 - Probability(symbol_0) = - = 1 - prob / 2048 = - = (2048 - prob) / 2048 -where the "prob" variable contains 11-bit integer probability counter. - -It's recommended to use 16-bit unsigned integer type, to store these 11-bit -probability values: - -typedef UInt16 CProb; - -Each probability value must be initialized with value ((1 << 11) / 2), -that represents the state, where probabilities of symbols 0 and 1 -are equal to 0.5: - -#define PROB_INIT_VAL ((1 << kNumBitModelTotalBits) / 2) - -The INIT_PROBS macro is used to initialize the array of CProb variables: - -#define INIT_PROBS(p) \ - { for (unsigned i = 0; i < sizeof(p) / sizeof(p[0]); i++) p[i] = PROB_INIT_VAL; } - - -The DecodeBit() function decodes one bit. -The LZMA decoder provides the pointer to CProb variable that contains -information about estimated probability for symbol 0 and the Range Decoder -updates that CProb variable after decoding. The Range Decoder increases -estimated probability of the symbol that was decoded: - -#define kNumMoveBits 5 - -unsigned CRangeDecoder::DecodeBit(CProb *prob) -{ - unsigned v = *prob; - UInt32 bound = (Range >> kNumBitModelTotalBits) * v; - unsigned symbol; - if (Code < bound) - { - v += ((1 << kNumBitModelTotalBits) - v) >> kNumMoveBits; - Range = bound; - symbol = 0; - } - else - { - v -= v >> kNumMoveBits; - Code -= bound; - Range -= bound; - symbol = 1; - } - *prob = (CProb)v; - Normalize(); - return symbol; -} - - -The Binary Tree of bit model counters -------------------------------------- - -LZMA uses a tree of Bit model variables to decode symbol that needs -several bits for storing. 
There are two versions of such trees in LZMA: - 1) the tree that decodes bits from high bit to low bit (the normal scheme). - 2) the tree that decodes bits from low bit to high bit (the reverse scheme). - -Each binary tree structure supports different size of decoded symbol -(the size of binary sequence that contains value of symbol). -If that size of decoded symbol is "NumBits" bits, the tree structure -uses the array of (2 << NumBits) counters of CProb type. -But only ((2 << NumBits) - 1) items are used by encoder and decoder. -The first item (the item with index equal to 0) in array is unused. -That scheme with unused array's item allows to simplify the code. - -unsigned BitTreeReverseDecode(CProb *probs, unsigned numBits, CRangeDecoder *rc) -{ - unsigned m = 1; - unsigned symbol = 0; - for (unsigned i = 0; i < numBits; i++) - { - unsigned bit = rc->DecodeBit(&probs[m]); - m <<= 1; - m += bit; - symbol |= (bit << i); - } - return symbol; -} - -template -class CBitTreeDecoder -{ - CProb Probs[(unsigned)1 << NumBits]; - -public: - - void Init() - { - INIT_PROBS(Probs); - } - - unsigned Decode(CRangeDecoder *rc) - { - unsigned m = 1; - for (unsigned i = 0; i < NumBits; i++) - m = (m << 1) + rc->DecodeBit(&Probs[m]); - return m - ((unsigned)1 << NumBits); - } - - unsigned ReverseDecode(CRangeDecoder *rc) - { - return BitTreeReverseDecode(Probs, NumBits, rc); - } -}; - - -LZ part of LZMA ---------------- - -LZ part of LZMA describes details about the decoding of MATCHES and LITERALS. - - -The Literal Decoding --------------------- - -The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where -each table contains 0x300 CProb values: - - CProb *LitProbs; - - void CreateLiterals() - { - LitProbs = new CProb[(UInt32)0x300 << (lc + lp)]; - } - - void InitLiterals() - { - UInt32 num = (UInt32)0x300 << (lc + lp); - for (UInt32 i = 0; i < num; i++) - LitProbs[i] = PROB_INIT_VAL; - } - -To select the table for decoding it uses the context that consists of -(lc) high bits from previous literal and (lp) low bits from value that -represents current position in outputStream. - -If (State > 7), the Literal Decoder also uses "matchByte" that represents -the byte in OutputStream at position the is the DISTANCE bytes before -current position, where the DISTANCE is the distance in DISTANCE-LENGTH pair -of latest decoded match. - -The following code decodes one literal and puts it to Sliding Window buffer: - - void DecodeLiteral(unsigned state, UInt32 rep0) - { - unsigned prevByte = 0; - if (!OutWindow.IsEmpty()) - prevByte = OutWindow.GetByte(1); - - unsigned symbol = 1; - unsigned litState = ((OutWindow.TotalPos & ((1 << lp) - 1)) << lc) + (prevByte >> (8 - lc)); - CProb *probs = &LitProbs[(UInt32)0x300 * litState]; - - if (state >= 7) - { - unsigned matchByte = OutWindow.GetByte(rep0 + 1); - do - { - unsigned matchBit = (matchByte >> 7) & 1; - matchByte <<= 1; - unsigned bit = RangeDec.DecodeBit(&probs[((1 + matchBit) << 8) + symbol]); - symbol = (symbol << 1) | bit; - if (matchBit != bit) - break; - } - while (symbol < 0x100); - } - while (symbol < 0x100) - symbol = (symbol << 1) | RangeDec.DecodeBit(&probs[symbol]); - OutWindow.PutByte((Byte)(symbol - 0x100)); - } - - -The match length decoding -------------------------- - -The match length decoder returns normalized (zero-based value) -length of match. 
That value can be converted to real length of the match -with the following code: - -#define kMatchMinLen 2 - - matchLen = len + kMatchMinLen; - -The match length decoder can return the values from 0 to 271. -And the corresponded real match length values can be in the range -from 2 to 273. - -The following scheme is used for the match length encoding: - - Binary encoding Binary Tree structure Zero-based match length - sequence (binary + decimal): - - 0 xxx LowCoder[posState] xxx - 1 0 yyy MidCoder[posState] yyy + 8 - 1 1 zzzzzzzz HighCoder zzzzzzzz + 16 - -LZMA uses bit model variable "Choice" to decode the first selection bit. - -If the first selection bit is equal to 0, the decoder uses binary tree - LowCoder[posState] to decode 3-bit zero-based match length (xxx). - -If the first selection bit is equal to 1, the decoder uses bit model - variable "Choice2" to decode the second selection bit. - - If the second selection bit is equal to 0, the decoder uses binary tree - MidCoder[posState] to decode 3-bit "yyy" value, and zero-based match - length is equal to (yyy + 8). - - If the second selection bit is equal to 1, the decoder uses binary tree - HighCoder to decode 8-bit "zzzzzzzz" value, and zero-based - match length is equal to (zzzzzzzz + 16). - -LZMA uses "posState" value as context to select the binary tree -from LowCoder and MidCoder binary tree arrays: - - unsigned posState = OutWindow.TotalPos & ((1 << pb) - 1); - -The full code of the length decoder: - -class CLenDecoder -{ - CProb Choice; - CProb Choice2; - CBitTreeDecoder<3> LowCoder[1 << kNumPosBitsMax]; - CBitTreeDecoder<3> MidCoder[1 << kNumPosBitsMax]; - CBitTreeDecoder<8> HighCoder; - -public: - - void Init() - { - Choice = PROB_INIT_VAL; - Choice2 = PROB_INIT_VAL; - HighCoder.Init(); - for (unsigned i = 0; i < (1 << kNumPosBitsMax); i++) - { - LowCoder[i].Init(); - MidCoder[i].Init(); - } - } - - unsigned Decode(CRangeDecoder *rc, unsigned posState) - { - if (rc->DecodeBit(&Choice) == 0) - return LowCoder[posState].Decode(rc); - if (rc->DecodeBit(&Choice2) == 0) - return 8 + MidCoder[posState].Decode(rc); - return 16 + HighCoder.Decode(rc); - } -}; - -The LZMA decoder uses two instances of CLenDecoder class. -The first instance is for the matches of "Simple Match" type, -and the second instance is for the matches of "Rep Match" type: - - CLenDecoder LenDecoder; - CLenDecoder RepLenDecoder; - - -The match distance decoding ---------------------------- - -LZMA supports dictionary sizes up to 4 GiB minus 1. -The value of match distance (decoded by distance decoder) can be -from 1 to 2^32. But the distance value that is equal to 2^32 is used to -indicate the "End of stream" marker. So real largest match distance -that is used for LZ-window match is (2^32 - 1). - -LZMA uses normalized match length (zero-based length) -to calculate the context state "lenState" do decode the distance value: - -#define kNumLenToPosStates 4 - - unsigned lenState = len; - if (lenState > kNumLenToPosStates - 1) - lenState = kNumLenToPosStates - 1; - -The distance decoder returns the "dist" value that is zero-based value -of match distance. 
The real match distance can be calculated with the -following code: - - matchDistance = dist + 1; - -The state of the distance decoder and the initialization code: - - #define kEndPosModelIndex 14 - #define kNumFullDistances (1 << (kEndPosModelIndex >> 1)) - #define kNumAlignBits 4 - - CBitTreeDecoder<6> PosSlotDecoder[kNumLenToPosStates]; - CProb PosDecoders[1 + kNumFullDistances - kEndPosModelIndex]; - CBitTreeDecoder AlignDecoder; - - void InitDist() - { - for (unsigned i = 0; i < kNumLenToPosStates; i++) - PosSlotDecoder[i].Init(); - AlignDecoder.Init(); - INIT_PROBS(PosDecoders); - } - -At first stage the distance decoder decodes 6-bit "posSlot" value with bit -tree decoder from PosSlotDecoder array. It's possible to get 2^6=64 different -"posSlot" values. - - unsigned posSlot = PosSlotDecoder[lenState].Decode(&RangeDec); - -The encoding scheme for distance value is shown in the following table: - -posSlot (decimal) / - zero-based distance (binary) - 0 0 - 1 1 - 2 10 - 3 11 - - 4 10 x - 5 11 x - 6 10 xx - 7 11 xx - 8 10 xxx - 9 11 xxx -10 10 xxxx -11 11 xxxx -12 10 xxxxx -13 11 xxxxx - -14 10 yy zzzz -15 11 yy zzzz -16 10 yyy zzzz -17 11 yyy zzzz -... -62 10 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz -63 11 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz - -where - "x ... x" means the sequence of binary symbols encoded with binary tree and - "Reverse" scheme. It uses separated binary tree for each posSlot from 4 to 13. - "y" means direct bit encoded with range coder. - "zzzz" means the sequence of four binary symbols encoded with binary - tree with "Reverse" scheme, where one common binary tree "AlignDecoder" - is used for all posSlot values. - -If (posSlot < 4), the "dist" value is equal to posSlot value. - -If (posSlot >= 4), the decoder uses "posSlot" value to calculate the value of - the high bits of "dist" value and the number of the low bits. - - If (4 <= posSlot < kEndPosModelIndex), the decoder uses bit tree decoders. - (one separated bit tree decoder per one posSlot value) and "Reverse" scheme. - In this implementation we use one CProb array "PosDecoders" that contains - all CProb variables for all these bit decoders. - - if (posSlot >= kEndPosModelIndex), the middle bits are decoded as direct - bits from RangeDecoder and the low 4 bits are decoded with a bit tree - decoder "AlignDecoder" with "Reverse" scheme. - -The code to decode zero-based match distance: - - unsigned DecodeDistance(unsigned len) - { - unsigned lenState = len; - if (lenState > kNumLenToPosStates - 1) - lenState = kNumLenToPosStates - 1; - - unsigned posSlot = PosSlotDecoder[lenState].Decode(&RangeDec); - if (posSlot < 4) - return posSlot; - - unsigned numDirectBits = (unsigned)((posSlot >> 1) - 1); - UInt32 dist = ((2 | (posSlot & 1)) << numDirectBits); - if (posSlot < kEndPosModelIndex) - dist += BitTreeReverseDecode(PosDecoders + dist - posSlot, numDirectBits, &RangeDec); - else - { - dist += RangeDec.DecodeDirectBits(numDirectBits - kNumAlignBits) << kNumAlignBits; - dist += AlignDecoder.ReverseDecode(&RangeDec); - } - return dist; - } - - - -LZMA Decoding modes -------------------- - -There are 2 types of LZMA streams: - -1) The stream with "End of stream" marker. -2) The stream without "End of stream" marker. - -And the LZMA Decoder supports 3 modes of decoding: - -1) The unpack size is undefined. The LZMA decoder stops decoding after - getting "End of stream" marker. 
- The input variables for that case: - - markerIsMandatory = true - unpackSizeDefined = false - unpackSize contains any value - -2) The unpack size is defined and LZMA decoder supports both variants, - where the stream can contain "End of stream" marker or the stream is - finished without "End of stream" marker. The LZMA decoder must detect - any of these situations. - The input variables for that case: - - markerIsMandatory = false - unpackSizeDefined = true - unpackSize contains unpack size - -3) The unpack size is defined and the LZMA stream must contain - "End of stream" marker - The input variables for that case: - - markerIsMandatory = true - unpackSizeDefined = true - unpackSize contains unpack size - - -The main loop of decoder ------------------------- - -The main loop of LZMA decoder: - -Initialize the LZMA state. -loop -{ - // begin of loop - Check "end of stream" conditions. - Decode Type of MATCH / LITERAL. - If it's LITERAL, decode LITERAL value and put the LITERAL to Window. - If it's MATCH, decode the length of match and the match distance. - Check error conditions, check end of stream conditions and copy - the sequence of match bytes from sliding window to current position - in window. - Go to begin of loop -} - -The reference implementation of LZMA decoder uses "unpackSize" variable -to keep the number of remaining bytes in output stream. So it reduces -"unpackSize" value after each decoded LITERAL or MATCH. - -The following code contains the "end of stream" condition check at the start -of the loop: - - if (unpackSizeDefined && unpackSize == 0 && !markerIsMandatory) - if (RangeDec.IsFinishedOK()) - return LZMA_RES_FINISHED_WITHOUT_MARKER; - -LZMA uses three types of matches: - -1) "Simple Match" - the match with distance value encoded with bit models. - -2) "Rep Match" - the match that uses the distance from distance - history table. - -3) "Short Rep Match" - the match of single byte length, that uses the latest - distance from distance history table. - -The LZMA decoder keeps the history of latest 4 match distances that were used -by decoder. That set of 4 variables contains zero-based match distances and -these variables are initialized with zero values: - - UInt32 rep0 = 0, rep1 = 0, rep2 = 0, rep3 = 0; - -The LZMA decoder uses binary model variables to select type of MATCH or LITERAL: - -#define kNumStates 12 -#define kNumPosBitsMax 4 - - CProb IsMatch[kNumStates << kNumPosBitsMax]; - CProb IsRep[kNumStates]; - CProb IsRepG0[kNumStates]; - CProb IsRepG1[kNumStates]; - CProb IsRepG2[kNumStates]; - CProb IsRep0Long[kNumStates << kNumPosBitsMax]; - -The decoder uses "state" variable value to select exact variable -from "IsRep", "IsRepG0", "IsRepG1" and "IsRepG2" arrays. -The "state" variable can get the value from 0 to 11. -Initial value for "state" variable is zero: - - unsigned state = 0; - -The "state" variable is updated after each LITERAL or MATCH with one of the -following functions: - -unsigned UpdateState_Literal(unsigned state) -{ - if (state < 4) return 0; - else if (state < 10) return state - 3; - else return state - 6; -} -unsigned UpdateState_Match (unsigned state) { return state < 7 ? 7 : 10; } -unsigned UpdateState_Rep (unsigned state) { return state < 7 ? 8 : 11; } -unsigned UpdateState_ShortRep(unsigned state) { return state < 7 ? 
9 : 11; } - -The decoder calculates "state2" variable value to select exact variable from -"IsMatch" and "IsRep0Long" arrays: - -unsigned posState = OutWindow.TotalPos & ((1 << pb) - 1); -unsigned state2 = (state << kNumPosBitsMax) + posState; - -The decoder uses the following code flow scheme to select exact -type of LITERAL or MATCH: - -IsMatch[state2] decode - 0 - the Literal - 1 - the Match - IsRep[state] decode - 0 - Simple Match - 1 - Rep Match - IsRepG0[state] decode - 0 - the distance is rep0 - IsRep0Long[state2] decode - 0 - Short Rep Match - 1 - Rep Match 0 - 1 - - IsRepG1[state] decode - 0 - Rep Match 1 - 1 - - IsRepG2[state] decode - 0 - Rep Match 2 - 1 - Rep Match 3 - - -LITERAL symbol --------------- -If the value "0" was decoded with IsMatch[state2] decoding, we have "LITERAL" type. - -At first the LZMA decoder must check that it doesn't exceed -specified uncompressed size: - - if (unpackSizeDefined && unpackSize == 0) - return LZMA_RES_ERROR; - -Then it decodes literal value and puts it to sliding window: - - DecodeLiteral(state, rep0); - -Then the decoder must update the "state" value and "unpackSize" value; - - state = UpdateState_Literal(state); - unpackSize--; - -Then the decoder must go to the begin of main loop to decode next Match or Literal. - - -Simple Match ------------- - -If the value "1" was decoded with IsMatch[state2] decoding, -we have the "Simple Match" type. - -The distance history table is updated with the following scheme: - - rep3 = rep2; - rep2 = rep1; - rep1 = rep0; - -The zero-based length is decoded with "LenDecoder": - - len = LenDecoder.Decode(&RangeDec, posState); - -The state is update with UpdateState_Match function: - - state = UpdateState_Match(state); - -and the new "rep0" value is decoded with DecodeDistance: - - rep0 = DecodeDistance(len); - -That "rep0" will be used as zero-based distance for current match. - -If the value of "rep0" is equal to 0xFFFFFFFF, it means that we have -"End of stream" marker, so we can stop decoding and check finishing -condition in Range Decoder: - - if (rep0 == 0xFFFFFFFF) - return RangeDec.IsFinishedOK() ? - LZMA_RES_FINISHED_WITH_MARKER : - LZMA_RES_ERROR; - -If uncompressed size is defined, LZMA decoder must check that it doesn't -exceed that specified uncompressed size: - - if (unpackSizeDefined && unpackSize == 0) - return LZMA_RES_ERROR; - -Also the decoder must check that "rep0" value is not larger than dictionary size -and is not larger than the number of already decoded bytes: - - if (rep0 >= dictSize || !OutWindow.CheckDistance(rep0)) - return LZMA_RES_ERROR; - -Then the decoder must copy match bytes as described in -"The match symbols copying" section. - - -Rep Match ---------- - -If the LZMA decoder has decoded the value "1" with IsRep[state] variable, -we have "Rep Match" type. - -At first the LZMA decoder must check that it doesn't exceed -specified uncompressed size: - - if (unpackSizeDefined && unpackSize == 0) - return LZMA_RES_ERROR; - -Also the decoder must return error, if the LZ window is empty: - - if (OutWindow.IsEmpty()) - return LZMA_RES_ERROR; - -If the match type is "Rep Match", the decoder uses one of the 4 variables of -distance history table to get the value of distance for current match. -And there are 4 corresponding ways of decoding flow. 
- -The decoder updates the distance history with the following scheme -depending from type of match: - -- "Rep Match 0" or "Short Rep Match": - ; LZMA doesn't update the distance history - -- "Rep Match 1": - UInt32 dist = rep1; - rep1 = rep0; - rep0 = dist; - -- "Rep Match 2": - UInt32 dist = rep2; - rep2 = rep1; - rep1 = rep0; - rep0 = dist; - -- "Rep Match 3": - UInt32 dist = rep3; - rep3 = rep2; - rep2 = rep1; - rep1 = rep0; - rep0 = dist; - -Then the decoder decodes exact subtype of "Rep Match" using "IsRepG0", "IsRep0Long", -"IsRepG1", "IsRepG2". - -If the subtype is "Short Rep Match", the decoder updates the state, puts -the one byte from window to current position in window and goes to next -MATCH/LITERAL symbol (the begin of main loop): - - state = UpdateState_ShortRep(state); - OutWindow.PutByte(OutWindow.GetByte(rep0 + 1)); - unpackSize--; - continue; - -In other cases (Rep Match 0/1/2/3), it decodes the zero-based -length of match with "RepLenDecoder" decoder: - - len = RepLenDecoder.Decode(&RangeDec, posState); - -Then it updates the state: - - state = UpdateState_Rep(state); - -Then the decoder must copy match bytes as described in -"The Match symbols copying" section. - - -The match symbols copying -------------------------- - -If we have the match (Simple Match or Rep Match 0/1/2/3), the decoder must -copy the sequence of bytes with calculated match distance and match length. -If uncompressed size is defined, LZMA decoder must check that it doesn't -exceed that specified uncompressed size: - - len += kMatchMinLen; - bool isError = false; - if (unpackSizeDefined && unpackSize < len) - { - len = (unsigned)unpackSize; - isError = true; - } - OutWindow.CopyMatch(rep0 + 1, len); - unpackSize -= len; - if (isError) - return LZMA_RES_ERROR; - -Then the decoder must go to the begin of main loop to decode next MATCH or LITERAL. - - - -NOTES ------ - -This specification doesn't describe the variant of decoder implementation -that supports partial decoding. Such partial decoding case can require some -changes in "end of stream" condition checks code. Also such code -can use additional status codes, returned by decoder. - -This specification uses C++ code with templates to simplify describing. -The optimized version of LZMA decoder doesn't need templates. -Such optimized version can use just two arrays of CProb variables: - 1) The dynamic array of CProb variables allocated for the Literal Decoder. - 2) The one common array that contains all other CProb variables. - - -References: - -1. G. N. N. Martin, Range encoding: an algorithm for removing redundancy - from a digitized message, Video & Data Recording Conference, - Southampton, UK, July 24-27, 1979. +LZMA specification (DRAFT version) +---------------------------------- + +Author: Igor Pavlov +Date: 2015-06-14 + +This specification defines the format of LZMA compressed data and lzma file format. + +Notation +-------- + +We use the syntax of C++ programming language. +We use the following types in C++ code: + unsigned - unsigned integer, at least 16 bits in size + int - signed integer, at least 16 bits in size + UInt64 - 64-bit unsigned integer + UInt32 - 32-bit unsigned integer + UInt16 - 16-bit unsigned integer + Byte - 8-bit unsigned integer + bool - boolean type with two possible values: false, true + + +lzma file format +================ + +The lzma file contains the raw LZMA stream and the header with related properties. + +The files in that format use ".lzma" extension. 
+ +The lzma file format layout: + +Offset Size Description + + 0 1 LZMA model properties (lc, lp, pb) in encoded form + 1 4 Dictionary size (32-bit unsigned integer, little-endian) + 5 8 Uncompressed size (64-bit unsigned integer, little-endian) + 13 Compressed data (LZMA stream) + +LZMA properties: + + name Range Description + + lc [0, 8] the number of "literal context" bits + lp [0, 4] the number of "literal pos" bits + pb [0, 4] the number of "pos" bits +dictSize [0, 2^32 - 1] the dictionary size + +The following code encodes LZMA properties: + +void EncodeProperties(Byte *properties) +{ + properties[0] = (Byte)((pb * 5 + lp) * 9 + lc); + Set_UInt32_LittleEndian(properties + 1, dictSize); +} + +If the value of dictionary size in properties is smaller than (1 << 12), +the LZMA decoder must set the dictionary size variable to (1 << 12). + +#define LZMA_DIC_MIN (1 << 12) + + unsigned lc, pb, lp; + UInt32 dictSize; + UInt32 dictSizeInProperties; + + void DecodeProperties(const Byte *properties) + { + unsigned d = properties[0]; + if (d >= (9 * 5 * 5)) + throw "Incorrect LZMA properties"; + lc = d % 9; + d /= 9; + pb = d / 5; + lp = d % 5; + dictSizeInProperties = 0; + for (int i = 0; i < 4; i++) + dictSizeInProperties |= (UInt32)properties[i + 1] << (8 * i); + dictSize = dictSizeInProperties; + if (dictSize < LZMA_DIC_MIN) + dictSize = LZMA_DIC_MIN; + } + +If "Uncompressed size" field contains ones in all 64 bits, it means that +uncompressed size is unknown and there is the "end marker" in stream, +that indicates the end of decoding point. +In opposite case, if the value from "Uncompressed size" field is not +equal to ((2^64) - 1), the LZMA stream decoding must be finished after +specified number of bytes (Uncompressed size) is decoded. And if there +is the "end marker", the LZMA decoder must read that marker also. + + +The new scheme to encode LZMA properties +---------------------------------------- + +If LZMA compression is used for some another format, it's recommended to +use a new improved scheme to encode LZMA properties. That new scheme was +used in xz format that uses the LZMA2 compression algorithm. +The LZMA2 is a new compression algorithm that is based on the LZMA algorithm. + +The dictionary size in LZMA2 is encoded with just one byte and LZMA2 supports +only reduced set of dictionary sizes: + (2 << 11), (3 << 11), + (2 << 12), (3 << 12), + ... + (2 << 30), (3 << 30), + (2 << 31) - 1 + +The dictionary size can be extracted from encoded value with the following code: + + dictSize = (p == 40) ? 0xFFFFFFFF : (((UInt32)2 | ((p) & 1)) << ((p) / 2 + 11)); + +Also there is additional limitation (lc + lp <= 4) in LZMA2 for values of +"lc" and "lp" properties: + + if (lc + lp > 4) + throw "Unsupported properties: (lc + lp) > 4"; + +There are some advantages for LZMA decoder with such (lc + lp) value +limitation. It reduces the maximum size of tables allocated by decoder. +And it reduces the complexity of initialization procedure, that can be +important to keep high speed of decoding of big number of small LZMA streams. + +It's recommended to use that limitation (lc + lp <= 4) for any new format +that uses LZMA compression. Note that the combinations of "lc" and "lp" +parameters, where (lc + lp > 4), can provide significant improvement in +compression ratio only in some rare cases. 
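+
+As an illustration of the rules above, the dictionary size byte "p" can be
+checked and decoded with the following sketch (the function name here is
+chosen only for this example; the upper bound check for "p" is an additional
+assumption, since LZMA2 does not use values above 40):
+
+  UInt32 DecodeLzma2DictSize(unsigned p)
+  {
+    if (p > 40)
+      throw "Incorrect LZMA2 dictionary size byte";
+    return (p == 40) ? 0xFFFFFFFF : (((UInt32)2 | (p & 1)) << (p / 2 + 11));
+  }
+
+For example, p = 0 gives (2 << 11) = 4 KiB, p = 1 gives (3 << 11) = 6 KiB,
+and p = 24 gives (2 << 23) = 16 MiB.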
+ +The LZMA properties can be encoded into two bytes in new scheme: + +Offset Size Description + + 0 1 The dictionary size encoded with LZMA2 scheme + 1 1 LZMA model properties (lc, lp, pb) in encoded form + + +The RAM usage +============= + +The RAM usage for LZMA decoder is determined by the following parts: + +1) The Sliding Window (from 4 KiB to 4 GiB). +2) The probability model counter arrays (arrays of 16-bit variables). +3) Some additional state variables (about 10 variables of 32-bit integers). + + +The RAM usage for Sliding Window +-------------------------------- + +There are two main scenarios of decoding: + +1) The decoding of full stream to one RAM buffer. + + If we decode full LZMA stream to one output buffer in RAM, the decoder + can use that output buffer as sliding window. So the decoder doesn't + need additional buffer allocated for sliding window. + +2) The decoding to some external storage. + + If we decode LZMA stream to external storage, the decoder must allocate + the buffer for sliding window. The size of that buffer must be equal + or larger than the value of dictionary size from properties of LZMA stream. + +In this specification we describe the code for decoding to some external +storage. The optimized version of code for decoding of full stream to one +output RAM buffer can require some minor changes in code. + + +The RAM usage for the probability model counters +------------------------------------------------ + +The size of the probability model counter arrays is calculated with the +following formula: + +size_of_prob_arrays = 1846 + 768 * (1 << (lp + lc)) + +Each probability model counter is 11-bit unsigned integer. +If we use 16-bit integer variables (2-byte integers) for these probability +model counters, the RAM usage required by probability model counter arrays +can be estimated with the following formula: + + RAM = 4 KiB + 1.5 KiB * (1 << (lp + lc)) + +For example, for default LZMA parameters (lp = 0 and lc = 3), the RAM usage is + + RAM_lc3_lp0 = 4 KiB + 1.5 KiB * 8 = 16 KiB + +The maximum RAM state usage is required for decoding the stream with lp = 4 +and lc = 8: + + RAM_lc8_lp4 = 4 KiB + 1.5 KiB * 4096 = 6148 KiB + +If the decoder uses LZMA2's limited property condition +(lc + lp <= 4), the RAM usage will be not larger than + + RAM_lc_lp_4 = 4 KiB + 1.5 KiB * 16 = 28 KiB + + +The RAM usage for encoder +------------------------- + +There are many variants for LZMA encoding code. +These variants have different values for memory consumption. +Note that memory consumption for LZMA Encoder can not be +smaller than memory consumption of LZMA Decoder for same stream. + +The RAM usage required by modern effective implementation of +LZMA Encoder can be estimated with the following formula: + + Encoder_RAM_Usage = 4 MiB + 11 * dictionarySize. + +But there are some modes of the encoder that require less memory. + + +LZMA Decoding +============= + +The LZMA compression algorithm uses LZ-based compression with Sliding Window +and Range Encoding as entropy coding method. + + +Sliding Window +-------------- + +LZMA uses Sliding Window compression similar to LZ77 algorithm. + +LZMA stream must be decoded to the sequence that consists +of MATCHES and LITERALS: + + - a LITERAL is a 8-bit character (one byte). + The decoder just puts that LITERAL to the uncompressed stream. + + - a MATCH is a pair of two numbers (DISTANCE-LENGTH pair). 
+ The decoder takes one byte exactly "DISTANCE" characters behind + current position in the uncompressed stream and puts it to + uncompressed stream. The decoder must repeat it "LENGTH" times. + +The "DISTANCE" can not be larger than dictionary size. +And the "DISTANCE" can not be larger than the number of bytes in +the uncompressed stream that were decoded before that match. + +In this specification we use cyclic buffer to implement Sliding Window +for LZMA decoder: + +class COutWindow +{ + Byte *Buf; + UInt32 Pos; + UInt32 Size; + bool IsFull; + +public: + unsigned TotalPos; + COutStream OutStream; + + COutWindow(): Buf(NULL) {} + ~COutWindow() { delete []Buf; } + + void Create(UInt32 dictSize) + { + Buf = new Byte[dictSize]; + Pos = 0; + Size = dictSize; + IsFull = false; + TotalPos = 0; + } + + void PutByte(Byte b) + { + TotalPos++; + Buf[Pos++] = b; + if (Pos == Size) + { + Pos = 0; + IsFull = true; + } + OutStream.WriteByte(b); + } + + Byte GetByte(UInt32 dist) const + { + return Buf[dist <= Pos ? Pos - dist : Size - dist + Pos]; + } + + void CopyMatch(UInt32 dist, unsigned len) + { + for (; len > 0; len--) + PutByte(GetByte(dist)); + } + + bool CheckDistance(UInt32 dist) const + { + return dist <= Pos || IsFull; + } + + bool IsEmpty() const + { + return Pos == 0 && !IsFull; + } +}; + + +In another implementation it's possible to use one buffer that contains +Sliding Window and the whole data stream after uncompressing. + + +Range Decoder +------------- + +LZMA algorithm uses Range Encoding (1) as entropy coding method. + +LZMA stream contains just one very big number in big-endian encoding. +LZMA decoder uses the Range Decoder to extract a sequence of binary +symbols from that big number. + +The state of the Range Decoder: + +struct CRangeDecoder +{ + UInt32 Range; + UInt32 Code; + InputStream *InStream; + + bool Corrupted; +} + +The notes about UInt32 type for the "Range" and "Code" variables: + + It's possible to use 64-bit (unsigned or signed) integer type + for the "Range" and the "Code" variables instead of 32-bit unsigned, + but some additional code must be used to truncate the values to + low 32-bits after some operations. + + If the programming language does not support 32-bit unsigned integer type + (like in case of JAVA language), it's possible to use 32-bit signed integer, + but some code must be changed. For example, it's required to change the code + that uses comparison operations for UInt32 variables in this specification. + +The Range Decoder can be in some states that can be treated as +"Corruption" in LZMA stream. The Range Decoder uses the variable "Corrupted": + + (Corrupted == false), if the Range Decoder has not detected any corruption. + (Corrupted == true), if the Range Decoder has detected some corruption. + +The reference LZMA Decoder ignores the value of the "Corrupted" variable. +So it continues to decode the stream, even if the corruption can be detected +in the Range Decoder. To provide the full compatibility with output of the +reference LZMA Decoder, another LZMA Decoder implementations must also +ignore the value of the "Corrupted" variable. + +The LZMA Encoder is required to create only such LZMA streams, that will not +lead the Range Decoder to states, where the "Corrupted" variable is set to true. 
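+
+The "InputStream" and "COutStream" types used in this specification are not
+defined here; they only need to provide byte-oriented access to the compressed
+input and to the uncompressed output (for example, a file or a memory buffer).
+A minimal sketch of such interfaces (an assumption of this description,
+not part of the reference code) can look like this:
+
+  struct InputStream
+  {
+    // returns the next byte of the compressed stream
+    Byte ReadByte();
+  };
+
+  struct COutStream
+  {
+    // writes one byte of uncompressed data to the output
+    void WriteByte(Byte b);
+  };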
+ +The Range Decoder reads first 5 bytes from input stream to initialize +the state: + +bool CRangeDecoder::Init() +{ + Corrupted = false; + Range = 0xFFFFFFFF; + Code = 0; + + Byte b = InStream->ReadByte(); + + for (int i = 0; i < 4; i++) + Code = (Code << 8) | InStream->ReadByte(); + + if (b != 0 || Code == Range) + Corrupted = true; + return b == 0; +} + +The LZMA Encoder always writes ZERO in initial byte of compressed stream. +That scheme allows to simplify the code of the Range Encoder in the +LZMA Encoder. If initial byte is not equal to ZERO, the LZMA Decoder must +stop decoding and report error. + +After the last bit of data was decoded by Range Decoder, the value of the +"Code" variable must be equal to 0. The LZMA Decoder must check it by +calling the IsFinishedOK() function: + + bool IsFinishedOK() const { return Code == 0; } + +If there is corruption in data stream, there is big probability that +the "Code" value will be not equal to 0 in the Finish() function. So that +check in the IsFinishedOK() function provides very good feature for +corruption detection. + +The value of the "Range" variable before each bit decoding can not be smaller +than ((UInt32)1 << 24). The Normalize() function keeps the "Range" value in +described range. + +#define kTopValue ((UInt32)1 << 24) + +void CRangeDecoder::Normalize() +{ + if (Range < kTopValue) + { + Range <<= 8; + Code = (Code << 8) | InStream->ReadByte(); + } +} + +Notes: if the size of the "Code" variable is larger than 32 bits, it's +required to keep only low 32 bits of the "Code" variable after the change +in Normalize() function. + +If the LZMA Stream is not corrupted, the value of the "Code" variable is +always smaller than value of the "Range" variable. +But the Range Decoder ignores some types of corruptions, so the value of +the "Code" variable can be equal or larger than value of the "Range" variable +for some "Corrupted" archives. + + +LZMA uses Range Encoding only with binary symbols of two types: + 1) binary symbols with fixed and equal probabilities (direct bits) + 2) binary symbols with predicted probabilities + +The DecodeDirectBits() function decodes the sequence of direct bits: + +UInt32 CRangeDecoder::DecodeDirectBits(unsigned numBits) +{ + UInt32 res = 0; + do + { + Range >>= 1; + Code -= Range; + UInt32 t = 0 - ((UInt32)Code >> 31); + Code += Range & t; + + if (Code == Range) + Corrupted = true; + + Normalize(); + res <<= 1; + res += t + 1; + } + while (--numBits); + return res; +} + + +The Bit Decoding with Probability Model +--------------------------------------- + +The task of Bit Probability Model is to estimate probabilities of binary +symbols. And then it provides the Range Decoder with that information. +The better prediction provides better compression ratio. +The Bit Probability Model uses statistical data of previous decoded +symbols. + +That estimated probability is presented as 11-bit unsigned integer value +that represents the probability of symbol "0". + +#define kNumBitModelTotalBits 11 + +Mathematical probabilities can be presented with the following formulas: + probability(symbol_0) = prob / 2048. + probability(symbol_1) = 1 - Probability(symbol_0) = + = 1 - prob / 2048 = + = (2048 - prob) / 2048 +where the "prob" variable contains 11-bit integer probability counter. 
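+
+For example (a numeric illustration only): if prob == 1024, both symbols are
+predicted with probability 1024 / 2048 = 0.5; if prob == 1536, the model
+predicts probability(symbol_0) = 1536 / 2048 = 0.75 and
+probability(symbol_1) = 512 / 2048 = 0.25.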
+
+It's recommended to use 16-bit unsigned integer type, to store these 11-bit
+probability values:
+
+typedef UInt16 CProb;
+
+Each probability value must be initialized with value ((1 << 11) / 2),
+that represents the state, where probabilities of symbols 0 and 1
+are equal to 0.5:
+
+#define PROB_INIT_VAL ((1 << kNumBitModelTotalBits) / 2)
+
+The INIT_PROBS macro is used to initialize the array of CProb variables:
+
+#define INIT_PROBS(p) \
+ { for (unsigned i = 0; i < sizeof(p) / sizeof(p[0]); i++) p[i] = PROB_INIT_VAL; }
+
+
+The DecodeBit() function decodes one bit.
+The LZMA decoder provides the pointer to CProb variable that contains
+information about estimated probability for symbol 0 and the Range Decoder
+updates that CProb variable after decoding. The Range Decoder increases
+estimated probability of the symbol that was decoded:
+
+#define kNumMoveBits 5
+
+unsigned CRangeDecoder::DecodeBit(CProb *prob)
+{
+  unsigned v = *prob;
+  UInt32 bound = (Range >> kNumBitModelTotalBits) * v;
+  unsigned symbol;
+  if (Code < bound)
+  {
+    v += ((1 << kNumBitModelTotalBits) - v) >> kNumMoveBits;
+    Range = bound;
+    symbol = 0;
+  }
+  else
+  {
+    v -= v >> kNumMoveBits;
+    Code -= bound;
+    Range -= bound;
+    symbol = 1;
+  }
+  *prob = (CProb)v;
+  Normalize();
+  return symbol;
+}
+
+
+The Binary Tree of bit model counters
+-------------------------------------
+
+LZMA uses a tree of Bit model variables to decode symbol that needs
+several bits for storing. There are two versions of such trees in LZMA:
+  1) the tree that decodes bits from high bit to low bit (the normal scheme).
+  2) the tree that decodes bits from low bit to high bit (the reverse scheme).
+
+Each binary tree structure supports different size of decoded symbol
+(the size of binary sequence that contains value of symbol).
+If that size of decoded symbol is "NumBits" bits, the tree structure
+uses the array of (2 << NumBits) counters of CProb type.
+But only ((2 << NumBits) - 1) items are used by encoder and decoder.
+The first item (the item with index equal to 0) in array is unused.
+That scheme with unused array's item allows to simplify the code.
+
+unsigned BitTreeReverseDecode(CProb *probs, unsigned numBits, CRangeDecoder *rc)
+{
+  unsigned m = 1;
+  unsigned symbol = 0;
+  for (unsigned i = 0; i < numBits; i++)
+  {
+    unsigned bit = rc->DecodeBit(&probs[m]);
+    m <<= 1;
+    m += bit;
+    symbol |= (bit << i);
+  }
+  return symbol;
+}
+
+template <unsigned NumBits>
+class CBitTreeDecoder
+{
+  CProb Probs[(unsigned)1 << NumBits];
+
+public:
+
+  void Init()
+  {
+    INIT_PROBS(Probs);
+  }
+
+  unsigned Decode(CRangeDecoder *rc)
+  {
+    unsigned m = 1;
+    for (unsigned i = 0; i < NumBits; i++)
+      m = (m << 1) + rc->DecodeBit(&Probs[m]);
+    return m - ((unsigned)1 << NumBits);
+  }
+
+  unsigned ReverseDecode(CRangeDecoder *rc)
+  {
+    return BitTreeReverseDecode(Probs, NumBits, rc);
+  }
+};
+
+
+LZ part of LZMA
+---------------
+
+LZ part of LZMA describes details about the decoding of MATCHES and LITERALS.
+ + +The Literal Decoding +-------------------- + +The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where +each table contains 0x300 CProb values: + + CProb *LitProbs; + + void CreateLiterals() + { + LitProbs = new CProb[(UInt32)0x300 << (lc + lp)]; + } + + void InitLiterals() + { + UInt32 num = (UInt32)0x300 << (lc + lp); + for (UInt32 i = 0; i < num; i++) + LitProbs[i] = PROB_INIT_VAL; + } + +To select the table for decoding it uses the context that consists of +(lc) high bits from previous literal and (lp) low bits from value that +represents current position in outputStream. + +If (State > 7), the Literal Decoder also uses "matchByte" that represents +the byte in OutputStream at position the is the DISTANCE bytes before +current position, where the DISTANCE is the distance in DISTANCE-LENGTH pair +of latest decoded match. + +The following code decodes one literal and puts it to Sliding Window buffer: + + void DecodeLiteral(unsigned state, UInt32 rep0) + { + unsigned prevByte = 0; + if (!OutWindow.IsEmpty()) + prevByte = OutWindow.GetByte(1); + + unsigned symbol = 1; + unsigned litState = ((OutWindow.TotalPos & ((1 << lp) - 1)) << lc) + (prevByte >> (8 - lc)); + CProb *probs = &LitProbs[(UInt32)0x300 * litState]; + + if (state >= 7) + { + unsigned matchByte = OutWindow.GetByte(rep0 + 1); + do + { + unsigned matchBit = (matchByte >> 7) & 1; + matchByte <<= 1; + unsigned bit = RangeDec.DecodeBit(&probs[((1 + matchBit) << 8) + symbol]); + symbol = (symbol << 1) | bit; + if (matchBit != bit) + break; + } + while (symbol < 0x100); + } + while (symbol < 0x100) + symbol = (symbol << 1) | RangeDec.DecodeBit(&probs[symbol]); + OutWindow.PutByte((Byte)(symbol - 0x100)); + } + + +The match length decoding +------------------------- + +The match length decoder returns normalized (zero-based value) +length of match. That value can be converted to real length of the match +with the following code: + +#define kMatchMinLen 2 + + matchLen = len + kMatchMinLen; + +The match length decoder can return the values from 0 to 271. +And the corresponded real match length values can be in the range +from 2 to 273. + +The following scheme is used for the match length encoding: + + Binary encoding Binary Tree structure Zero-based match length + sequence (binary + decimal): + + 0 xxx LowCoder[posState] xxx + 1 0 yyy MidCoder[posState] yyy + 8 + 1 1 zzzzzzzz HighCoder zzzzzzzz + 16 + +LZMA uses bit model variable "Choice" to decode the first selection bit. + +If the first selection bit is equal to 0, the decoder uses binary tree + LowCoder[posState] to decode 3-bit zero-based match length (xxx). + +If the first selection bit is equal to 1, the decoder uses bit model + variable "Choice2" to decode the second selection bit. + + If the second selection bit is equal to 0, the decoder uses binary tree + MidCoder[posState] to decode 3-bit "yyy" value, and zero-based match + length is equal to (yyy + 8). + + If the second selection bit is equal to 1, the decoder uses binary tree + HighCoder to decode 8-bit "zzzzzzzz" value, and zero-based + match length is equal to (zzzzzzzz + 16). 
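+
+For example (an illustration only): the selection bits "1 0" followed by the
+3-bit value yyy = 101 (5 decimal) select MidCoder[posState], so the zero-based
+match length is (5 + 8) = 13 and the real match length is (13 + kMatchMinLen) = 15.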
+
+LZMA uses "posState" value as context to select the binary tree
+from LowCoder and MidCoder binary tree arrays:
+
+  unsigned posState = OutWindow.TotalPos & ((1 << pb) - 1);
+
+The full code of the length decoder:
+
+class CLenDecoder
+{
+  CProb Choice;
+  CProb Choice2;
+  CBitTreeDecoder<3> LowCoder[1 << kNumPosBitsMax];
+  CBitTreeDecoder<3> MidCoder[1 << kNumPosBitsMax];
+  CBitTreeDecoder<8> HighCoder;
+
+public:
+
+  void Init()
+  {
+    Choice = PROB_INIT_VAL;
+    Choice2 = PROB_INIT_VAL;
+    HighCoder.Init();
+    for (unsigned i = 0; i < (1 << kNumPosBitsMax); i++)
+    {
+      LowCoder[i].Init();
+      MidCoder[i].Init();
+    }
+  }
+
+  unsigned Decode(CRangeDecoder *rc, unsigned posState)
+  {
+    if (rc->DecodeBit(&Choice) == 0)
+      return LowCoder[posState].Decode(rc);
+    if (rc->DecodeBit(&Choice2) == 0)
+      return 8 + MidCoder[posState].Decode(rc);
+    return 16 + HighCoder.Decode(rc);
+  }
+};
+
+The LZMA decoder uses two instances of CLenDecoder class.
+The first instance is for the matches of "Simple Match" type,
+and the second instance is for the matches of "Rep Match" type:
+
+  CLenDecoder LenDecoder;
+  CLenDecoder RepLenDecoder;
+
+
+The match distance decoding
+---------------------------
+
+LZMA supports dictionary sizes up to 4 GiB minus 1.
+The value of match distance (decoded by distance decoder) can be
+from 1 to 2^32. But the distance value that is equal to 2^32 is used to
+indicate the "End of stream" marker. So real largest match distance
+that is used for LZ-window match is (2^32 - 1).
+
+LZMA uses normalized match length (zero-based length)
+to calculate the context state "lenState" to decode the distance value:
+
+#define kNumLenToPosStates 4
+
+  unsigned lenState = len;
+  if (lenState > kNumLenToPosStates - 1)
+    lenState = kNumLenToPosStates - 1;
+
+The distance decoder returns the "dist" value that is zero-based value
+of match distance. The real match distance can be calculated with the
+following code:
+
+  matchDistance = dist + 1;
+
+The state of the distance decoder and the initialization code:
+
+  #define kEndPosModelIndex 14
+  #define kNumFullDistances (1 << (kEndPosModelIndex >> 1))
+  #define kNumAlignBits 4
+
+  CBitTreeDecoder<6> PosSlotDecoder[kNumLenToPosStates];
+  CProb PosDecoders[1 + kNumFullDistances - kEndPosModelIndex];
+  CBitTreeDecoder<kNumAlignBits> AlignDecoder;
+
+  void InitDist()
+  {
+    for (unsigned i = 0; i < kNumLenToPosStates; i++)
+      PosSlotDecoder[i].Init();
+    AlignDecoder.Init();
+    INIT_PROBS(PosDecoders);
+  }
+
+At first stage the distance decoder decodes 6-bit "posSlot" value with bit
+tree decoder from PosSlotDecoder array. It's possible to get 2^6=64 different
+"posSlot" values.
+
+  unsigned posSlot = PosSlotDecoder[lenState].Decode(&RangeDec);
+
+The encoding scheme for distance value is shown in the following table:
+
+posSlot (decimal) /
+      zero-based distance (binary)
+ 0    0
+ 1    1
+ 2    10
+ 3    11
+
+ 4    10 x
+ 5    11 x
+ 6    10 xx
+ 7    11 xx
+ 8    10 xxx
+ 9    11 xxx
+10    10 xxxx
+11    11 xxxx
+12    10 xxxxx
+13    11 xxxxx
+
+14    10 yy zzzz
+15    11 yy zzzz
+16    10 yyy zzzz
+17    11 yyy zzzz
+...
+62    10 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz
+63    11 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz
+
+where
+  "x ... x" means the sequence of binary symbols encoded with binary tree and
+     "Reverse" scheme. It uses separated binary tree for each posSlot from 4 to 13.
+  "y" means direct bit encoded with range coder.
+  "zzzz" means the sequence of four binary symbols encoded with binary
+     tree with "Reverse" scheme, where one common binary tree "AlignDecoder"
+     is used for all posSlot values.
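+
+For example (an illustration of the table above): posSlot = 5 corresponds to
+the high bits "11" followed by one "x" bit, so the zero-based distance is in
+the range from 6 to 7 (real distance 7 or 8). posSlot = 14 corresponds to the
+high bits "10" followed by two direct "y" bits and four "z" bits decoded with
+AlignDecoder, so the zero-based distance is in the range from 128 to 191.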
+ +If (posSlot < 4), the "dist" value is equal to posSlot value. + +If (posSlot >= 4), the decoder uses "posSlot" value to calculate the value of + the high bits of "dist" value and the number of the low bits. + + If (4 <= posSlot < kEndPosModelIndex), the decoder uses bit tree decoders. + (one separated bit tree decoder per one posSlot value) and "Reverse" scheme. + In this implementation we use one CProb array "PosDecoders" that contains + all CProb variables for all these bit decoders. + + if (posSlot >= kEndPosModelIndex), the middle bits are decoded as direct + bits from RangeDecoder and the low 4 bits are decoded with a bit tree + decoder "AlignDecoder" with "Reverse" scheme. + +The code to decode zero-based match distance: + + unsigned DecodeDistance(unsigned len) + { + unsigned lenState = len; + if (lenState > kNumLenToPosStates - 1) + lenState = kNumLenToPosStates - 1; + + unsigned posSlot = PosSlotDecoder[lenState].Decode(&RangeDec); + if (posSlot < 4) + return posSlot; + + unsigned numDirectBits = (unsigned)((posSlot >> 1) - 1); + UInt32 dist = ((2 | (posSlot & 1)) << numDirectBits); + if (posSlot < kEndPosModelIndex) + dist += BitTreeReverseDecode(PosDecoders + dist - posSlot, numDirectBits, &RangeDec); + else + { + dist += RangeDec.DecodeDirectBits(numDirectBits - kNumAlignBits) << kNumAlignBits; + dist += AlignDecoder.ReverseDecode(&RangeDec); + } + return dist; + } + + + +LZMA Decoding modes +------------------- + +There are 2 types of LZMA streams: + +1) The stream with "End of stream" marker. +2) The stream without "End of stream" marker. + +And the LZMA Decoder supports 3 modes of decoding: + +1) The unpack size is undefined. The LZMA decoder stops decoding after + getting "End of stream" marker. + The input variables for that case: + + markerIsMandatory = true + unpackSizeDefined = false + unpackSize contains any value + +2) The unpack size is defined and LZMA decoder supports both variants, + where the stream can contain "End of stream" marker or the stream is + finished without "End of stream" marker. The LZMA decoder must detect + any of these situations. + The input variables for that case: + + markerIsMandatory = false + unpackSizeDefined = true + unpackSize contains unpack size + +3) The unpack size is defined and the LZMA stream must contain + "End of stream" marker + The input variables for that case: + + markerIsMandatory = true + unpackSizeDefined = true + unpackSize contains unpack size + + +The main loop of decoder +------------------------ + +The main loop of LZMA decoder: + +Initialize the LZMA state. +loop +{ + // begin of loop + Check "end of stream" conditions. + Decode Type of MATCH / LITERAL. + If it's LITERAL, decode LITERAL value and put the LITERAL to Window. + If it's MATCH, decode the length of match and the match distance. + Check error conditions, check end of stream conditions and copy + the sequence of match bytes from sliding window to current position + in window. + Go to begin of loop +} + +The reference implementation of LZMA decoder uses "unpackSize" variable +to keep the number of remaining bytes in output stream. So it reduces +"unpackSize" value after each decoded LITERAL or MATCH. + +The following code contains the "end of stream" condition check at the start +of the loop: + + if (unpackSizeDefined && unpackSize == 0 && !markerIsMandatory) + if (RangeDec.IsFinishedOK()) + return LZMA_RES_FINISHED_WITHOUT_MARKER; + +LZMA uses three types of matches: + +1) "Simple Match" - the match with distance value encoded with bit models. 
+ +2) "Rep Match" - the match that uses the distance from distance + history table. + +3) "Short Rep Match" - the match of single byte length, that uses the latest + distance from distance history table. + +The LZMA decoder keeps the history of latest 4 match distances that were used +by decoder. That set of 4 variables contains zero-based match distances and +these variables are initialized with zero values: + + UInt32 rep0 = 0, rep1 = 0, rep2 = 0, rep3 = 0; + +The LZMA decoder uses binary model variables to select type of MATCH or LITERAL: + +#define kNumStates 12 +#define kNumPosBitsMax 4 + + CProb IsMatch[kNumStates << kNumPosBitsMax]; + CProb IsRep[kNumStates]; + CProb IsRepG0[kNumStates]; + CProb IsRepG1[kNumStates]; + CProb IsRepG2[kNumStates]; + CProb IsRep0Long[kNumStates << kNumPosBitsMax]; + +The decoder uses "state" variable value to select exact variable +from "IsRep", "IsRepG0", "IsRepG1" and "IsRepG2" arrays. +The "state" variable can get the value from 0 to 11. +Initial value for "state" variable is zero: + + unsigned state = 0; + +The "state" variable is updated after each LITERAL or MATCH with one of the +following functions: + +unsigned UpdateState_Literal(unsigned state) +{ + if (state < 4) return 0; + else if (state < 10) return state - 3; + else return state - 6; +} +unsigned UpdateState_Match (unsigned state) { return state < 7 ? 7 : 10; } +unsigned UpdateState_Rep (unsigned state) { return state < 7 ? 8 : 11; } +unsigned UpdateState_ShortRep(unsigned state) { return state < 7 ? 9 : 11; } + +The decoder calculates "state2" variable value to select exact variable from +"IsMatch" and "IsRep0Long" arrays: + +unsigned posState = OutWindow.TotalPos & ((1 << pb) - 1); +unsigned state2 = (state << kNumPosBitsMax) + posState; + +The decoder uses the following code flow scheme to select exact +type of LITERAL or MATCH: + +IsMatch[state2] decode + 0 - the Literal + 1 - the Match + IsRep[state] decode + 0 - Simple Match + 1 - Rep Match + IsRepG0[state] decode + 0 - the distance is rep0 + IsRep0Long[state2] decode + 0 - Short Rep Match + 1 - Rep Match 0 + 1 - + IsRepG1[state] decode + 0 - Rep Match 1 + 1 - + IsRepG2[state] decode + 0 - Rep Match 2 + 1 - Rep Match 3 + + +LITERAL symbol +-------------- +If the value "0" was decoded with IsMatch[state2] decoding, we have "LITERAL" type. + +At first the LZMA decoder must check that it doesn't exceed +specified uncompressed size: + + if (unpackSizeDefined && unpackSize == 0) + return LZMA_RES_ERROR; + +Then it decodes literal value and puts it to sliding window: + + DecodeLiteral(state, rep0); + +Then the decoder must update the "state" value and "unpackSize" value; + + state = UpdateState_Literal(state); + unpackSize--; + +Then the decoder must go to the begin of main loop to decode next Match or Literal. + + +Simple Match +------------ + +If the value "1" was decoded with IsMatch[state2] decoding, +we have the "Simple Match" type. + +The distance history table is updated with the following scheme: + + rep3 = rep2; + rep2 = rep1; + rep1 = rep0; + +The zero-based length is decoded with "LenDecoder": + + len = LenDecoder.Decode(&RangeDec, posState); + +The state is update with UpdateState_Match function: + + state = UpdateState_Match(state); + +and the new "rep0" value is decoded with DecodeDistance: + + rep0 = DecodeDistance(len); + +That "rep0" will be used as zero-based distance for current match. 
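+
+Taken together, the "Simple Match" steps above can be written as the following
+fragment of the main loop (a summary sketch only; the checks described below
+must still follow it):
+
+  rep3 = rep2;
+  rep2 = rep1;
+  rep1 = rep0;
+  len = LenDecoder.Decode(&RangeDec, posState);
+  state = UpdateState_Match(state);
+  rep0 = DecodeDistance(len);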
+ +If the value of "rep0" is equal to 0xFFFFFFFF, it means that we have +"End of stream" marker, so we can stop decoding and check finishing +condition in Range Decoder: + + if (rep0 == 0xFFFFFFFF) + return RangeDec.IsFinishedOK() ? + LZMA_RES_FINISHED_WITH_MARKER : + LZMA_RES_ERROR; + +If uncompressed size is defined, LZMA decoder must check that it doesn't +exceed that specified uncompressed size: + + if (unpackSizeDefined && unpackSize == 0) + return LZMA_RES_ERROR; + +Also the decoder must check that "rep0" value is not larger than dictionary size +and is not larger than the number of already decoded bytes: + + if (rep0 >= dictSize || !OutWindow.CheckDistance(rep0)) + return LZMA_RES_ERROR; + +Then the decoder must copy match bytes as described in +"The match symbols copying" section. + + +Rep Match +--------- + +If the LZMA decoder has decoded the value "1" with IsRep[state] variable, +we have "Rep Match" type. + +At first the LZMA decoder must check that it doesn't exceed +specified uncompressed size: + + if (unpackSizeDefined && unpackSize == 0) + return LZMA_RES_ERROR; + +Also the decoder must return error, if the LZ window is empty: + + if (OutWindow.IsEmpty()) + return LZMA_RES_ERROR; + +If the match type is "Rep Match", the decoder uses one of the 4 variables of +distance history table to get the value of distance for current match. +And there are 4 corresponding ways of decoding flow. + +The decoder updates the distance history with the following scheme +depending from type of match: + +- "Rep Match 0" or "Short Rep Match": + ; LZMA doesn't update the distance history + +- "Rep Match 1": + UInt32 dist = rep1; + rep1 = rep0; + rep0 = dist; + +- "Rep Match 2": + UInt32 dist = rep2; + rep2 = rep1; + rep1 = rep0; + rep0 = dist; + +- "Rep Match 3": + UInt32 dist = rep3; + rep3 = rep2; + rep2 = rep1; + rep1 = rep0; + rep0 = dist; + +Then the decoder decodes exact subtype of "Rep Match" using "IsRepG0", "IsRep0Long", +"IsRepG1", "IsRepG2". + +If the subtype is "Short Rep Match", the decoder updates the state, puts +the one byte from window to current position in window and goes to next +MATCH/LITERAL symbol (the begin of main loop): + + state = UpdateState_ShortRep(state); + OutWindow.PutByte(OutWindow.GetByte(rep0 + 1)); + unpackSize--; + continue; + +In other cases (Rep Match 0/1/2/3), it decodes the zero-based +length of match with "RepLenDecoder" decoder: + + len = RepLenDecoder.Decode(&RangeDec, posState); + +Then it updates the state: + + state = UpdateState_Rep(state); + +Then the decoder must copy match bytes as described in +"The Match symbols copying" section. + + +The match symbols copying +------------------------- + +If we have the match (Simple Match or Rep Match 0/1/2/3), the decoder must +copy the sequence of bytes with calculated match distance and match length. +If uncompressed size is defined, LZMA decoder must check that it doesn't +exceed that specified uncompressed size: + + len += kMatchMinLen; + bool isError = false; + if (unpackSizeDefined && unpackSize < len) + { + len = (unsigned)unpackSize; + isError = true; + } + OutWindow.CopyMatch(rep0 + 1, len); + unpackSize -= len; + if (isError) + return LZMA_RES_ERROR; + +Then the decoder must go to the begin of main loop to decode next MATCH or LITERAL. + + + +NOTES +----- + +This specification doesn't describe the variant of decoder implementation +that supports partial decoding. Such partial decoding case can require some +changes in "end of stream" condition checks code. 
Also such code +can use additional status codes, returned by decoder. + +This specification uses C++ code with templates to simplify describing. +The optimized version of LZMA decoder doesn't need templates. +Such optimized version can use just two arrays of CProb variables: + 1) The dynamic array of CProb variables allocated for the Literal Decoder. + 2) The one common array that contains all other CProb variables. + + +References: + +1. G. N. N. Martin, Range encoding: an algorithm for removing redundancy + from a digitized message, Video & Data Recording Conference, + Southampton, UK, July 24-27, 1979. diff --git a/src/sdk/DOC/lzma.txt b/src/sdk/DOC/lzma.txt index a65988f..62cf094 100644 --- a/src/sdk/DOC/lzma.txt +++ b/src/sdk/DOC/lzma.txt @@ -1,328 +1,345 @@ -LZMA compression ----------------- -Version: 9.35 - -This file describes LZMA encoding and decoding functions written in C language. - -LZMA is an improved version of famous LZ77 compression algorithm. -It was improved in way of maximum increasing of compression ratio, -keeping high decompression speed and low memory requirements for -decompressing. - -Note: you can read also LZMA Specification (lzma-specification.txt from LZMA SDK) - -Also you can look source code for LZMA encoding and decoding: - C/Util/Lzma/LzmaUtil.c - - -LZMA compressed file format ---------------------------- -Offset Size Description - 0 1 Special LZMA properties (lc,lp, pb in encoded form) - 1 4 Dictionary size (little endian) - 5 8 Uncompressed size (little endian). -1 means unknown size - 13 Compressed data - - - -ANSI-C LZMA Decoder -~~~~~~~~~~~~~~~~~~~ - -Please note that interfaces for ANSI-C code were changed in LZMA SDK 4.58. -If you want to use old interfaces you can download previous version of LZMA SDK -from sourceforge.net site. - -To use ANSI-C LZMA Decoder you need the following files: -1) LzmaDec.h + LzmaDec.c + 7zTypes.h + Precomp.h + Compiler.h - -Look example code: - C/Util/Lzma/LzmaUtil.c - - -Memory requirements for LZMA decoding -------------------------------------- - -Stack usage of LZMA decoding function for local variables is not -larger than 200-400 bytes. - -LZMA Decoder uses dictionary buffer and internal state structure. -Internal state structure consumes - state_size = (4 + (1.5 << (lc + lp))) KB -by default (lc=3, lp=0), state_size = 16 KB. - - -How To decompress data ----------------------- - -LZMA Decoder (ANSI-C version) now supports 2 interfaces: -1) Single-call Decompressing -2) Multi-call State Decompressing (zlib-like interface) - -You must use external allocator: -Example: -void *SzAlloc(void *p, size_t size) { p = p; return malloc(size); } -void SzFree(void *p, void *address) { p = p; free(address); } -ISzAlloc alloc = { SzAlloc, SzFree }; - -You can use p = p; operator to disable compiler warnings. 
- - -Single-call Decompressing -------------------------- -When to use: RAM->RAM decompressing -Compile files: LzmaDec.h + LzmaDec.c + 7zTypes.h -Compile defines: no defines -Memory Requirements: - - Input buffer: compressed size - - Output buffer: uncompressed size - - LZMA Internal Structures: state_size (16 KB for default settings) - -Interface: - int LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, - const Byte *propData, unsigned propSize, ELzmaFinishMode finishMode, - ELzmaStatus *status, ISzAlloc *alloc); - In: - dest - output data - destLen - output data size - src - input data - srcLen - input data size - propData - LZMA properties (5 bytes) - propSize - size of propData buffer (5 bytes) - finishMode - It has meaning only if the decoding reaches output limit (*destLen). - LZMA_FINISH_ANY - Decode just destLen bytes. - LZMA_FINISH_END - Stream must be finished after (*destLen). - You can use LZMA_FINISH_END, when you know that - current output buffer covers last bytes of stream. - alloc - Memory allocator. - - Out: - destLen - processed output size - srcLen - processed input size - - Output: - SZ_OK - status: - LZMA_STATUS_FINISHED_WITH_MARK - LZMA_STATUS_NOT_FINISHED - LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK - SZ_ERROR_DATA - Data error - SZ_ERROR_MEM - Memory allocation error - SZ_ERROR_UNSUPPORTED - Unsupported properties - SZ_ERROR_INPUT_EOF - It needs more bytes in input buffer (src). - - If LZMA decoder sees end_marker before reaching output limit, it returns OK result, - and output value of destLen will be less than output buffer size limit. - - You can use multiple checks to test data integrity after full decompression: - 1) Check Result and "status" variable. - 2) Check that output(destLen) = uncompressedSize, if you know real uncompressedSize. - 3) Check that output(srcLen) = compressedSize, if you know real compressedSize. - You must use correct finish mode in that case. */ - - -Multi-call State Decompressing (zlib-like interface) ----------------------------------------------------- - -When to use: file->file decompressing -Compile files: LzmaDec.h + LzmaDec.c + 7zTypes.h - -Memory Requirements: - - Buffer for input stream: any size (for example, 16 KB) - - Buffer for output stream: any size (for example, 16 KB) - - LZMA Internal Structures: state_size (16 KB for default settings) - - LZMA dictionary (dictionary size is encoded in LZMA properties header) - -1) read LZMA properties (5 bytes) and uncompressed size (8 bytes, little-endian) to header: - unsigned char header[LZMA_PROPS_SIZE + 8]; - ReadFile(inFile, header, sizeof(header) - -2) Allocate CLzmaDec structures (state + dictionary) using LZMA properties - - CLzmaDec state; - LzmaDec_Constr(&state); - res = LzmaDec_Allocate(&state, header, LZMA_PROPS_SIZE, &g_Alloc); - if (res != SZ_OK) - return res; - -3) Init LzmaDec structure before any new LZMA stream. And call LzmaDec_DecodeToBuf in loop - - LzmaDec_Init(&state); - for (;;) - { - ... - int res = LzmaDec_DecodeToBuf(CLzmaDec *p, Byte *dest, SizeT *destLen, - const Byte *src, SizeT *srcLen, ELzmaFinishMode finishMode); - ... 
- } - - -4) Free all allocated structures - LzmaDec_Free(&state, &g_Alloc); - -Look example code: - C/Util/Lzma/LzmaUtil.c - - -How To compress data --------------------- - -Compile files: - 7zTypes.h - Threads.h - LzmaEnc.h - LzmaEnc.c - LzFind.h - LzFind.c - LzFindMt.h - LzFindMt.c - LzHash.h - -Memory Requirements: - - (dictSize * 11.5 + 6 MB) + state_size - -Lzma Encoder can use two memory allocators: -1) alloc - for small arrays. -2) allocBig - for big arrays. - -For example, you can use Large RAM Pages (2 MB) in allocBig allocator for -better compression speed. Note that Windows has bad implementation for -Large RAM Pages. -It's OK to use same allocator for alloc and allocBig. - - -Single-call Compression with callbacks --------------------------------------- - -Look example code: - C/Util/Lzma/LzmaUtil.c - -When to use: file->file compressing - -1) you must implement callback structures for interfaces: -ISeqInStream -ISeqOutStream -ICompressProgress -ISzAlloc - -static void *SzAlloc(void *p, size_t size) { p = p; return MyAlloc(size); } -static void SzFree(void *p, void *address) { p = p; MyFree(address); } -static ISzAlloc g_Alloc = { SzAlloc, SzFree }; - - CFileSeqInStream inStream; - CFileSeqOutStream outStream; - - inStream.funcTable.Read = MyRead; - inStream.file = inFile; - outStream.funcTable.Write = MyWrite; - outStream.file = outFile; - - -2) Create CLzmaEncHandle object; - - CLzmaEncHandle enc; - - enc = LzmaEnc_Create(&g_Alloc); - if (enc == 0) - return SZ_ERROR_MEM; - - -3) initialize CLzmaEncProps properties; - - LzmaEncProps_Init(&props); - - Then you can change some properties in that structure. - -4) Send LZMA properties to LZMA Encoder - - res = LzmaEnc_SetProps(enc, &props); - -5) Write encoded properties to header - - Byte header[LZMA_PROPS_SIZE + 8]; - size_t headerSize = LZMA_PROPS_SIZE; - UInt64 fileSize; - int i; - - res = LzmaEnc_WriteProperties(enc, header, &headerSize); - fileSize = MyGetFileLength(inFile); - for (i = 0; i < 8; i++) - header[headerSize++] = (Byte)(fileSize >> (8 * i)); - MyWriteFileAndCheck(outFile, header, headerSize) - -6) Call encoding function: - res = LzmaEnc_Encode(enc, &outStream.funcTable, &inStream.funcTable, - NULL, &g_Alloc, &g_Alloc); - -7) Destroy LZMA Encoder Object - LzmaEnc_Destroy(enc, &g_Alloc, &g_Alloc); - - -If callback function return some error code, LzmaEnc_Encode also returns that code -or it can return the code like SZ_ERROR_READ, SZ_ERROR_WRITE or SZ_ERROR_PROGRESS. - - -Single-call RAM->RAM Compression --------------------------------- - -Single-call RAM->RAM Compression is similar to Compression with callbacks, -but you provide pointers to buffers instead of pointers to stream callbacks: - -SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, - const CLzmaEncProps *props, Byte *propsEncoded, SizeT *propsSize, int writeEndMark, - ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig); - -Return code: - SZ_OK - OK - SZ_ERROR_MEM - Memory allocation error - SZ_ERROR_PARAM - Incorrect paramater - SZ_ERROR_OUTPUT_EOF - output buffer overflow - SZ_ERROR_THREAD - errors in multithreading functions (only for Mt version) - - - -Defines -------- - -_LZMA_SIZE_OPT - Enable some optimizations in LZMA Decoder to get smaller executable code. - -_LZMA_PROB32 - It can increase the speed on some 32-bit CPUs, but memory usage for - some structures will be doubled in that case. - -_LZMA_UINT32_IS_ULONG - Define it if int is 16-bit on your compiler and long is 32-bit. 
- -_LZMA_NO_SYSTEM_SIZE_T - Define it if you don't want to use size_t type. - - -_7ZIP_PPMD_SUPPPORT - Define it if you don't want to support PPMD method in AMSI-C .7z decoder. - - -C++ LZMA Encoder/Decoder -~~~~~~~~~~~~~~~~~~~~~~~~ -C++ LZMA code use COM-like interfaces. So if you want to use it, -you can study basics of COM/OLE. -C++ LZMA code is just wrapper over ANSI-C code. - - -C++ Notes -~~~~~~~~~~~~~~~~~~~~~~~~ -If you use some C++ code folders in 7-Zip (for example, C++ code for .7z handling), -you must check that you correctly work with "new" operator. -7-Zip can be compiled with MSVC 6.0 that doesn't throw "exception" from "new" operator. -So 7-Zip uses "CPP\Common\NewHandler.cpp" that redefines "new" operator: -operator new(size_t size) -{ - void *p = ::malloc(size); - if (p == 0) - throw CNewException(); - return p; -} -If you use MSCV that throws exception for "new" operator, you can compile without -"NewHandler.cpp". So standard exception will be used. Actually some code of -7-Zip catches any exception in internal code and converts it to HRESULT code. -So you don't need to catch CNewException, if you call COM interfaces of 7-Zip. - ---- - -http://www.7-zip.org -http://www.7-zip.org/sdk.html -http://www.7-zip.org/support.html +LZMA compression +---------------- +Version: 24.07 + +This file describes LZMA encoding and decoding functions written in C language. + +LZMA is an improved version of famous LZ77 compression algorithm. +It was improved in way of maximum increasing of compression ratio, +keeping high decompression speed and low memory requirements for +decompressing. + +Note: you can read also LZMA Specification (lzma-specification.txt from LZMA SDK) + +Also you can look source code for LZMA encoding and decoding: + C/Util/Lzma/LzmaUtil.c + + +LZMA compressed file format +--------------------------- +Offset Size Description + 0 1 Special LZMA properties (lc,lp, pb in encoded form) + 1 4 Dictionary size (little endian) + 5 8 Uncompressed size (little endian). -1 means unknown size + 13 Compressed data + + + +ANSI-C LZMA Decoder +~~~~~~~~~~~~~~~~~~~ + +Please note that interfaces for ANSI-C code were changed in LZMA SDK 4.58. +If you want to use old interfaces you can download previous version of LZMA SDK +from sourceforge.net site. + +To use ANSI-C LZMA Decoder you need the following files: +1) LzmaDec.h + LzmaDec.c + 7zTypes.h + Precomp.h + Compiler.h + +Look example code: + C/Util/Lzma/LzmaUtil.c + + +Memory requirements for LZMA decoding +------------------------------------- + +Stack usage of LZMA decoding function for local variables is not +larger than 200-400 bytes. + +LZMA Decoder uses dictionary buffer and internal state structure. +Internal state structure consumes + state_size = (4 + (1.5 << (lc + lp))) KB +by default (lc=3, lp=0), state_size = 16 KB. + + +How To decompress data +---------------------- + +LZMA Decoder (ANSI-C version) now supports 2 interfaces: +1) Single-call Decompressing +2) Multi-call State Decompressing (zlib-like interface) + +You must use external allocator: +Example: +void *SzAlloc(void *p, size_t size) { p = p; return malloc(size); } +void SzFree(void *p, void *address) { p = p; free(address); } +ISzAlloc alloc = { SzAlloc, SzFree }; + +You can use p = p; operator to disable compiler warnings. 
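+
+Note: the allocator example above uses the old callback prototypes. Current
+versions of 7zTypes.h pass an ISzAllocPtr as the first argument, so with the
+headers of this SDK version the same allocator is usually written as follows
+(a sketch; check 7zTypes.h for the exact prototypes):
+
+static void *SzAlloc(ISzAllocPtr p, size_t size) { (void)p; return malloc(size); }
+static void SzFree(ISzAllocPtr p, void *address) { (void)p; free(address); }
+static const ISzAlloc alloc = { SzAlloc, SzFree };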
+ + +Single-call Decompressing +------------------------- +When to use: RAM->RAM decompressing +Compile files: LzmaDec.h + LzmaDec.c + 7zTypes.h +Compile defines: no defines +Memory Requirements: + - Input buffer: compressed size + - Output buffer: uncompressed size + - LZMA Internal Structures: state_size (16 KB for default settings) + +Interface: + int LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen, + const Byte *propData, unsigned propSize, ELzmaFinishMode finishMode, + ELzmaStatus *status, ISzAlloc *alloc); + In: + dest - output data + destLen - output data size + src - input data + srcLen - input data size + propData - LZMA properties (5 bytes) + propSize - size of propData buffer (5 bytes) + finishMode - It has meaning only if the decoding reaches output limit (*destLen). + LZMA_FINISH_ANY - Decode just destLen bytes. + LZMA_FINISH_END - Stream must be finished after (*destLen). + You can use LZMA_FINISH_END, when you know that + current output buffer covers last bytes of stream. + alloc - Memory allocator. + + Out: + destLen - processed output size + srcLen - processed input size + + Output: + SZ_OK + status: + LZMA_STATUS_FINISHED_WITH_MARK + LZMA_STATUS_NOT_FINISHED + LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK + SZ_ERROR_DATA - Data error + SZ_ERROR_MEM - Memory allocation error + SZ_ERROR_UNSUPPORTED - Unsupported properties + SZ_ERROR_INPUT_EOF - It needs more bytes in input buffer (src). + + If LZMA decoder sees end_marker before reaching output limit, it returns OK result, + and output value of destLen will be less than output buffer size limit. + + You can use multiple checks to test data integrity after full decompression: + 1) Check Result and "status" variable. + 2) Check that output(destLen) = uncompressedSize, if you know real uncompressedSize. + 3) Check that output(srcLen) = compressedSize, if you know real compressedSize. + You must use correct finish mode in that case. */ + + +Multi-call State Decompressing (zlib-like interface) +---------------------------------------------------- + +When to use: file->file decompressing +Compile files: LzmaDec.h + LzmaDec.c + 7zTypes.h + +Memory Requirements: + - Buffer for input stream: any size (for example, 16 KB) + - Buffer for output stream: any size (for example, 16 KB) + - LZMA Internal Structures: state_size (16 KB for default settings) + - LZMA dictionary (dictionary size is encoded in LZMA properties header) + +1) read LZMA properties (5 bytes) and uncompressed size (8 bytes, little-endian) to header: + unsigned char header[LZMA_PROPS_SIZE + 8]; + ReadFile(inFile, header, sizeof(header) + +2) Allocate CLzmaDec structures (state + dictionary) using LZMA properties + + CLzmaDec state; + LzmaDec_Constr(&state); + res = LzmaDec_Allocate(&state, header, LZMA_PROPS_SIZE, &g_Alloc); + if (res != SZ_OK) + return res; + +3) Init LzmaDec structure before any new LZMA stream. And call LzmaDec_DecodeToBuf in loop + + LzmaDec_Init(&state); + for (;;) + { + ... + int res = LzmaDec_DecodeToBuf(CLzmaDec *p, Byte *dest, SizeT *destLen, + const Byte *src, SizeT *srcLen, ELzmaFinishMode finishMode); + ... 
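+
+A minimal usage sketch (error handling is shortened; "dest", "src", the sizes
+and the 5 properties bytes in "propData" are assumed to be prepared by the
+caller, and "alloc" is an allocator as in the example above):
+
+  ELzmaStatus status;
+  SizeT destLen = unpackedSize;
+  SizeT srcLen = compressedSize;
+  SRes res = LzmaDecode(dest, &destLen, src, &srcLen,
+      propData, LZMA_PROPS_SIZE, LZMA_FINISH_END, &status, &alloc);
+  if (res != SZ_OK || destLen != unpackedSize)
+    return SZ_ERROR_DATA;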
+  }
+
+
+4) Free all allocated structures:
+  LzmaDec_Free(&state, &g_Alloc);
+
+See the example code:
+  C/Util/Lzma/LzmaUtil.c
+
+
+How To compress data
+--------------------
+
+Compile files:
+  7zTypes.h
+  Threads.h
+  Threads.c
+  LzmaEnc.h
+  LzmaEnc.c
+  LzFind.h
+  LzFind.c
+  LzFindMt.h
+  LzFindMt.c
+  LzFindOpt.c
+  LzHash.h
+
+Memory Requirements:
+  - (dictSize * 11.5 + 6 MB) + state_size
+
+The LZMA Encoder can use two memory allocators:
+1) alloc - for small arrays.
+2) allocBig - for big arrays.
+
+For example, you can use large RAM pages (2 MB) in the allocBig allocator for
+better compression speed. Note that Windows has a poor implementation of
+large RAM pages.
+It's OK to use the same allocator for alloc and allocBig.
+
+
+Single-call Compression with callbacks
+--------------------------------------
+
+See the example code:
+  C/Util/Lzma/LzmaUtil.c
+
+When to use: file->file compressing
+
+1) You must implement callback structures for the interfaces:
+ISeqInStream
+ISeqOutStream
+ICompressProgress
+ISzAlloc
+
+static void *SzAlloc(void *p, size_t size) { p = p; return MyAlloc(size); }
+static void SzFree(void *p, void *address) { p = p; MyFree(address); }
+static ISzAlloc g_Alloc = { SzAlloc, SzFree };
+
+  CFileSeqInStream inStream;
+  CFileSeqOutStream outStream;
+
+  inStream.funcTable.Read = MyRead;
+  inStream.file = inFile;
+  outStream.funcTable.Write = MyWrite;
+  outStream.file = outFile;
+
+
+2) Create the CLzmaEncHandle object:
+
+  CLzmaEncHandle enc;
+
+  enc = LzmaEnc_Create(&g_Alloc);
+  if (enc == 0)
+    return SZ_ERROR_MEM;
+
+
+3) Initialize the CLzmaEncProps properties:
+
+  CLzmaEncProps props;
+  LzmaEncProps_Init(&props);
+
+  Then you can change some properties in that structure.
+
+4) Send the LZMA properties to the LZMA Encoder:
+
+  res = LzmaEnc_SetProps(enc, &props);
+
+5) Write the encoded properties to the header:
+
+  Byte header[LZMA_PROPS_SIZE + 8];
+  size_t headerSize = LZMA_PROPS_SIZE;
+  UInt64 fileSize;
+  int i;
+
+  res = LzmaEnc_WriteProperties(enc, header, &headerSize);
+  fileSize = MyGetFileLength(inFile);
+  for (i = 0; i < 8; i++)
+    header[headerSize++] = (Byte)(fileSize >> (8 * i));
+  MyWriteFileAndCheck(outFile, header, headerSize);
+
+6) Call the encoding function:
+
+  res = LzmaEnc_Encode(enc, &outStream.funcTable, &inStream.funcTable,
+      NULL, &g_Alloc, &g_Alloc);
+
+7) Destroy the LZMA Encoder object:
+
+  LzmaEnc_Destroy(enc, &g_Alloc, &g_Alloc);
+
+
+If a callback function returns an error code, LzmaEnc_Encode also returns that code,
+or it can return a code like SZ_ERROR_READ, SZ_ERROR_WRITE or SZ_ERROR_PROGRESS.
+
+
+Single-call RAM->RAM Compression
+--------------------------------
+
+Single-call RAM->RAM compression is similar to compression with callbacks,
+but you provide pointers to buffers instead of pointers to stream callbacks:
+
+SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen,
+    const CLzmaEncProps *props, Byte *propsEncoded, SizeT *propsSize, int writeEndMark,
+    ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig);
+
+Return code:
+  SZ_OK               - OK
+  SZ_ERROR_MEM        - Memory allocation error
+  SZ_ERROR_PARAM      - Incorrect parameter
+  SZ_ERROR_OUTPUT_EOF - Output buffer overflow
+  SZ_ERROR_THREAD     - Errors in multithreading functions (only for the MT version)
+
+
+
+Defines
+-------
+
+Z7_LZMA_SIZE_OPT - Enable some code size optimizations in the LZMA Decoder to get a
+                   smaller executable.
+
+Z7_LZMA_PROB32 - It can increase the speed on some 32-bit CPUs, but memory usage for
+                 some structures will be doubled in that case.
+
+Z7_DECL_Int32_AS_long - Define it if int is 16-bit on your compiler and long is 32-bit.
+
+Z7_DECL_SizeT_AS_unsigned_int - Define it if you don't want to use the size_t type.
+
+
+Defines for 7z decoder written in C
+-----------------------------------
+These defines are for 7zDec.c only (the decoder in C).
+The C++ 7z decoder doesn't use these macros.
+
+Z7_PPMD_SUPPORT - Define it if you need PPMD method support.
+Z7_NO_METHODS_FILTERS - Do not use filters (except for the BCJ2 filter).
+Z7_USE_NATIVE_BRANCH_FILTER - Use the filter for the native ISA:
+        use the x86 filter if compiled to an x86 executable,
+        use the arm64 filter if compiled to an arm64 executable.
+
+
+C++ LZMA Encoder/Decoder
+~~~~~~~~~~~~~~~~~~~~~~~~
+The C++ LZMA code uses COM-like interfaces, so if you want to use it,
+you should study the basics of COM/OLE.
+The C++ LZMA code is just a wrapper over the ANSI-C code.
+
+
+C++ Notes
+~~~~~~~~~~~~~~~~~~~~~~~~
+If you use some of the C++ code folders in 7-Zip (for example, the C++ code for
+7z archive handling), you must check that you work correctly with the "new" operator.
+7-Zip can be compiled with MSVC 6.0, which doesn't throw an exception from the "new" operator.
+So 7-Zip uses "CPP\Common\NewHandler.cpp", which redefines the "new" operator
+when compiled with old MSVC compilers (before VS 2010):
+
+void *operator new(size_t size)
+{
+  void *p = ::malloc(size);
+  if (!p)
+    throw CNewException();
+  return p;
+}
+
+If the compiler is VS 2010 or newer, NewHandler.cpp doesn't redefine the "new" operator.
+So if you use a new compiler (VS 2010 or newer), you can still include "NewHandler.cpp"
+in the compilation, and it will not redefine operator new.
+You can also compile without "NewHandler.cpp" with new compilers.
+If 7-Zip doesn't redefine operator "new", the standard exception will be used instead
+of CNewException.
+Some 7-Zip code catches any exception in internal code and converts it to an HRESULT
+code, so you don't need to catch CNewException if you call the COM interfaces of 7-Zip.
+
+---
+
+http://www.7-zip.org
+http://www.7-zip.org/sdk.html
+http://www.7-zip.org/support.html