diff --git a/.github/workflows/main-jdk17-build.yml b/.github/workflows/main-jdk17-build.yml index 5dd7d568bcc..73005b208d0 100644 --- a/.github/workflows/main-jdk17-build.yml +++ b/.github/workflows/main-jdk17-build.yml @@ -20,8 +20,12 @@ name: main jdk17 build on: pull_request: branches: [ main ] + paths-ignore: + - 'docs/**' push: branches: [ main ] + paths-ignore: + - 'docs/**' jobs: build: diff --git a/.github/workflows/main-jdk17-windows-build-multi-locale.yml b/.github/workflows/main-jdk17-windows-build-multi-locale.yml index c545dfeb574..cd07d76ec37 100644 --- a/.github/workflows/main-jdk17-windows-build-multi-locale.yml +++ b/.github/workflows/main-jdk17-windows-build-multi-locale.yml @@ -20,8 +20,12 @@ name: main jdk17 windows build (multi-locale) on: pull_request: branches: [ main ] + paths-ignore: + - 'docs/**' push: branches: [ main ] + paths-ignore: + - 'docs/**' jobs: build: diff --git a/.github/workflows/main-jdk17-windows-build.yml b/.github/workflows/main-jdk17-windows-build.yml index 49f14377b61..26a288043ff 100644 --- a/.github/workflows/main-jdk17-windows-build.yml +++ b/.github/workflows/main-jdk17-windows-build.yml @@ -20,8 +20,12 @@ name: main jdk17 windows build on: pull_request: branches: [ main ] + paths-ignore: + - 'docs/**' push: branches: [ main ] + paths-ignore: + - 'docs/**' jobs: build: diff --git a/.github/workflows/main-jdk21-build.yml b/.github/workflows/main-jdk21-build.yml index 3d219166809..02b970716ef 100644 --- a/.github/workflows/main-jdk21-build.yml +++ b/.github/workflows/main-jdk21-build.yml @@ -20,6 +20,8 @@ name: main jdk21 build on: push: branches: [ main ] + paths-ignore: + - 'docs/**' jobs: build: diff --git a/.github/workflows/main-jdk25-build.yml b/.github/workflows/main-jdk25-build.yml index 2109aea8f6e..20dbcde48c4 100644 --- a/.github/workflows/main-jdk25-build.yml +++ b/.github/workflows/main-jdk25-build.yml @@ -20,6 +20,8 @@ name: main jdk25 build on: push: branches: [ main ] + paths-ignore: + - 'docs/**' 
jobs: build: diff --git a/docs/pom.xml b/docs/pom.xml new file mode 100644 index 00000000000..040f0cc66c3 --- /dev/null +++ b/docs/pom.xml @@ -0,0 +1,96 @@ + + + + 4.0.0 + + org.apache.tika + tika + 4.0.0-SNAPSHOT + + + tika-docs + pom + Apache Tika Documentation + + + + 3.2.3 + + + + + + org.asciidoctor + asciidoctor-maven-plugin + 3.2.0 + + + output-html + generate-resources + + process-asciidoc + + + article + + coderay + + false + font + ${tika.stable.version} + + ${project.basedir}/../tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples + ${project.basedir}/../tika-server/tika-server-core/src/test/resources/config-examples + ${project.basedir}/../tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples + + + + + + src/main/asciidoc + true + + + + + + maven-assembly-plugin + + + make-docs-archive + package + + single + + + + src/assembly/docs.xml + + ${project.artifactId}-${project.version} + + + + + + + + diff --git a/docs/src/assembly/docs.xml b/docs/src/assembly/docs.xml new file mode 100644 index 00000000000..5a4b5c57467 --- /dev/null +++ b/docs/src/assembly/docs.xml @@ -0,0 +1,37 @@ + + + docs + + tar.gz + + false + + + ${project.build.directory}/generated-docs + / + + **/* + + + + diff --git a/docs/src/main/asciidoc/advanced/index.adoc b/docs/src/main/asciidoc/advanced/index.adoc new file mode 100644 index 00000000000..f8350c86b8f --- /dev/null +++ b/docs/src/main/asciidoc/advanced/index.adoc @@ -0,0 +1,31 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Advanced Topics + +This section covers advanced usage and internals of Apache Tika. + +== Topics + +* xref:robustness.adoc[Robustness] - Process isolation and fault tolerance when parsing untrusted content +* xref:spooling.adoc[TikaInputStream and Spooling] - Understanding how TikaInputStream handles buffering, caching, and spooling to disk + +// Add links to specific topics as they are created +// * link:custom-parsers.html[Writing Custom Parsers] +// * link:custom-detectors.html[Writing Custom Detectors] +// * link:configuration.html[Advanced Configuration] +// * link:performance.html[Performance Tuning] diff --git a/docs/src/main/asciidoc/advanced/robustness.adoc b/docs/src/main/asciidoc/advanced/robustness.adoc new file mode 100644 index 00000000000..7547cf8eb20 --- /dev/null +++ b/docs/src/main/asciidoc/advanced/robustness.adoc @@ -0,0 +1,137 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + += The Robustness of Apache Tika + +Running parsers on untrusted data carries inherent risks. In rare cases, Tika can +encounter infinite loops or allocate unexpected amounts of memory (OutOfMemoryErrors). +When processing documents at scale, you must implement protective measures. + +IMPORTANT: Avoid running Tika in the same process as critical infrastructure like +indexers or search systems. + +== Process Isolation + +The primary defense against parser failures is process isolation. By running parsers +in separate processes, you protect your main application from: + +* OutOfMemoryErrors +* Infinite loops +* Native code crashes +* Resource exhaustion + +=== Tika 4.x + +**In Tika 4.x, xref:../pipes/index.adoc[Tika Pipes] is the recommended approach for +robust document processing.** It provides: + +* Automatic process isolation +* Fault tolerance and recovery +* Scalable parallel processing +* Unified architecture for all deployment scenarios + +Pipes can be used in multiple ways: + +* **Programmatically** - Via `PipesForkParser` in the `tika-pipes-fork-parser` module + (see xref:../using-tika/java-api/getting-started.adoc[Java API Getting Started]) +* **Via tika-server** - REST endpoints for pipes-based processing +* **Via tika-grpc** - gRPC interface with pipes backend + +In Tika 4.x, the approach to robustness has been simplified. 
Previous versions offered +four different forking mechanisms: + +[cols="1,2,1"] +|=== +|Mechanism |Description |Status in 4.x + +|ForkParser +|Spawned child processes for individual parse operations +|Deprecated + +|tika-batch +|Desktop/VM-scale batch processing +|Deprecated + +|tika-server (forked mode) +|REST server with forked parsing processes +|Available, but Pipes recommended + +|tika-pipes +|Scalable, fault-tolerant pipeline processing +|*Recommended approach* +|=== + +=== Tika 3.x and Earlier + +If you are using Tika 3.x or earlier, you have several options for process isolation: + +ForkParser:: +Spawns child processes to protect against out-of-memory errors and infinite loops. +Suitable for programmatic use in Java applications. + +tika-batch:: +For desktop/VM-scale processing (not cloud-scale): ++ +[source,bash] +---- +java -jar tika-app.jar -i <input_dir> -o <output_dir> +---- + +tika-server:: +In version 2.x and later, parsing defaults to forked processes. Clients must handle +tika-server restarts gracefully. + +tika-pipes:: +Available through programmatic use, tika-app `-a` option, or tika-server's `/async` +and `/pipes` endpoints. + +== Security Testing and Prevention + +The Apache Tika team implements several measures to identify and prevent vulnerabilities: + +* **Regression testing** against ~2 million files from Common Crawl before releases +* **Code reviews** of dependencies to identify vulnerability patterns +* **Fuzzing modules** for automated vulnerability discovery +* **Collaboration** with security researchers +* **Maintained forks** of parsers with critical fixes (released independently when needed) +* **Public documentation** of vulnerabilities at xref:../security.adoc[security page] + +== MockParser for Testing + +Tika provides a `MockParser` tool for testing your system's robustness.
You can +configure it to simulate various failure modes: + +* Infinite loops +* OutOfMemoryErrors +* Excessive runtime +* Large output generation + +This allows you to verify that your integration handles parser failures gracefully. + +== Recommendations + +1. **Use Tika Pipes** (4.x) for production workloads with untrusted content +2. **Isolate Tika** from critical systems - never run in the same JVM as your indexer +3. **Set timeouts** for all parsing operations +4. **Monitor memory usage** and set appropriate limits +5. **Plan for failures** - your system should handle parser crashes gracefully +6. **Stay updated** - apply security updates promptly + +== Further Reading + +* xref:../pipes/index.adoc[Tika Pipes] - Recommended approach for robust processing +* xref:../security.adoc[Security] - Known vulnerabilities and security model diff --git a/docs/spooling.adoc b/docs/src/main/asciidoc/advanced/spooling.adoc similarity index 100% rename from docs/spooling.adoc rename to docs/src/main/asciidoc/advanced/spooling.adoc diff --git a/docs/src/main/asciidoc/configuration/index.adoc b/docs/src/main/asciidoc/configuration/index.adoc new file mode 100644 index 00000000000..215e1f4c71d --- /dev/null +++ b/docs/src/main/asciidoc/configuration/index.adoc @@ -0,0 +1,40 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + += Configuration + +This section covers configuring Apache Tika. + +== Overview + +Tika 4.x uses JSON configuration files. Configuration controls parsers, detectors, +content handlers, and other components. + +NOTE: Tika 3.x and earlier used XML configuration (`tika-config.xml`). See the +xref:../migration-to-4x/index.adoc[Migration Guide] for details on converting to JSON. + +== Topics + +=== Parser Configuration + +* xref:parsers/pdf-parser.adoc[PDFParser] - PDF parsing options +* xref:parsers/tesseract-ocr-parser.adoc[TesseractOCRParser] - OCR options for image-based text extraction + +// Add links to specific topics as they are created +// * xref:json-config.adoc[JSON Configuration Reference] +// * xref:detectors.adoc[Configuring Detectors] +// * xref:mime-types.adoc[MIME Type Configuration] diff --git a/docs/src/main/asciidoc/configuration/parsers/pdf-parser.adoc b/docs/src/main/asciidoc/configuration/parsers/pdf-parser.adoc new file mode 100644 index 00000000000..cee58a3b70e --- /dev/null +++ b/docs/src/main/asciidoc/configuration/parsers/pdf-parser.adoc @@ -0,0 +1,43 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + += PDFParser Configuration + +This page documents the configuration options for `PDFParser` in Tika 4.x. + +== Basic Configuration + +[source,json] +---- +include::{parser-examples}/pdf-parser-basic.json[] +---- +icon:github[] https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-basic.json[View source on GitHub] + +== Full Configuration + +The following example shows all available configuration options with their default values. +Comments indicate the available options for enum fields. + +[source,json] +---- +include::{parser-examples}/pdf-parser-full.json[] +---- +icon:github[] https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-full.json[View source on GitHub] + +== Changes from 3.x + +See xref:../../migration-to-4x/migrating-to-4x.adoc[Migrating to 4.x] for general migration guidance. diff --git a/docs/src/main/asciidoc/configuration/parsers/tesseract-ocr-parser.adoc b/docs/src/main/asciidoc/configuration/parsers/tesseract-ocr-parser.adoc new file mode 100644 index 00000000000..5b1b2b67e64 --- /dev/null +++ b/docs/src/main/asciidoc/configuration/parsers/tesseract-ocr-parser.adoc @@ -0,0 +1,67 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += TesseractOCRParser Configuration + +This page documents the configuration options for `TesseractOCRParser` in Tika 4.x. + +== Basic Configuration + +[source,json] +---- +include::{parser-examples}/tesseract-basic.json[] +---- +icon:github[] https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-basic.json[View source on GitHub] + +== Full Configuration + +The following example shows all available configuration options with their default values. +Comments indicate the available options for enum fields. + +[source,json] +---- +include::{parser-examples}/tesseract-full.json[] +---- +icon:github[] https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-full.json[View source on GitHub] + +== Changes from 3.x + +In Tika 3.x, the `otherTesseractSettings` was a list of space-delimited key-value strings: + +[source,xml] +---- +<!-- 3.x XML format --> +<param name="otherTesseractSettings" type="list"> + <string>textord_initialx_ile 0.75</string> + <string>textord_noise_hfract 0.15625</string> +</param> +---- + +In Tika 4.x, this is replaced with `otherTesseractConfig` as a proper map: + +[source,json] +---- +// 4.x JSON format +"otherTesseractConfig": { + "textord_initialx_ile": "0.75", + "textord_noise_hfract": "0.15625" +} +---- + +The automatic converter handles this transformation. + +See xref:../../migration-to-4x/migrating-to-4x.adoc[Migrating to 4.x] for general migration guidance.
diff --git a/docs/src/main/asciidoc/faq.adoc b/docs/src/main/asciidoc/faq.adoc new file mode 100644 index 00000000000..168c9a95474 --- /dev/null +++ b/docs/src/main/asciidoc/faq.adoc @@ -0,0 +1,28 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += FAQ and Troubleshooting + +This page covers frequently asked questions and common issues when using Apache Tika. + +== Frequently Asked Questions + +// TODO: Add FAQs + +== Troubleshooting + +// TODO: Add common issues and solutions diff --git a/docs/src/main/asciidoc/index.adoc b/docs/src/main/asciidoc/index.adoc new file mode 100644 index 00000000000..5edc9e54ee3 --- /dev/null +++ b/docs/src/main/asciidoc/index.adoc @@ -0,0 +1,72 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Apache Tika Documentation + +WARNING: This reference guide was generated with the assistance of AI and requires +human review before it can be fully trusted. This documentation serves as an example +and a starting point, but more work remains. Contributions and corrections are welcome. + +== Overview + +Apache Tika is a content detection and extraction framework written in Java. + +== Using Tika + +* xref:using-tika/index.adoc[Getting Started] - Choose your integration method +* xref:pipes/index.adoc[Pipes] - Scalable, fault-tolerant document processing + +== Configuration + +* xref:configuration/index.adoc[Configuration] - JSON configuration options + +== Migration + +* xref:migration-to-4x/index.adoc[Migrating to 4.x] - Guides and background for upgrading to Tika 4.x + +== Advanced + +* xref:advanced/index.adoc[Advanced Topics] - Custom parsers, performance tuning, internals + +== FAQ + +* xref:faq.adoc[FAQ and Troubleshooting] - Common questions and issues + +== Security + +* xref:security.adoc[Security] - Security considerations and reporting vulnerabilities + +== Roadmap + +* xref:roadmap.adoc[Roadmap] - Planned features and improvements for upcoming releases + +== For Maintainers + +* xref:maintainers/index.adoc[Maintainer Documentation] - Release guides and project maintenance + +== Links + +* https://tika.apache.org/[Apache Tika Website] - Official project website +* https://tika.apache.org/{tika-stable-version}/formats.html[Supported Formats] - File formats Tika can parse +* https://tika.apache.org/{tika-stable-version}/api/[API 
Documentation] - Javadoc +* https://issues.apache.org/jira/projects/TIKA[JIRA] - Issue tracker +* https://repository.apache.org/content/repositories/snapshots/org/apache/tika/[Maven Snapshots] - SNAPSHOT builds in Apache's Maven repository +* https://ci-builds.apache.org/job/Tika/[CI Builds] - Continuous integration builds +* https://cwiki.apache.org/confluence/display/TIKA/[Confluence Wiki] - Legacy wiki documentation ++ +NOTE: As of Tika 4.x, we are migrating content from Confluence to these AsciiDoc pages. +The Confluence wiki will eventually be retired. diff --git a/docs/src/main/asciidoc/maintainers/index.adoc b/docs/src/main/asciidoc/maintainers/index.adoc new file mode 100644 index 00000000000..bab767b7073 --- /dev/null +++ b/docs/src/main/asciidoc/maintainers/index.adoc @@ -0,0 +1,29 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += For Maintainers + +This section contains documentation for Apache Tika project maintainers and committers. 
+ +== Topics + +* xref:release-guides/index.adoc[Release Guides] - How to release Apache Tika + +// Add links to specific topics as they are created +// * link:voting.html[Voting Procedures] +// * link:ci.html[Continuous Integration] +// * link:website.html[Website Maintenance] diff --git a/docs/src/main/asciidoc/maintainers/release-guides/docker.adoc b/docs/src/main/asciidoc/maintainers/release-guides/docker.adoc new file mode 100644 index 00000000000..a8f2f8cbc72 --- /dev/null +++ b/docs/src/main/asciidoc/maintainers/release-guides/docker.adoc @@ -0,0 +1,133 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Releasing Tika Docker Images + +This guide covers the process for releasing Apache Tika Docker images. + +== Prerequisites + +=== DockerHub Access + +You need permissions on the `apache/tika` repository on DockerHub. To obtain access, +create an INFRA JIRA ticket with the "Docker" label. 
+ +=== Repository Access + +Clone the tika-docker repository: + +[source,bash] +---- +git clone https://github.com/apache/tika-docker +cd tika-docker +---- + +== Image Types + +The tika-docker repository produces two types of images: + +Minimal:: +Apache Tika with base dependencies (Java only) + +Full:: +Apache Tika plus Tesseract OCR and GDAL + +== Helper Tools + +docker-tool.sh:: +Automates building, testing, and publishing Docker images + +republish-images.sh:: +Legacy script for batch republishing images + +NOTE: The repository also contains Docker Compose files for advanced scenarios +(Vision, Grobid, OCR, NER), but these are not used for official releases. + +== Release Process + +=== Step 1: Update README + +Update the "Available Tags" section in `README.md` to include the new version. + +=== Step 2: Update Version + +Increment the TAG version in the `.env` file. + +=== Step 3: Update Changelog + +Update `CHANGES.md` with release information and date. + +=== Step 4: Test Locally + +Test the release locally before publishing: + +[source,bash] +---- +./docker-tool.sh build +./docker-tool.sh test +---- + +=== Step 5: Commit Changes + +Commit all changes: + +[source,bash] +---- +git add README.md .env CHANGES.md +git commit -m "Prepare for Docker release <version>" +git push +---- + +=== Step 6: Build and Publish + +Build and publish the images using the docker-tool script. + +Example for version 3.1.0.0 based on Tika 3.1.0: + +[source,bash] +---- +# Build the images +./docker-tool.sh build 3.1.0.0 3.1.0 + +# Test the images +./docker-tool.sh test 3.1.0.0 + +# Publish to DockerHub +./docker-tool.sh publish 3.1.0.0 3.1.0 +---- + +NOTE: Multi-architecture building takes time. The publish step automatically +updates the `-latest` tag on DockerHub.
+ +=== Step 7: Tag the Release + +Create and push a git tag for the release: + +[source,bash] +---- +git tag -a 3.1.0.0 -m "New release for 3.1.0.0" +git push --tags +---- + +== Post-Release + +After publishing the Docker images: + +* Verify the images are available on DockerHub at https://hub.docker.com/r/apache/tika +* Test pulling and running the new images +* Update the main Tika website if needed +* Proceed to release the xref:helm.adoc[Helm charts] if applicable diff --git a/docs/src/main/asciidoc/maintainers/release-guides/grpc.adoc b/docs/src/main/asciidoc/maintainers/release-guides/grpc.adoc new file mode 100644 index 00000000000..0576d23bb87 --- /dev/null +++ b/docs/src/main/asciidoc/maintainers/release-guides/grpc.adoc @@ -0,0 +1,32 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Releasing Tika gRPC + +This guide covers the process for releasing Apache Tika gRPC components.
+ +== Prerequisites + +// TODO: Add prerequisites + +== Release Process + +// TODO: Add release steps + +== Post-Release + +// TODO: Add post-release steps diff --git a/docs/src/main/asciidoc/maintainers/release-guides/helm.adoc b/docs/src/main/asciidoc/maintainers/release-guides/helm.adoc new file mode 100644 index 00000000000..aa80120c6fb --- /dev/null +++ b/docs/src/main/asciidoc/maintainers/release-guides/helm.adoc @@ -0,0 +1,138 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Releasing Tika Helm Charts + +This guide covers the process for releasing Apache Tika Helm charts. + +== Prerequisites + +=== Apache JFrog Artifactory Access + +You need permissions to release the Apache Tika Helm chart to the Apache Infra +Artifactory instance. Access is controlled by ASF Infra and can be requested +via a JIRA ticket with the "Artifactory" label. + +=== Repository Access + +Clone the tika-helm repository: + +[source,bash] +---- +git clone https://github.com/apache/tika-helm +cd tika-helm +---- + +Apache Tika committers should have existing access to this repository. 
+ +=== Install Helm and Plugins + +Install Helm and the Artifactory plugin: + +[source,bash] +---- +# Install Helm (macOS) +brew install helm + +# Install the Artifactory push plugin +helm plugin install https://github.com/belitre/helm-push-artifactory-plugin --version 1.0.2 +---- + +== Docker Image Types + +The Helm chart deploys one of two upstream Docker image types: + +Minimal:: +Contains Apache Tika and base dependencies (Java only) + +Full:: +Includes Tika, dependencies, Tesseract OCR, GDAL, etc. + +The Helm Chart uses the *Full* image by default, though either can be specified +during Kubernetes deployment. + +== Versioning + +tika-helm Charts follow the https://semver.org/spec/v2.0.0.html[Semantic Versioning 2.0.0] +specification, regardless of upstream container image versioning. + +== Release Process + +=== Step 1: Update Chart Configuration + +For each new upstream tika-docker FULL release, update the following files: + +Chart.yaml:: +* Line 22: Update `version` (chart version) +* Line 23: Update `appVersion` (must match upstream tika-docker FULL release tag) + +values.yaml:: +* Line 26: Update the default image tag + +=== Step 2: Commit and Tag + +Commit the changes and create a release tag: + +[source,bash] +---- +export RELEASE_VERSION=v3.2.2 + +git add -A +git commit -m "Release tika-helm $RELEASE_VERSION" +git push origin main + +git tag -a $RELEASE_VERSION -m "Release tika-helm $RELEASE_VERSION" +git push --tags +---- + +=== Step 3: Create GitHub Release + +. Navigate to the pushed tag on GitHub +. Click the three-dot menu +. Select "Create release" +. 
Add release notes and publish + +=== Step 4: Publish to Apache JFrog Artifactory + +Add the Tika Helm repository and push the chart: + +[source,bash] +---- +# Add the Tika Helm repository +helm repo add tika https://apache.jfrog.io/artifactory/tika + +# Set your credentials +export HELM_REPO_USERNAME="your-apache-id" +export HELM_REPO_PASSWORD="your-password" + +# Push the chart to Artifactory +helm push-artifactory . https://apache.jfrog.io/artifactory/tika +---- + +== Post-Release + +After publishing the Helm chart: + +* Verify the chart is available at https://apache.jfrog.io/artifactory/tika +* Test installing the chart in a Kubernetes cluster +* Update any documentation referencing the chart version + +== Questions + +For questions about the Helm release process, contact: + +* dev@tika.apache.org mailing list diff --git a/docs/src/main/asciidoc/maintainers/release-guides/index.adoc b/docs/src/main/asciidoc/maintainers/release-guides/index.adoc new file mode 100644 index 00000000000..1f618e9892a --- /dev/null +++ b/docs/src/main/asciidoc/maintainers/release-guides/index.adoc @@ -0,0 +1,32 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + += Release Guides + +This section contains guides for releasing Apache Tika components. + +== Overview + +Apache Tika follows the standard Apache release process. This section provides +step-by-step guides for releasing the various Tika components. + +== Topics + +* xref:tika.adoc[Releasing Apache Tika] - Main Tika project release process +* xref:docker.adoc[Releasing Tika Docker Images] - Docker image release process +* xref:helm.adoc[Releasing Tika Helm Charts] - Helm chart release process +* xref:grpc.adoc[Releasing Tika gRPC] - gRPC component release process diff --git a/docs/src/main/asciidoc/maintainers/release-guides/tika.adoc b/docs/src/main/asciidoc/maintainers/release-guides/tika.adoc new file mode 100644 index 00000000000..a967c80421b --- /dev/null +++ b/docs/src/main/asciidoc/maintainers/release-guides/tika.adoc @@ -0,0 +1,271 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Releasing Apache Tika + +This guide covers the process for releasing the main Apache Tika project. 
+ +== Prerequisites + +Before starting the release process, ensure you have: + +* Commit access to the Apache Tika repository +* A valid GPG key published to a public keyserver +* Maven credentials configured in `~/.m2/settings.xml` +* Access to Apache's Nexus repository manager + +== Pre-Release Checks + +Before starting the release, run vulnerability and dependency audits: + +[source,bash] +---- +# Identify vulnerable dependencies +mvn ossindex:audit -Dossindex.fail=true + +# Check for outdated plugins +mvn versions:display-plugin-updates + +# Check for outdated dependencies +mvn versions:display-dependency-updates + +# Run full regression tests +mvn -Prelease-profile clean verify +---- + +== Release Process + +=== Step 1: Clone the Repository + +Clone the repository if you haven't already: + +[source,bash] +---- +git clone https://github.com/apache/tika.git +cd tika +---- + +=== Step 2: Update Documentation + +Update `CHANGES.txt` with the release date: + +[source] +---- +Release X.Y.Z - MM/dd/yyyy +---- + +Add any changelog entries as needed. + +=== Step 3: JIRA Management + +. Create versions X.Y.Z, X.(Y+1), and X.(Y+2) in JIRA if they don't exist +. 
Reassign any unresolved X.Y.Z issues to X.(Y+1) via bulk change + +=== Step 4: Verify License Headers + +Run the Apache RAT plugin to verify all files have proper license headers: + +[source,bash] +---- +mvn apache-rat:check +---- + +=== Step 5: Commit Changes + +Commit the CHANGES.txt updates: + +[source,bash] +---- +git add CHANGES.txt +git commit -m "Prepare for X.Y.Z release" +git push +---- + +=== Step 6: Set Maven Memory + +Configure Maven memory settings: + +[source,bash] +---- +export MAVEN_OPTS="-Xms128m -Xmx256m" +---- + +=== Step 7: Prepare the Release + +Execute the Maven release prepare goal: + +[source,bash] +---- +mvn release:prepare +---- + +This will prompt you to confirm: + +* The release version (X.Y.Z) +* The SCM tag name +* The next development version + +=== Step 8: Perform the Release + +Execute the Maven release perform goal: + +[source,bash] +---- +mvn release:perform +---- + +Ensure you have valid Maven credentials in `~/.m2/settings.xml`: + +[source,xml] +---- +<servers> + <server> + <id>apache.releases.https</id> + <username>your-apache-id</username> + <password>your-password</password> + </server> +</servers> +---- + +=== Step 9: Verify Staging Repository + +. Access Apache's Nexus at https://repository.apache.org +. Log in with your Apache credentials +. Navigate to "Staging Repositories" +. Find the org.apache.tika staging repository +. Verify it contains all expected artifacts +.
Click "Close" with an appropriate message + +=== Step 10: Upload Distribution Artifacts + +Upload artifacts to `dist.apache.org`: + +[source,bash] +---- +svn co https://dist.apache.org/repos/dist/dev/tika tika-dist-dev +cd tika-dist-dev +---- + +Upload the following files with their signatures (.asc) and checksums (.sha512): + +* `tika-X.Y.Z-src.zip` +* `tika-app-X.Y.Z.jar` +* `tika-server-standard-X.Y.Z.jar` + +Also: + +* Rename `CHANGES.txt` to `CHANGES-X.Y.Z.txt` +* Ensure the `KEYS` file contains the public key of everyone who signed the release artifacts + +=== Step 11: Call the Vote + +Send a vote request to the dev@tika.apache.org mailing list: + +[source] +---- +Subject: [VOTE] Release Apache Tika X.Y.Z + +Hi all, + +I have created a candidate build for Apache Tika X.Y.Z. + +The release candidate artifacts can be found at: +https://dist.apache.org/repos/dist/dev/tika/ + +The staging repository is: +https://repository.apache.org/content/repositories/orgapachetika-XXXX + +The Git tag is: +https://github.com/apache/tika/tree/X.Y.Z + +Please vote: +[ ] +1 Release this package +[ ] +0 No opinion +[ ] -1 Do not release (please provide reason) + +This vote will remain open for at least 72 hours. +---- + +=== Step 12: Release the Artifacts + +Upon successful vote (at least 3 +1 votes from PMC members): + +. Release the Nexus staging repository (click "Release" button) +. Move artifacts from dev to release distribution: + +[source,bash] +---- +svn mv https://dist.apache.org/repos/dist/dev/tika/X.Y.Z \ + https://dist.apache.org/repos/dist/release/tika/X.Y.Z \ + -m "Release Apache Tika X.Y.Z" +---- + +== Post-Release + +=== Update Unreleased Modules + +Update any modules that weren't part of the release to the next SNAPSHOT version.
+ +=== Update Website + +Refresh the website documentation to reflect the new release: + +* Update download links +* Update version numbers in documentation +* Add release notes + +=== Release Docker and Helm Images + +Follow the separate guides for releasing: + +* link:docker.html[Docker images] +* link:helm.html[Helm charts] + +=== Send Announcements + +Send release announcements to: + +* user@tika.apache.org +* dev@tika.apache.org +* announce@apache.org + +[source] +---- +Subject: [ANNOUNCE] Apache Tika X.Y.Z Released + +The Apache Tika team is pleased to announce the release of Apache Tika X.Y.Z. + +Apache Tika is a toolkit for detecting and extracting metadata and text +from various types of files. + +This release includes: +[List major changes/features] + +For a complete list of changes, see: +https://tika.apache.org/X.Y.Z/changes.html + +Download: +https://tika.apache.org/download.html + +Thanks to everyone who contributed to this release! + +The Apache Tika Team +---- + +=== Register the Release + +Register the release at https://reporter.apache.org diff --git a/docs/src/main/asciidoc/migration-to-4x/design-notes-4x.adoc b/docs/src/main/asciidoc/migration-to-4x/design-notes-4x.adoc new file mode 100644 index 00000000000..006c4775f97 --- /dev/null +++ b/docs/src/main/asciidoc/migration-to-4x/design-notes-4x.adoc @@ -0,0 +1,127 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Design Notes for Tika 4.x + +This document captures the design decisions and architectural changes in Apache Tika 4.x. + +== Metadata Keys + +The design addresses security concerns by implementing namespaced metadata keys. This prevents +user-controlled data from potentially overwriting existing metadata values in the Metadata object. + +See link:migrating-to-4x.html[Migrating to Tika 4.x] for details on specific +metadata key changes. + +== Fat Jars and Maven Shade Strategy + +Tika 4.x moves away from fat jar/shaded artifacts. The `tika-app` and `tika-server` now use +separate `lib` and `plugins` directories alongside the jar file, enabling standard `java -jar` +execution. + +== Plugins and PF4J Framework + +=== Plugin Packaging + +PF4J plugins are packaged exclusively as zips (not jars) to align with the move away from fat +jars. Custom code addresses race conditions during the unzipping process across threads and +processes. + +=== Classloader Management + +The team disabled PF4J's default classpath loading to avoid complexity in unit tests. A +configured plugins directory is now required. + +This strict boundary prevents issues when components are loaded separately. For example, JSON +strings replace `JsonNode` objects to avoid problems with independent Jackson loading in plugins. + +IMPORTANT: We tried to have as few Tika dependencies in the plugins as possible. 
+ +== Serialization Architecture + +=== Design Principles + +* Maximize Jackson usage while minimizing custom serialization code +* Exclude Jackson from `tika-core` and `tika-parsers-standard-modules` dependencies +* Enable runtime configuration updates via Jackson's `readerForUpdating` + +=== Security Model + +Configuration files at initialization are treated as trusted sources. Runtime +serialization/deserialization uses an allowlist of permitted packages via +`PolymorphicObjectMapperFactory`. + +Custom components can add patterns to `META-INF/tika-serialization-allowlist.txt`. + +=== Implementation Challenges + +* Converted code to true Java beans with matching getters/setters +* Used `ObjectMapper.DefaultTyping.OBJECT_AND_NON_CONCRETE` for polymorphic typing +* Replaced generic collections (`List`, `Set`) with concrete types (`ArrayList`, `HashSet`) +* Converted `Path` fields to `String` due to Jackson constraints +* Avoided Java records to enable `readerForUpdating` functionality + +== Annotations System + +The `@TikaComponent` annotation handles: + +* Automatic service file generation at build time +* Creation of `META-INF/tika/*.idx` mapping files +* Kebab-case conversion of class names to friendly identifiers (e.g., `PDFParser` → `pdf-parser`) +* Manual name overrides via `name` attribute +* Optional `spi=false` setting for non-service-file registration + +== Migration Strategy + +The plan is to stabilize 4.x structures before backporting capabilities to 3.x and deprecating +`TikaConfig` and `tika-config.xml`. + +A converter tool for transforming `tika-config.xml` to `tika-config.json` is planned, with +support focused on components in `tika-parsers-standard-modules`. 
+ +== Development Tips + +=== Common Issues + +* Plugin directories and `@TikaComponent` annotations becoming out of sync across modules +* IntelliJ conflicts with command-line builds +* Checkstyle running before Spotless, causing preventable failures + +=== Recommended Build Commands + +For faster builds during development: + +[source,bash] +---- +mvn clean install -am -pl :tika-app -Pfast +---- + +To apply formatting and build: + +[source,bash] +---- +mvn clean spotless:apply install +---- + +== Outstanding Tasks + +* Implement flexible component loading without `@TikaComponent` requirements +* Enable friendly name usage throughout the codebase +* Resolve gRPC issues +* Fix mutool renderer byte-passing in open containers +* Simplify and strengthen serialization code +* Consider relocating `TikaConfig` and `ForkParser` to legacy module diff --git a/docs/src/main/asciidoc/migration-to-4x/index.adoc b/docs/src/main/asciidoc/migration-to-4x/index.adoc new file mode 100644 index 00000000000..c8d5be9f5d2 --- /dev/null +++ b/docs/src/main/asciidoc/migration-to-4x/index.adoc @@ -0,0 +1,32 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + += Migrating to Tika 4.x + +This section provides guides and background documentation for migrating to Apache Tika 4.x. + +See the xref:../roadmap.adoc[Roadmap] for version timelines and support schedules. + +== Migration Guides + +* xref:migrating-to-4x.adoc[Migration Guide] - Step-by-step guide for upgrading from Tika 3.x to 4.x +* xref:metadata-changes-4x.adoc[Metadata Changes] - Detailed metadata key changes and migration examples + +== Background Documentation + +* xref:design-notes-4x.adoc[Design Notes] - Architectural decisions and design rationale +* xref:serialization-4x.adoc[Serialization] - JSON serialization design and implementation details diff --git a/docs/src/main/asciidoc/migration-to-4x/metadata-changes-4x.adoc b/docs/src/main/asciidoc/migration-to-4x/metadata-changes-4x.adoc new file mode 100644 index 00000000000..e129d330083 --- /dev/null +++ b/docs/src/main/asciidoc/migration-to-4x/metadata-changes-4x.adoc @@ -0,0 +1,121 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Metadata Changes in Tika 4.x + +This document details the metadata key changes in Apache Tika 4.x. 
+ +== Overview + +Tika 4.x prefixes all "user generated" metadata keys to prevent overwrites and improve +namespace clarity. This is a security-focused change that prevents user-controlled data +from potentially overwriting existing metadata values in the Metadata object. + +== Metadata Key Changes + +[cols="2,2,3"] +|=== +|Category |Change |Details + +|HTML custom metadata +|Prefixed with `html:` +|Custom metadata from HTML documents now uses the `html:` prefix + +|MAPI metadata +|Prefix changed to `mapi:` +|Microsoft MAPI properties now use the `mapi:` prefix + +|Resource name +|Renamed +|`resourceName` changed to `X-TIKA:resourceName` + +|Unrecognized image metadata +|Prefixed with `img:` +|Unrecognized image metadata keys now use the `img:` prefix + +|Office metadata +|Prefix changed +|Changed from `meta` prefix to `office` prefix +|=== + +== Migration Steps + +When upgrading to Tika 4.x, you will need to update any code that references metadata keys +directly: + +=== HTML Metadata + +[source,java] +---- +// Before (3.x) +String value = metadata.get("custom-key"); + +// After (4.x) +String value = metadata.get("html:custom-key"); +---- + +=== MAPI Metadata + +[source,java] +---- +// Before (3.x) +String value = metadata.get("mapi:some-property"); + +// After (4.x) - prefix remains mapi: but verify specific keys +String value = metadata.get("mapi:some-property"); +---- + +=== Resource Name + +[source,java] +---- +// Before (3.x) +String name = metadata.get("resourceName"); + +// After (4.x) +String name = metadata.get("X-TIKA:resourceName"); +---- + +=== Image Metadata + +[source,java] +---- +// Before (3.x) +String value = metadata.get("unknown-image-key"); + +// After (4.x) +String value = metadata.get("img:unknown-image-key"); +---- + +=== Office Metadata + +[source,java] +---- +// Before (3.x) +String value = metadata.get("meta:some-property"); + +// After (4.x) +String value = metadata.get("office:some-property"); +---- + +== Rationale + +The namespacing of 
metadata keys provides several benefits: + +* *Security*: Prevents user-controlled content from overwriting internal metadata +* *Clarity*: Makes it clear which parser or source generated a metadata key +* *Consistency*: Provides a uniform approach to metadata naming across all parsers diff --git a/docs/src/main/asciidoc/migration-to-4x/migrating-to-4x.adoc b/docs/src/main/asciidoc/migration-to-4x/migrating-to-4x.adoc new file mode 100644 index 00000000000..ba26d25accb --- /dev/null +++ b/docs/src/main/asciidoc/migration-to-4x/migrating-to-4x.adoc @@ -0,0 +1,157 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Migrating to Tika 4.x + +This guide covers the changes required when upgrading from Apache Tika 3.x to 4.x. + +See the xref:../roadmap.adoc[Roadmap] for version timelines and support schedules. + +== Requirements + +* Java 17 or later (upgraded from Java 11 in 3.x) + +== Configuration: XML to JSON + +Tika 4.x uses JSON configuration files instead of XML. The legacy `tika-config.xml` format +is no longer supported. 
+ +=== Automatic Conversion + +Tika provides a conversion tool in `tika-app` to help migrate your XML configuration: + +[source,bash] +---- +java -jar tika-app.jar --convert-config-xml-to-json=tika-config.xml,tika-config.json +---- + +The converter currently supports: + +* **Parsers section** - parser declarations with parameters and exclusions +* **Parameter types** - bool, int, long, double, float, string, list, and map +* **Special handling** - TesseractOCR's `otherTesseractSettings` list is automatically + converted to the `otherTesseractConfig` map format + +=== Example Conversion + +**XML Format (3.x):** +[source,xml] +---- +<properties> + <parsers> + <parser class="org.apache.tika.parser.pdf.PDFParser"> + <params> + <param name="sortByPosition" type="bool">true</param> + <param name="maxMainMemoryBytes" type="long">1000000</param> + </params> + </parser> + <parser class="org.apache.tika.parser.DefaultParser"> + <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> + </parser> + </parsers> +</properties> +---- + +**JSON Format (4.x):** +[source,json] +---- +{ + "parsers": [ + { + "pdf-parser": { + "sortByPosition": true, + "maxMainMemoryBytes": 1000000 + } + }, + { + "default-parser": { + "_exclude": ["pdf-parser"] + } + } + ] +} +---- + +=== Key Differences + +[cols="1,1,2"] +|=== +|Aspect |XML (3.x) |JSON (4.x) + +|Class references +|Full class name (`org.apache.tika.parser.pdf.PDFParser`) +|Kebab-case component name (`pdf-parser`) + +|Parameters +|`<param name="..." type="...">value</param>` +|Direct key-value pairs + +|Exclusions +|`<parser-exclude class="..."/>` +|`"_exclude": ["component-name"]` +|=== + +NOTE: When you configure a parser with specific settings in JSON, the loader automatically +excludes it from SPI loading. Explicit exclusions are only needed when you want to disable +a parser entirely without providing custom configuration. + +=== Limitations + +The automatic converter has some limitations: + +* Only the `parsers` section is currently converted +* Detectors and other sections require manual migration +* Custom or third-party parsers not in the registry will use kebab-case name conversion + +=== Parser Configuration Changes + +WARNING: The configuration options for `PDFParser` and `TesseractOCRParser` have changed +significantly in 4.x.
The automatic converter will migrate your parameter names, but you +should review the updated documentation to ensure your configuration is optimal. + +See: + +* xref:../configuration/parsers/pdf-parser.adoc[PDFParser Configuration] - Updated options for PDF parsing +* xref:../configuration/parsers/tesseract-ocr-parser.adoc[TesseractOCRParser Configuration] - Updated OCR options + +=== Full Configuration Example + +Below is a complete example of a Tika 4.x JSON configuration file with commonly configured parsers: + +[source,json] +---- +include::{parser-examples}/migration-full-example.json[] +---- + +NOTE: This example shows common options. See the individual parser configuration pages for +complete documentation of all available options. + +== Metadata Key Changes + +Tika 4.x prefixes all "user generated" metadata keys to prevent overwrites and improve +namespace clarity. + +See xref:metadata-changes-4x.adoc[Metadata Changes in 4.x] for complete details, including +a full table of changes and code migration examples. + +== API Changes + +// TODO: Document API changes + +== Deprecations and Removals + +// TODO: Document deprecated and removed features diff --git a/docs/src/main/asciidoc/migration-to-4x/serialization-4x.adoc b/docs/src/main/asciidoc/migration-to-4x/serialization-4x.adoc new file mode 100644 index 00000000000..e11bdc4959e --- /dev/null +++ b/docs/src/main/asciidoc/migration-to-4x/serialization-4x.adoc @@ -0,0 +1,101 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Serialization in Tika 4.x + +This document describes the JSON serialization design and implementation details for Apache Tika 4.x. + +== High-Level Goals + +=== Jackson Framework Integration + +Use Jackson as much as possible with as few custom serializers and as few annotations as possible. +Jackson dependencies are kept out of core modules to maintain flexibility. + +=== Friendly Naming Conventions + +The implementation uses friendly names such as `pdf-parser` rather than full class names. These friendly +names are applied to configured items rather than configuration class names. + +=== Custom Class Support + +The design permits users to add custom classes through Jackson's polymorphic handling: + +* `org.apache.tika` patterns are allowed by default +* Users can define additional inclusion patterns for security + +=== Configuration Consistency + +The approach seeks to make initialization and runtime configuration look exactly the same and use +the same underlying code where possible. However, security constraints may require differences in +which fields are modifiable at runtime. + +=== Configuration Objects Over Annotations + +Configuration objects are preferred over field annotations to support multithreading. Parsers +retrieve settings from `ParseContext` at runtime. + +=== Cross-System Configuration Flow + +Configuration must pass seamlessly from: + +. User clients +. Through tika-server REST APIs +.
Into tika-pipes infrastructure + +== Initialization Structure + +=== Tier 1 Objects + +ID Objects:: +Fetchers, emitters - components with unique identifiers + +Composite Objects:: +Parsers, detectors - components that aggregate other components + +Single Objects:: +Pipes, gRPC, server configurations + +=== Tier 2 Objects + +Components that can be read via friendly names using `@TikaComponent` annotations in an +`other-config` section. + +== Runtime Patterns + +=== Backwards Compatibility + +The design maintains backwards compatibility by allowing `ParseContext` additions where the +interface serves as the key. + +=== Partial Configuration Updates + +Users can specify only updates to the initialization configuration through partial JSON objects, +rather than requiring complete configuration documents. + +=== Self-Configuring Components in Pipes + +In the pipes infrastructure, objects should configure themselves to avoid classloading +dependencies on components like `PDFParser`. + +== Security Considerations + +* Configuration files at initialization are treated as trusted sources +* Runtime serialization/deserialization uses an allowlist of permitted packages +* Custom components can register patterns in `META-INF/tika-serialization-allowlist.txt` + +See link:design-notes-4x.html[Design Notes for 4.x] for additional architectural context. diff --git a/docs/src/main/asciidoc/pipes/index.adoc b/docs/src/main/asciidoc/pipes/index.adoc new file mode 100644 index 00000000000..e7b49ebc3cf --- /dev/null +++ b/docs/src/main/asciidoc/pipes/index.adoc @@ -0,0 +1,37 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Tika Pipes + +This section covers Tika Pipes for scalable, fault-tolerant document processing. + +== Overview + +Tika Pipes provides a framework for processing large volumes of documents with: + +* **Fetchers** - Retrieve documents from various sources (filesystem, S3, HTTP, etc.) +* **Emitters** - Send parsed results to various destinations (filesystem, OpenSearch, Solr, etc.) +* **Pipelines** - Configure processing workflows + +== Topics + +// Add links to specific topics as they are created +// * link:getting-started.html[Getting Started] +// * link:fetchers.html[Fetchers] +// * link:emitters.html[Emitters] +// * link:configuration.html[Configuration] +// * link:async.html[Async Processing] diff --git a/docs/src/main/asciidoc/roadmap.adoc b/docs/src/main/asciidoc/roadmap.adoc new file mode 100644 index 00000000000..3e28829a434 --- /dev/null +++ b/docs/src/main/asciidoc/roadmap.adoc @@ -0,0 +1,96 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Apache Tika Roadmap + +This page outlines the planned features and improvements for Apache Tika releases. + +NOTE: All dates are in Open Source Standard Time which does not always neatly align +with traditional calendars. + +== Release Timeline + +[cols="1,3"] +|=== +|Date |Milestone + +|October 2024 +|Release 3.0.0 + +|October 2024 +|Move main branch to 4.x (Java 17) after 3.0.0 release + +|April 2025 +|End support for 2.x (and Java 8) + +|January 2026 +|Release 4.0.0 + +|June 2026 +|End support for 3.x (and Java 11) +|=== + +== Version Support Matrix + +[cols="1,1,1,2,2"] +|=== +|Version |Java |Jakarta/javax |Availability |Planned EOL + +|2.x +|8 +|javax +|Now +|April 2025 + +|3.x +|11 +|jakarta +|October 2024 +|June 2026 or 6 months after 4.0.0 release + +|4.x +|17 +|jakarta +|January 2026 +|TBD + +|5.x +|21 +|jakarta +|TBD +|TBD + +|6.x +|25 +|jakarta +|TBD +|TBD +|=== + +== Metadata Changes in 4.x + +Tika 4.x implements namespaced metadata keys to prevent overwrites and improve namespace clarity. + +See xref:migration-to-4x/metadata-changes-4x.adoc[Metadata Changes in 4.x] for complete details and +migration examples. 
+ +== Long-term Goals + +// Add long-term goals as they are defined +// * Improved streaming support +// * Enhanced language detection +// * Better support for modern document formats diff --git a/docs/src/main/asciidoc/security.adoc b/docs/src/main/asciidoc/security.adoc new file mode 100644 index 00000000000..ddc09b72152 --- /dev/null +++ b/docs/src/main/asciidoc/security.adoc @@ -0,0 +1,34 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Security + +This page covers security considerations when using Apache Tika. + +== Security Model + +Apache Tika's security model describes the trust boundaries and assumptions that govern +how Tika processes content. Understanding this model is essential for deploying Tika securely. 
+ +* https://tika.apache.org/security-model.html[Apache Tika Security Model] + +== Known Vulnerabilities + +For information about known security vulnerabilities (CVEs) in Apache Tika and their +remediation, please see: + +* https://tika.apache.org/security.html[Apache Tika Security Vulnerabilities] diff --git a/docs/src/main/asciidoc/using-tika/cli/index.adoc b/docs/src/main/asciidoc/using-tika/cli/index.adoc new file mode 100644 index 00000000000..56105528d73 --- /dev/null +++ b/docs/src/main/asciidoc/using-tika/cli/index.adoc @@ -0,0 +1,39 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Tika Command Line Interface + +This section covers using Apache Tika from the command line via `tika-app`. + +== Overview + +The Tika application (`tika-app.jar`) provides a command-line interface for +parsing documents, detecting content types, and extracting metadata. 
+ +== Basic Usage + +[source,bash] +---- +java -jar tika-app.jar [options] +---- + +== Topics + +// Add links to specific topics as they are created +// * link:installation.html[Installation] +// * link:options.html[Command Line Options] +// * link:batch.html[Batch Processing] diff --git a/docs/src/main/asciidoc/using-tika/grpc/index.adoc b/docs/src/main/asciidoc/using-tika/grpc/index.adoc new file mode 100644 index 00000000000..2f1eb24adb2 --- /dev/null +++ b/docs/src/main/asciidoc/using-tika/grpc/index.adoc @@ -0,0 +1,32 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Tika gRPC + +This section covers using Apache Tika via gRPC. + +== Overview + +Tika gRPC provides a high-performance gRPC interface for parsing documents. +This is useful for microservices architectures and polyglot environments. 
+ +== Topics + +// Add links to specific topics as they are created +// * link:getting-started.html[Getting Started] +// * link:api.html[gRPC API] +// * link:clients.html[Client Libraries] diff --git a/docs/src/main/asciidoc/using-tika/index.adoc b/docs/src/main/asciidoc/using-tika/index.adoc new file mode 100644 index 00000000000..ada34abc4c4 --- /dev/null +++ b/docs/src/main/asciidoc/using-tika/index.adoc @@ -0,0 +1,65 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Getting Started with Apache Tika + +Apache Tika can be used in several ways depending on your needs. Choose the approach +that best fits your use case. + +== Choose Your Integration Method + +xref:java-api/index.adoc[Java API]:: +Use Tika directly in your Java application. Best for tight integration and full control +over parsing behavior. + +xref:cli/index.adoc[Command Line (tika-app)]:: +Run Tika from the command line. Best for quick extraction, scripting, and one-off tasks. + +xref:server/index.adoc[Server (REST API)]:: +Run Tika as a standalone server with a REST API. Best for language-agnostic integration +and microservice architectures. + +xref:grpc/index.adoc[gRPC]:: +Use Tika via gRPC protocol. 
Best for high-performance, cross-language communication. + +== Which Should I Use? + +[cols="1,3"] +|=== +|Use Case |Recommended Approach + +|Java application needing content extraction +|Java API + +|Shell scripts or batch processing +|Command Line + +|Non-Java application (Python, Node.js, etc.) +|Server (REST) or gRPC + +|High-throughput processing pipeline +|Server or gRPC with xref:../pipes/index.adoc[Pipes] + +|Quick one-time extraction +|Command Line +|=== + +== Scalable Processing + +For processing large volumes of documents, see xref:../pipes/index.adoc[Tika Pipes], +which provides fault-tolerant, scalable document processing and works with all of the +above integration methods. diff --git a/docs/src/main/asciidoc/using-tika/java-api/getting-started.adoc b/docs/src/main/asciidoc/using-tika/java-api/getting-started.adoc new file mode 100644 index 00000000000..ff8df846d4b --- /dev/null +++ b/docs/src/main/asciidoc/using-tika/java-api/getting-started.adoc @@ -0,0 +1,130 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + += Getting Started with the Java API + +== Before You Start + +Before embedding Tika directly in your Java application, consider whether a +client-server architecture would better suit your needs. + +=== Recommended: Use tika-server or tika-grpc + +For most use cases, we recommend running Tika as a separate service rather than +embedding it directly: + +* **xref:../server/index.adoc[tika-server]** - REST API, language-agnostic +* **xref:../grpc/index.adoc[tika-grpc]** - High-performance gRPC protocol + +**Why?** + +* **Process isolation** - Parser crashes don't affect your application +* **Easier deployment** - Use official Docker images +* **Language flexibility** - Call from any language, not just Java +* **Simpler upgrades** - Update Tika independently of your application + +Docker images are available at https://hub.docker.com/r/apache/tika[Docker Hub]. + +=== When to Use the Java API + +The Java API is appropriate when you: + +* Need tight integration with Tika internals +* Cannot use a network service +* Have specific customization requirements + +== Using PipesForkParser (Recommended) + +If you must use Tika as a library, use `PipesForkParser` from the +`tika-pipes-fork-parser` module. It provides process isolation to protect your +application from parser crashes, memory leaks, and infinite loops. + +=== Maven Dependency + +[source,xml] +---- +<dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-pipes-fork-parser</artifactId> + <version>${tika.version}</version> +</dependency> +---- + +=== Basic Example + +[source,java] +---- +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.pipes.fork.PipesForkParser; +import org.apache.tika.pipes.fork.PipesForkResult; + +try (PipesForkParser parser = new PipesForkParser(); + TikaInputStream tis = TikaInputStream.get(filePath)) { + + PipesForkResult result = parser.parse(tis); + + if (result.isSuccess()) { + String content = result.getContent(); + // process content...
+ } else { + // handle failure + } +} +---- + +=== Key Features + +* **Process isolation** - Parsing runs in a separate JVM +* **Automatic restart** - If the forked process crashes, it restarts automatically +* **Configurable timeouts** - Prevent infinite loops +* **Thread-safe** - Reuse across multiple threads + +=== Complete Examples + +See +https://github.com/apache/tika/blob/main/tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java[PipesForkParserExample.java] +in the `tika-example` module for comprehensive examples including: + +* Basic parsing +* Handling embedded documents +* Custom configuration +* Error handling +* Batch processing + +== Without Pipes: Understanding the Risks + +If you choose not to use `PipesForkParser` and instead use Tika's parsers directly +(e.g., `AutoDetectParser`), you are responsible for handling the risks of parsing +untrusted content. + +WARNING: Running parsers directly on untrusted data can cause OutOfMemoryErrors, +infinite loops, and crashes that will affect your entire application. + +Before proceeding without process isolation, read: + +* xref:../../advanced/robustness.adoc[The Robustness of Apache Tika] - Understanding parser risks and mitigations +* https://tika.apache.org/security-model.html[Apache Tika Security Model] - Trust boundaries and assumptions + +If you still need to use parsers directly, your application is responsible for +implementing its own process isolation so that you can: + +* Set parse timeouts (Tika cannot enforce timeouts without process isolation) +* Configure memory limits (requires separate JVM) +* Kill runaway processes +* Recover from crashes + +Never run Tika in the same JVM as critical infrastructure. 
diff --git a/docs/src/main/asciidoc/using-tika/java-api/index.adoc b/docs/src/main/asciidoc/using-tika/java-api/index.adoc new file mode 100644 index 00000000000..703a2cf2c22 --- /dev/null +++ b/docs/src/main/asciidoc/using-tika/java-api/index.adoc @@ -0,0 +1,38 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Using Tika as a Library (Java API) + +This section covers using Apache Tika programmatically in your Java applications. + +== Overview + +Tika can be embedded directly into your Java applications as a library. This gives you +full control over parsing, detection, and configuration. + +However, for most use cases we recommend using xref:../server/index.adoc[tika-server] +or xref:../grpc/index.adoc[tika-grpc] instead. See +xref:getting-started.adoc[Getting Started] for guidance on choosing the right approach. 
+ +== Topics + +* xref:getting-started.adoc[Getting Started] - Recommendations and PipesForkParser usage + +// Add links to specific topics as they are created +// * link:parsing.html[Parsing Documents] +// * link:detection.html[Content Detection] +// * link:configuration.html[Configuration] diff --git a/docs/src/main/asciidoc/using-tika/server/index.adoc b/docs/src/main/asciidoc/using-tika/server/index.adoc new file mode 100644 index 00000000000..accfc027007 --- /dev/null +++ b/docs/src/main/asciidoc/using-tika/server/index.adoc @@ -0,0 +1,42 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Tika Server + +This section covers running Apache Tika as a REST server via `tika-server`. + +== Overview + +Tika Server provides a RESTful HTTP interface for parsing documents and extracting +content. It can be deployed as a standalone service or in a containerized environment. + +== Basic Usage + +[source,bash] +---- +java -jar tika-server-standard.jar +---- + +The server starts on port 9998 by default. 
+ +== Topics + +// Add links to specific topics as they are created +// * link:installation.html[Installation] +// * link:endpoints.html[REST Endpoints] +// * link:configuration.html[Configuration] +// * link:docker.html[Docker Deployment] diff --git a/pom.xml b/pom.xml index 9e451d1006e..417aab5f35a 100644 --- a/pom.xml +++ b/pom.xml @@ -62,6 +62,9 @@ apache-release + + docs + ${user.name} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/ConfigExamplesTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/ConfigExamplesTest.java new file mode 100644 index 00000000000..1429984cbf0 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/ConfigExamplesTest.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.config; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.parser.Parser; + +/** + * Validates configuration examples used in documentation. + * <p>
+ * The JSON configuration examples are stored in {@code src/test/resources/config-examples/} + * and are included directly in the AsciiDoc documentation via the {@code include::} directive. + * This test class validates that each example is valid and can be loaded by TikaLoader. + * <p>
+ * Important: When modifying examples in the config-examples directory, + * ensure the JSON remains valid and these tests pass. The documentation will automatically + * reflect your changes. + * <p>
+ * TODO: Consider auto-generating the full config JSON files from the actual config classes + * (e.g., PDFParserConfig, TesseractOCRConfig) during the build process. This would: + * <ul>
+ *   <li>Guarantee JSON always matches actual defaults</li> + *   <li>Automatically catch when fields are added/removed</li> + *   <li>Use Jackson's ORDER_MAP_ENTRIES_BY_KEYS for consistent ordering</li> + * </ul>
+ * Challenge: Jackson doesn't write comments in JSON output, so enum options would need + * to be documented via annotations and a post-processor, or in the AsciiDoc directly. + */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + + @TempDir + Path tempDir; + + private Parser loadAndValidate(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + String json = new String(is.readAllBytes(), StandardCharsets.UTF_8); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + Parser parser = loader.loadParsers(); + assertNotNull(parser, "Parser should not be null for: " + resourceName); + return parser; + } + } + + @Test + public void testPdfParserBasicConfig() throws Exception { + loadAndValidate("pdf-parser-basic.json"); + } + + @Test + public void testPdfParserFullConfig() throws Exception { + loadAndValidate("pdf-parser-full.json"); + } + + @Test + public void testTesseractBasicConfig() throws Exception { + loadAndValidate("tesseract-basic.json"); + } + + @Test + public void testTesseractFullConfig() throws Exception { + loadAndValidate("tesseract-full.json"); + } + + @Test + public void testFullMigrationExample() throws Exception { + loadAndValidate("migration-full-example.json"); + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/migration-full-example.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/migration-full-example.json new file mode 100644 index 00000000000..014a7b69d7e --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/migration-full-example.json @@ -0,0 +1,26 @@
+{ + "parsers": [ + { + "pdf-parser": { + "extractInlineImages": true, + "extractUniqueInlineImagesOnly": true, + "sortByPosition": true, + "maxMainMemoryBytes": 1000000000 + } + }, + { + "tesseract-ocr-parser": { + "language": "eng+fra", + "pageSegMode": "1", + "timeoutSeconds": 300, + "otherTesseractConfig": { + "textord_initialx_ile": "0.75", + "textord_noise_hfract": "0.15625" + } + } + }, + { + "default-parser": {} + } + ] +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-basic.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-basic.json new file mode 100644 index 00000000000..591e214ee67 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-basic.json @@ -0,0 +1,10 @@ +{ + "parsers": [ + { + "pdf-parser": { + "extractInlineImages": true, + "sortByPosition": true + } + } + ] +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-full.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-full.json new file mode 100644 index 00000000000..9f455918de2 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/pdf-parser-full.json @@ -0,0 +1,53 @@ +{ + "parsers": [ + { + "pdf-parser": { + // Options: DONT_CHECK, ALLOW_EXTRACTION_FOR_ACCESSIBILITY, IGNORE_ACCESSIBILITY_ALLOWANCE + "accessCheckMode": "DONT_CHECK", + "averageCharTolerance": 0.3, + "catchIntermediateIOExceptions": true, + "detectAngles": false, + "dropThreshold": 2.5, + "enableAutoSpace": true, + "extractAcroFormContent": true, + "extractActions": false, + "extractAnnotationText": true, + "extractBookmarksText": true, + "extractFontNames": false, + "extractIncrementalUpdateInfo": true, + 
"extractInlineImageMetadataOnly": false, + "extractInlineImages": false, + "extractMarkedContent": false, + "extractUniqueInlineImagesOnly": true, + "ifXFAExtractOnlyXFA": false, + "ignoreContentStreamSpaceGlyphs": false, + // Options: NONE, RAW_IMAGES, RENDER_PAGES_BEFORE_PARSE, RENDER_PAGES_AT_PAGE_END + "imageStrategy": "NONE", + "maxIncrementalUpdates": 10, + "maxMainMemoryBytes": 536870912, + "ocr": { + "dpi": 300, + // Options: PNG, TIFF, JPEG + "imageFormat": "PNG", + "imageQuality": 1.0, + // Options: RGB, GRAY + "imageType": "GRAY", + // Options: NO_TEXT, TEXT_ONLY, VECTOR_GRAPHICS_ONLY, ALL + "renderingStrategy": "ALL", + // Options: AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION + "strategy": "AUTO", + "strategyAuto": { + "totalCharsPerPage": 10, + "unmappedUnicodeCharsPerPage": 10 + } + }, + "parseIncrementalUpdates": false, + "setKCMS": false, + "sortByPosition": false, + "spacingTolerance": 0.5, + "suppressDuplicateOverlappingText": false, + "throwOnEncryptedPayload": false + } + } + ] +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-basic.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-basic.json new file mode 100644 index 00000000000..f41a367acca --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-basic.json @@ -0,0 +1,10 @@ +{ + "parsers": [ + { + "tesseract-ocr-parser": { + "language": "eng", + "timeoutSeconds": 120 + } + } + ] +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-full.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-full.json new file mode 100644 index 00000000000..4e3e75aeaed --- /dev/null +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/config-examples/tesseract-full.json @@ -0,0 +1,35 @@ +{ + "parsers": [ + { + "tesseract-ocr-parser": { + "applyRotation": false, + "colorspace": "gray", + "density": 300, + "depth": 4, + "enableImagePreprocessing": false, + "filter": "triangle", + "imageMagickPath": "", + "inlineContent": false, + "language": "eng", + "maxFileSizeToOcr": 2147483647, + "minFileSizeToOcr": 0, + // Additional Tesseract configuration parameters as key-value pairs + "otherTesseractConfig": { + "preserve_interword_spaces": "1", + "textord_initialx_ile": "0.75", + "textord_noise_hfract": "0.15625" + }, + // Options: TXT, HOCR + "outputType": "TXT", + "pageSeparator": "", + "pageSegMode": "1", + "preserveInterwordSpacing": false, + "resize": 200, + "skipOcr": false, + "tessdataPath": "", + "tesseractPath": "", + "timeoutSeconds": 120 + } + } + ] +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/fs/ConfigExamplesTest.java b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/fs/ConfigExamplesTest.java new file mode 100644 index 00000000000..70fe7947bb3 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/java/org/apache/tika/pipes/fs/ConfigExamplesTest.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fs; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; + +/** + * Validates file system fetcher/emitter configuration examples used in documentation. + * <p>
+ * The JSON configuration examples are stored in {@code src/test/resources/config-examples/} + * and are included directly in the AsciiDoc documentation via the {@code include::} directive. + */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + + @TempDir + Path tempDir; + + private void loadAndValidate(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + String json = new String(is.readAllBytes(), StandardCharsets.UTF_8); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + assertNotNull(loader, "TikaLoader should not be null for: " + resourceName); + } + } + + @Test + public void testFileSystemFetcherConfig() throws Exception { + loadAndValidate("file-system-fetcher.json"); + } + + @Test + public void testFileSystemEmitterConfig() throws Exception { + loadAndValidate("file-system-emitter.json"); + } + + @Test + public void testFileSystemPipelineConfig() throws Exception { + loadAndValidate("file-system-pipeline.json"); + } +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-emitter.json b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-emitter.json new file mode 100644 index 00000000000..4f01761e450 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-emitter.json @@ -0,0 +1,13 @@ +{ + "emitters": [ + { + "file-system-emitter": { + "id": "my-emitter", + "basePath": "/data/output", + "fileExtension": "json", + "onExists": "REPLACE", + "prettyPrint": true + } + } + ] +} diff --git
a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-fetcher.json b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-fetcher.json new file mode 100644 index 00000000000..201d4fa099e --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-fetcher.json @@ -0,0 +1,11 @@ +{ + "fetchers": [ + { + "file-system-fetcher": { + "id": "my-fetcher", + "basePath": "/data/documents", + "extractFileSystemMetadata": true + } + } + ] +} diff --git a/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-pipeline.json b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-pipeline.json new file mode 100644 index 00000000000..3d95755eff9 --- /dev/null +++ b/tika-pipes/tika-pipes-plugins/tika-pipes-file-system/src/test/resources/config-examples/file-system-pipeline.json @@ -0,0 +1,27 @@ +{ + "fetchers": [ + { + "file-system-fetcher": { + "id": "input-fetcher", + "basePath": "/data/input", + "extractFileSystemMetadata": true + } + } + ], + "emitters": [ + { + "file-system-emitter": { + "id": "output-emitter", + "basePath": "/data/output", + "fileExtension": "json", + "onExists": "SKIP", + "prettyPrint": false + } + } + ], + "parsers": [ + { + "default-parser": {} + } + ] +} diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java index 39849524858..b8b7e4389d4 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaObjectMapperFactory.java @@ -16,6 +16,7 @@ */ package org.apache.tika.config.loader; +import com.fasterxml.jackson.core.JsonParser; import 
com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; @@ -68,6 +69,9 @@ public static synchronized ObjectMapper getMapper() { public static ObjectMapper createMapper() { ObjectMapper mapper = new ObjectMapper(); + // Allow comments in JSON config files (// and /* */ style) + mapper.configure(JsonParser.Feature.ALLOW_COMMENTS, true); + // Fail on unknown properties to catch configuration errors early mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, true); diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/ConfigExamplesTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/ConfigExamplesTest.java new file mode 100644 index 00000000000..43dd1391e30 --- /dev/null +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/ConfigExamplesTest.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.server.core; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; + +/** + * Validates server configuration examples used in documentation. + * <p>
+ * The JSON configuration examples are stored in {@code src/test/resources/config-examples/} + * and are included directly in the AsciiDoc documentation via the {@code include::} directive. + */ +public class ConfigExamplesTest { + + private static final String EXAMPLES_DIR = "/config-examples/"; + + @TempDir + Path tempDir; + + private void loadAndValidate(String resourceName) throws Exception { + try (InputStream is = getClass().getResourceAsStream(EXAMPLES_DIR + resourceName)) { + assertNotNull(is, "Resource not found: " + resourceName); + String json = new String(is.readAllBytes(), StandardCharsets.UTF_8); + Path configFile = tempDir.resolve("tika-config.json"); + Files.writeString(configFile, json, StandardCharsets.UTF_8); + TikaLoader loader = TikaLoader.load(configFile); + assertNotNull(loader, "TikaLoader should not be null for: " + resourceName); + } + } + + @Test + public void testServerBasicConfig() throws Exception { + loadAndValidate("server-basic.json"); + } + + @Test + public void testServerWithParsersConfig() throws Exception { + loadAndValidate("server-with-parsers.json"); + } +} diff --git a/tika-server/tika-server-core/src/test/resources/config-examples/server-basic.json b/tika-server/tika-server-core/src/test/resources/config-examples/server-basic.json new file mode 100644 index 00000000000..d133c0deeb1 --- /dev/null +++ b/tika-server/tika-server-core/src/test/resources/config-examples/server-basic.json @@ -0,0 +1,13 @@ +{ + "server": { + "port": 9998, + "host": "localhost", + "taskTimeoutMillis": 300000, + "enableUnsecureFeatures": false + }, + "parsers": [ + { + "default-parser": {} + } + ] +} diff --git a/tika-server/tika-server-core/src/test/resources/config-examples/server-with-parsers.json b/tika-server/tika-server-core/src/test/resources/config-examples/server-with-parsers.json new file mode 100644 index 00000000000..fadb08a55f7 --- /dev/null +++ b/tika-server/tika-server-core/src/test/resources/config-examples/server-with-parsers.json
@@ -0,0 +1,24 @@ +{ + "server": { + "port": 9998, + "host": "0.0.0.0", + "taskTimeoutMillis": 600000, + "returnStackTrace": true + }, + "parsers": [ + { + "pdf-parser": { + "extractInlineImages": true, + "ocr": { "strategy": "AUTO" } + } + }, + { + "default-parser": {} + } + ], + "detectors": [ + { + "default-detector": {} + } + ] +}