Skip to content

Commit f9def54

Browse files
authored
Merge branch 'master' into small-map-task-change
2 parents 2920718 + 6297a98 commit f9def54

File tree

8 files changed

+2041
-94
lines changed

8 files changed

+2041
-94
lines changed

Dockerfile.connector

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,15 @@ ARG VERSION
77

88
RUN apt-get update && apt-get install build-essential -y \
99
&& pip install uv
10-
11-
RUN uv pip install --system --no-cache-dir -U flytekit[connector]==$VERSION \
10+
# Pin pendulum<3.0: Apache Airflow (via flytekitplugins-airflow) imports
11+
# pendulum.tz.timezone() at module load time (airflow/settings.py).
12+
# Pendulum 3.x changed the tz API, causing the connector to crash on startup:
13+
# airflow/settings.py → TIMEZONE = pendulum.tz.timezone("UTC") → AttributeError
14+
# Without this pin, uv resolves to pendulum 3.x which breaks the import chain:
15+
# pyflyte serve connector → load_implicit_plugins → airflow → pendulum → crash
16+
RUN uv pip install --system --no-cache-dir -U \
17+
"pendulum>=2.0.0,<3.0" \
18+
flytekit[connector]==$VERSION \
1219
flytekitplugins-airflow==$VERSION \
1320
flytekitplugins-bigquery==$VERSION \
1421
flytekitplugins-k8sdataservice==$VERSION \

plugins/flytekit-spark/flytekitplugins/spark/connector.py

Lines changed: 378 additions & 34 deletions
Large diffs are not rendered by default.

plugins/flytekit-spark/flytekitplugins/spark/task.py

Lines changed: 231 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from flytekit.models.task import K8sPod
1919

2020
from .models import SparkJob, SparkType
21+
from .utils import is_serverless_config
2122

2223
pyspark_sql = lazy_module("pyspark.sql")
2324
SparkSession = pyspark_sql.SparkSession
@@ -73,17 +74,133 @@ def __post_init__(self):
7374
class DatabricksV2(Spark):
7475
"""
7576
Use this to configure a Databricks task. Task's marked with this will automatically execute
76-
natively onto databricks platform as a distributed execution of spark
77+
natively onto databricks platform as a distributed execution of spark.
7778
78-
Args:
79-
databricks_conf: Databricks job configuration compliant with API version 2.1, supporting 2.0 use cases.
80-
For the configuration structure, visit here.https://docs.databricks.com/dev-tools/api/2.0/jobs.html#request-structure
81-
For updates in API 2.1, refer to: https://docs.databricks.com/en/workflows/jobs/jobs-api-updates.html
82-
databricks_instance: Domain name of your deployment. Use the form <account>.cloud.databricks.com.
79+
Supports both classic compute (clusters) and serverless compute.
80+
81+
Attributes:
82+
databricks_conf (Optional[Dict[str, Union[str, dict]]]): Databricks job configuration
83+
compliant with API version 2.1, supporting 2.0 use cases.
84+
For the configuration structure, visit: https://docs.databricks.com/dev-tools/api/2.0/jobs.html#request-structure
85+
For updates in API 2.1, refer to: https://docs.databricks.com/en/workflows/jobs/jobs-api-updates.html
86+
databricks_instance (Optional[str]): Domain name of your deployment.
87+
Use the form <account>.cloud.databricks.com.
88+
databricks_service_credential_provider (Optional[str]): Provider name for Databricks
89+
Service Credentials for S3 access. Falls back to FLYTE_DATABRICKS_SERVICE_CREDENTIAL_PROVIDER env var.
90+
databricks_token_secret (Optional[str]): Custom name for the K8s secret containing
91+
the Databricks token. Defaults to 'databricks-token' if not specified.
92+
notebook_path (Optional[str]): Path to Databricks notebook
93+
(e.g., "/Users/[email protected]/notebook").
94+
notebook_base_parameters (Optional[Dict[str, str]]): Parameters to pass to the notebook.
95+
96+
Compute Modes:
97+
The connector auto-detects the compute mode based on the databricks_conf contents:
98+
99+
1. Classic Compute (existing cluster):
100+
Provide `existing_cluster_id` in databricks_conf.
101+
102+
2. Classic Compute (new cluster):
103+
Provide `new_cluster` configuration in databricks_conf.
104+
105+
3. Serverless Compute (pre-configured environment):
106+
Provide `environment_key` referencing a pre-configured environment in Databricks.
107+
Do not include `existing_cluster_id` or `new_cluster`.
108+
109+
4. Serverless Compute (inline environment spec):
110+
Provide `environments` array with environment specifications.
111+
Optionally include `environment_key` to specify which environment to use.
112+
Do not include `existing_cluster_id` or `new_cluster`.
113+
114+
Example - Classic Compute with new cluster::
115+
116+
DatabricksV2(
117+
databricks_conf={
118+
"run_name": "my-spark-job",
119+
"new_cluster": {
120+
"spark_version": "13.3.x-scala2.12",
121+
"node_type_id": "m5.xlarge",
122+
"num_workers": 2,
123+
},
124+
},
125+
databricks_instance="my-workspace.cloud.databricks.com",
126+
)
127+
128+
Example - Serverless Compute with pre-configured environment::
129+
130+
DatabricksV2(
131+
databricks_conf={
132+
"run_name": "my-serverless-job",
133+
"environment_key": "my-preconfigured-env",
134+
},
135+
databricks_instance="my-workspace.cloud.databricks.com",
136+
)
137+
138+
Example - Serverless Compute with inline environment spec::
139+
140+
DatabricksV2(
141+
databricks_conf={
142+
"run_name": "my-serverless-job",
143+
"environment_key": "default",
144+
"environments": [{
145+
"environment_key": "default",
146+
"spec": {
147+
"client": "1",
148+
"dependencies": ["pandas==2.0.0", "numpy==1.24.0"],
149+
}
150+
}],
151+
},
152+
databricks_instance="my-workspace.cloud.databricks.com",
153+
)
154+
155+
Note:
156+
Serverless compute has certain limitations compared to classic compute:
157+
- Only Python and SQL are supported (no Scala or R)
158+
- Only Spark Connect APIs are supported (no RDD APIs)
159+
- Must use Unity Catalog for external data sources
160+
- No support for compute-scoped init scripts or libraries
161+
For full details, see: https://docs.databricks.com/en/compute/serverless/limitations.html
162+
163+
Serverless Entrypoint:
164+
Both classic and serverless use the same ``flytetools`` repo for their entrypoints.
165+
Classic uses ``flytekitplugins/databricks/entrypoint.py`` and serverless uses
166+
``flytekitplugins/databricks/entrypoint_serverless.py``. No additional configuration needed.
167+
168+
To override the default, provide ``git_source`` and ``python_file`` in ``databricks_conf``.
169+
170+
AWS Credentials for Serverless:
171+
Databricks serverless does not provide AWS credentials via instance metadata.
172+
To access S3 (for Flyte data), configure a Databricks Service Credential.
173+
174+
The provider name is resolved in this order:
175+
1. ``databricks_service_credential_provider`` in the task config (per-task override)
176+
2. ``FLYTE_DATABRICKS_SERVICE_CREDENTIAL_PROVIDER`` environment variable on the connector (default for all tasks)
177+
178+
The entrypoint will use this to obtain AWS credentials via:
179+
dbutils.credentials.getServiceCredentialsProvider(provider_name)
180+
181+
Notebook Support:
182+
To run a Databricks notebook instead of a Python file, set `notebook_path`.
183+
Parameters can be passed via `notebook_base_parameters`.
184+
185+
Example - Running a notebook::
186+
187+
DatabricksV2(
188+
databricks_conf={
189+
"run_name": "my-notebook-job",
190+
"new_cluster": {...},
191+
},
192+
databricks_instance="my-workspace.cloud.databricks.com",
193+
notebook_path="/Users/[email protected]/my-notebook",
194+
notebook_base_parameters={"param1": "value1"},
195+
)
83196
"""
84197

85198
databricks_conf: Optional[Dict[str, Union[str, dict]]] = None
86199
databricks_instance: Optional[str] = None
200+
databricks_service_credential_provider: Optional[str] = None
201+
databricks_token_secret: Optional[str] = None
202+
notebook_path: Optional[str] = None
203+
notebook_base_parameters: Optional[Dict[str, str]] = None
87204

88205

89206
# This method does not reset the SparkSession since it's a bit hard to handle multiple
@@ -187,7 +304,22 @@ def get_custom(self, settings: SerializationSettings) -> Dict[str, Any]:
187304
job._databricks_conf = cfg.databricks_conf
188305
job._databricks_instance = cfg.databricks_instance
189306

190-
return MessageToDict(job.to_flyte_idl())
307+
# Serialize to dict
308+
custom_dict = MessageToDict(job.to_flyte_idl())
309+
310+
# Add DatabricksV2-specific fields (not part of protobuf)
311+
if isinstance(self.task_config, DatabricksV2):
312+
cfg = cast(DatabricksV2, self.task_config)
313+
if cfg.databricks_service_credential_provider:
314+
custom_dict["databricksServiceCredentialProvider"] = cfg.databricks_service_credential_provider
315+
if cfg.databricks_token_secret:
316+
custom_dict["databricksTokenSecret"] = cfg.databricks_token_secret
317+
if cfg.notebook_path:
318+
custom_dict["notebookPath"] = cfg.notebook_path
319+
if cfg.notebook_base_parameters:
320+
custom_dict["notebookBaseParameters"] = cfg.notebook_base_parameters
321+
322+
return custom_dict
191323

192324
def to_k8s_pod(self, pod_template: Optional[PodTemplate] = None) -> Optional[K8sPod]:
193325
"""
@@ -210,10 +342,101 @@ def to_k8s_pod(self, pod_template: Optional[PodTemplate] = None) -> Optional[K8s
210342

211343
return K8sPod.from_pod_template(pod_template)
212344

345+
def _is_databricks_serverless(self) -> bool:
    """Detect whether we are executing inside Databricks serverless compute.

    Serverless runs on Spark Connect and therefore needs different
    SparkSession handling than the classic runtime.
    """
    env = os.environ

    # Explicit markers set by our serverless entrypoint win outright.
    if env.get("DATABRICKS_SERVERLESS") == "true" or env.get("SPARK_CONNECT_MODE") == "true":
        return True

    # Otherwise we must at least be running on a Databricks runtime.
    if "DATABRICKS_RUNTIME_VERSION" not in env:
        return False

    # Serverless when the task config targets serverless compute, or when
    # the classic runtime's SPARK_HOME variable is absent.
    cfg = self.task_config
    serverless_cfg = isinstance(cfg, DatabricksV2) and is_serverless_config(cfg.databricks_conf or {})
    return serverless_cfg or "SPARK_HOME" not in env
366+
367+
def _get_databricks_serverless_spark_session(self) -> Optional[SparkSession]:
    """Locate the SparkSession injected by the serverless entrypoint.

    The entrypoint publishes the session in several places; probe them in
    decreasing order of reliability:
      1. the synthetic module ``_flyte_spark_session`` in ``sys.modules``
         (survives module reloads)
      2. ``builtins.spark``
      3. the ``spark`` attribute of ``__main__``
      4. pyspark's currently active session

    Returns:
        Optional[SparkSession]: the session, or None if none can be found.
    """
    import sys

    # 1) Synthetic module injected by the entrypoint.
    try:
        mod = sys.modules.get("_flyte_spark_session")
        if mod is not None and getattr(mod, "spark", None) is not None:
            logger.info("Got SparkSession from _flyte_spark_session module")
            return mod.spark
    except Exception as e:
        logger.debug(f"Could not get spark from _flyte_spark_session: {e}")

    # 2) builtins fallback.
    try:
        import builtins

        if getattr(builtins, "spark", None) is not None:
            logger.info("Got SparkSession from builtins")
            return builtins.spark
    except Exception as e:
        logger.debug(f"Could not get spark from builtins: {e}")

    # 3) __main__ fallback.
    try:
        import __main__

        if getattr(__main__, "spark", None) is not None:
            logger.info("Got SparkSession from __main__")
            return __main__.spark
    except Exception as e:
        logger.debug(f"Could not get spark from __main__: {e}")

    # 4) Whatever pyspark reports as the active session.
    try:
        from pyspark.sql import SparkSession

        session = SparkSession.getActiveSession()
        if session:
            logger.info("Got active SparkSession")
            return session
    except Exception as e:
        logger.debug(f"Could not get active SparkSession: {e}")

    logger.warning("Could not obtain SparkSession in serverless environment")
    return None
423+
213424
def pre_execute(self, user_params: ExecutionParameters) -> ExecutionParameters:
214425
import pyspark as _pyspark
215426

216427
ctx = FlyteContextManager.current_context()
428+
429+
# Databricks serverless uses Spark Connect - SparkSession is pre-configured
430+
if self._is_databricks_serverless():
431+
logger.info("Detected Databricks serverless environment - using pre-configured SparkSession")
432+
self.sess = self._get_databricks_serverless_spark_session()
433+
434+
if self.sess is None:
435+
logger.warning("No SparkSession available - task will run without Spark")
436+
437+
return user_params.builder().add_attr("SPARK_SESSION", self.sess).build()
438+
439+
# Standard Spark session creation for non-serverless environments
217440
sess_builder = _pyspark.sql.SparkSession.builder.appName(f"FlyteSpark: {user_params.execution_id}")
218441
if not (ctx.execution_state and ctx.execution_state.mode == ExecutionState.Mode.TASK_EXECUTION):
219442
# If either of above cases is not true, then we are in local execution of this task
@@ -259,7 +482,7 @@ def execute(self, **kwargs) -> Any:
259482
if ctx.execution_state and ctx.execution_state.is_local_execution():
260483
return AsyncConnectorExecutorMixin.execute(self, **kwargs)
261484
except Exception as e:
262-
click.secho(f"Connector failed to run the task with error: {e}", fg="red")
485+
click.secho(f"Connector failed to run the task with error: {e}", fg="red")
263486
click.secho("Falling back to local execution", fg="red")
264487
return PythonFunctionTask.execute(self, **kwargs)
265488

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
def is_serverless_config(databricks_conf: dict) -> bool:
    """
    Detect if the Databricks configuration is for serverless compute.

    Serverless is indicated by having ``environment_key`` or ``environments``
    without any cluster config (``existing_cluster_id`` or ``new_cluster``).

    Args:
        databricks_conf (dict): The databricks job configuration dict.

    Returns:
        bool: True if the configuration targets serverless compute.
    """
    # Any explicit cluster reference means classic compute, regardless of
    # whether serverless-style keys are also present.
    if any(databricks_conf.get(key) is not None for key in ("existing_cluster_id", "new_cluster")):
        return False
    # Serverless requires at least one non-empty serverless indicator.
    return bool(databricks_conf.get("environment_key") or databricks_conf.get("environments"))

0 commit comments

Comments
 (0)