diff --git a/daft/io/lance/lance_data_sink.py b/daft/io/lance/lance_data_sink.py index 9b259466e9..9b46b1d150 100644 --- a/daft/io/lance/lance_data_sink.py +++ b/daft/io/lance/lance_data_sink.py @@ -118,8 +118,10 @@ def _load_existing_dataset(self, lance_module: ModuleType) -> Any | None: try: return lance_module.dataset(self._table_uri, storage_options=self._storage_options) except (ValueError, FileNotFoundError, OSError) as e: - # Check if this is specifically a "dataset not found" error - if "not found" in str(e).lower() or "does not exist" in str(e).lower(): + err_msg = str(e).lower() + # Check if this is specifically a "dataset not found" error or a "not a directory" error + # (e.g. when the target path points to a file instead of a directory) + if "not found" in err_msg or "does not exist" in err_msg or "not a directory" in err_msg: if self._mode == "append": raise ValueError("Cannot append to non-existent Lance dataset.") return None diff --git a/daft/io/lance/lance_scalar_index.py b/daft/io/lance/lance_scalar_index.py index 451c9f98b5..e6ee465c8a 100644 --- a/daft/io/lance/lance_scalar_index.py +++ b/daft/io/lance/lance_scalar_index.py @@ -27,7 +27,7 @@ def __init__( column: str, index_type: str, name: str, - fragment_uuid: str, + index_uuid: str, replace: bool, **kwargs: Any, ) -> None: @@ -35,7 +35,7 @@ def __init__( self.column = column self.index_type = index_type self.name = name - self.fragment_uuid = fragment_uuid + self.index_uuid = index_uuid self.replace = replace self.kwargs = kwargs @@ -51,7 +51,7 @@ def __call__(self, fragment_ids: list[int]) -> bool: index_type=self.index_type, name=self.name, replace=self.replace, - fragment_uuid=self.fragment_uuid, + index_uuid=self.index_uuid, fragment_ids=fragment_ids, **self.kwargs, ) @@ -127,9 +127,8 @@ def create_scalar_index_internal( existing_indices = [] try: existing_indices = lance_ds.list_indices() - except Exception: - # If we can't check existing indices, continue - pass + except Exception as e: + logger.warning("Could not fetch existing indices for removal; old index may not be cleaned up: %s", e) existing_names = {idx["name"] for idx in existing_indices} if name in existing_names: raise ValueError(f"Index with name '{name}' already exists. Set replace=True to replace it.") @@ -180,7 +179,7 @@ def create_scalar_index_internal( column=column, index_type=index_type, name=name, - fragment_uuid=index_id, + index_uuid=index_id, replace=replace, **kwargs, ) @@ -208,9 +207,30 @@ def create_scalar_index_internal( fragment_ids=set(fragment_ids_to_use), index_version=0, ) + + # When replacing, find and remove existing indices with the same name + removed_indices = [] + if replace: + try: + existing_indices = lance_ds.list_indices() + for idx in existing_indices: + if idx["name"] == name: + removed_indices.append( + lance.Index( + uuid=idx["uuid"], + name=idx["name"], + fields=[lance_ds.schema.get_field_index(f) for f in idx["fields"]], + dataset_version=idx.get("dataset_version", lance_ds.version), + fragment_ids=idx.get("fragment_ids", set()), + index_version=0, + ) + ) + except Exception as e: + logger.warning("Could not fetch existing indices for removal; old index may not be cleaned up: %s", e) + create_index_op = lance.LanceOperation.CreateIndex( new_indices=[index], - removed_indices=[], + removed_indices=removed_indices, ) # Commit the index operation atomically diff --git a/pyproject.toml b/pyproject.toml index fc73264289..82eb46deb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ gravitino = ["requests>=2.28.0,<3.0.0"] hudi = ["pyarrow >= 8.0.0,<22.1.0"] huggingface = ["huggingface-hub<1.5.0", "datasets<4.6.0"] iceberg = ["pyiceberg >= 0.7.0, <= 0.11.0, != 0.9.1, != 0.10.0"] -lance = ["pylance<0.40.0"] +lance = ["pylance>=4.0.0,<5.0.0"] numpy = ["numpy<2.4.0"] openai = ["openai<2.21.0", "numpy<2.4.0", "pillow==12.1.1"] pandas = ["pandas<2.4.0"] @@ -108,7 +108,7 @@ dev = [ # Ray "ray[data, client]==2.53.0", # Lance - "pylance==0.39.0", + "pylance==4.0.0", # Iceberg "pyiceberg==0.11.0", "pydantic==2.12.4", diff --git a/tests/io/lancedb/test_lancedb_scalar_index.py b/tests/io/lancedb/test_lancedb_scalar_index.py index 83c56c7216..f9d9fcf568 100644 --- a/tests/io/lancedb/test_lancedb_scalar_index.py +++ b/tests/io/lancedb/test_lancedb_scalar_index.py @@ -197,7 +197,7 @@ def test_build_distributed_index_invalid_index_type(self, multi_fragment_lance_d with pytest.raises( NotImplementedError, - match=r'Only "BTREE", "BITMAP", "NGRAM", "ZONEMAP", "LABEL_LIST", or "INVERTED" or "BLOOMFILTER" are supported for scalar columns. Received INVALID', + match=r"Received INVALID", ): create_scalar_index( uri=dataset_uri, diff --git a/uv.lock b/uv.lock index 015ab04ac1..24069c0d3b 100644 --- a/uv.lock +++ b/uv.lock @@ -1664,7 +1664,7 @@ requires-dist = [ { name = "pyarrow", specifier = ">=8.0.0,<24.0.0" }, { name = "pyarrow", marker = "extra == 'hudi'", specifier = ">=8.0.0,<22.1.0" }, { name = "pyiceberg", marker = "extra == 'iceberg'", specifier = ">=0.7.0,!=0.9.1,!=0.10.0,<=0.11.0" }, - { name = "pylance", marker = "extra == 'lance'", specifier = "<0.40.0" }, + { name = "pylance", marker = "extra == 'lance'", specifier = ">=4.0.0,<5.0.0" }, { name = "ray", extras = ["data", "client"], marker = "sys_platform == 'win32' and extra == 'ray'", specifier = ">=2.10.0,<2.54.0" }, { name = "ray", extras = ["data", "client"], marker = "sys_platform != 'win32' and extra == 'ray'", specifier = ">=2.0.0,<2.54.0" }, { name = "requests", marker = "extra == 'gravitino'", specifier = ">=2.28.0,<3.0.0" }, @@ -1734,7 +1734,7 @@ dev = [ { name = "pyarrow-stubs", specifier = "==20.0.0.20251215" }, { name = "pydantic", specifier = "==2.12.4" }, { name = "pyiceberg", specifier = "==0.11.0" }, - { name = "pylance", specifier = "==0.39.0" }, + { name = "pylance", specifier = "==4.0.0" }, { name = "pymysql", specifier = "==1.1.2" }, { name = "pyodbc", specifier = "==5.3.0" }, { name = "pytest", specifier = "==9.0.2" }, @@ -3580,22 +3580,19 @@ wheels = [ [[package]] name = "lance-namespace" -version = "0.0.21" +version = "0.6.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "lance-namespace-urllib3-client" }, - { name = "pyarrow" }, - { name = "pylance" }, - { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/f8/2d/d82eed4279aaeeeea0c1a49f7f7a5421ab2f462187cb883671beec0960d6/lance_namespace-0.0.21.tar.gz", hash = "sha256:11e0d2e07e8a0b8aa53c27b0aa088f55f7862f712edfababc4b85d001067c1d0", size = 32804, upload-time = "2025-11-14T07:05:53.551Z" } +sdist = { url = "https://files.pythonhosted.org/packages/28/9f/7906ba4117df8d965510285eaf07264a77de2fd283b9d44ec7fc63a4a57a/lance_namespace-0.6.1.tar.gz", hash = "sha256:f0deea442bd3f1056a8e2fed056ae2778e3356517ec2e680db049058b824d131", size = 10666, upload-time = "2026-03-17T17:55:44.977Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a1/7d/36f6b9244052989648534e1ad36a5bb971ba448c0773f1e5bc46a34b0c52/lance_namespace-0.0.21-py3-none-any.whl", hash = "sha256:f76660791ccebcab968f53ac68d2e4253e34ebbd7781f452d932ef28a48e3f9e", size = 25335, upload-time = "2025-11-14T07:05:51.735Z" }, + { url = "https://files.pythonhosted.org/packages/d1/91/aee1c0a04d17f2810173bd304bd444eb78332045df1b0c1b07cebd01f530/lance_namespace-0.6.1-py3-none-any.whl", hash = "sha256:9699c9e3f12236e5e08ea979cc4e036a8e3c67ed2f37ae6f25c5353ab908e1be", size = 12498, upload-time = "2026-03-17T17:55:44.062Z" }, ] [[package]] name = "lance-namespace-urllib3-client" -version = "0.0.21" +version = "0.6.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pydantic" }, @@ -3603,9 +3600,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/7d/10/81e924f60520f66ce90218b538b7f5790721574d0719f527eca7f3f2b1e0/lance_namespace_urllib3_client-0.0.21.tar.gz", hash = "sha256:0c069ac9866c75e2e142ca22ef6b27aaf60adf8c0fa734186b045d353d4a9b6d", size = 134493, upload-time = "2025-11-14T07:05:55.079Z" } +sdist = { url = "https://files.pythonhosted.org/packages/63/a1/8706a2be25bd184acccc411e48f1a42a4cbf3b6556cba15b9fcf4c15cfcc/lance_namespace_urllib3_client-0.6.1.tar.gz", hash = "sha256:31fbd058ce1ea0bf49045cdeaa756360ece0bc61e9e10276f41af6d217debe87", size = 182567, upload-time = "2026-03-17T17:55:46.87Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/71/bf/9c7153c29d3001faa8a9e0c893a139757dcf5be39e997f6c6d60c8f494e5/lance_namespace_urllib3_client-0.0.21-py3-none-any.whl", hash = "sha256:b3e4fe3bbe0d377a7d1011baefbbab51cc688fe56bc3059d30271bd553fc508e", size = 229637, upload-time = "2025-11-14T07:05:54.099Z" }, + { url = "https://files.pythonhosted.org/packages/cd/c7/cb9580602dec25f0fdd6005c1c9ba1d4c8c0c3dc8d543107e5a9f248bba8/lance_namespace_urllib3_client-0.6.1-py3-none-any.whl", hash = "sha256:b9c103e1377ad46d2bd70eec894bfec0b1e2133dae0964d7e4de543c6e16293b", size = 317111, upload-time = "2026-03-17T17:55:45.546Z" }, ] [[package]] @@ -7200,7 +7197,7 @@ crypto = [ [[package]] name = "pylance" -version = "0.39.0" +version = "4.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "lance-namespace" }, @@ -7209,13 +7206,12 @@ dependencies = [ { name = "pyarrow" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/ef/99/a8a610ca0dd5ece26ccbfdb15803a9df1c2ae3a5d97918434c2e43aa25fc/pylance-0.39.0-cp39-abi3-macosx_10_15_x86_64.whl", hash = "sha256:faa6fbf45c345e430f4be75da86071fdab56550e94e657a749b7407b4add3a8f", size = 47094423, upload-time = "2025-11-04T05:35:47.689Z" }, - { url = "https://files.pythonhosted.org/packages/ce/c7/40781533b4596547785bbd828bfddde9f3242249eb4df3aa5a568420bde9/pylance-0.39.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:99b9fe4d884964ad679323bc99c1d3f0ec65266dbc13cb35c358d21cd22c18d7", size = 42942613, upload-time = "2025-11-04T05:24:33.273Z" }, - { url = "https://files.pythonhosted.org/packages/28/70/d1f696c521ab4e9337ab8a8ad64e5d475184d2d5b237d3071e3bee13a6ad/pylance-0.39.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d84e013acb6af5b2b8bda8357f6f963138ab348261cccb7f5a67d6c07a5314db", size = 45086441, upload-time = "2025-11-04T05:19:20.696Z" }, - { url = "https://files.pythonhosted.org/packages/da/e7/c9bb07dbbd690d28bf651e3b6f06e34cf41a40a8549a0fb312939f435f80/pylance-0.39.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc28f23ea894ded1e343c1b16bac0c78d87a7484cc1837c56035532b34d9fd2b", size = 48656564, upload-time = "2025-11-04T05:23:19.931Z" }, - { url = "https://files.pythonhosted.org/packages/45/fd/dd90a3618cbe86fe1de13dc48322f35e893a553e0c7ec4aac0c82761e655/pylance-0.39.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:800da785463141648e24334e238201771a1227541323de4d4ebad78d234a3739", size = 45116876, upload-time = "2025-11-04T05:18:54.479Z" }, - { url = "https://files.pythonhosted.org/packages/18/21/5a3d8ca55e56c24d5a82818d561f1b6aceb0747d0e6cd00021cfb3261668/pylance-0.39.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:56a3e7252d958ad6191e104f0c4d804b6dd9956addf066b77a6b876b78c2aa39", size = 48632562, upload-time = "2025-11-04T05:23:04.298Z" }, - { url = "https://files.pythonhosted.org/packages/ae/3b/bf16ad8410b493f6bc0d8021b07e59e9641c9180f2da4450ba509663e6d4/pylance-0.39.0-cp39-abi3-win_amd64.whl", hash = "sha256:2a0547c36b9796993367fbbce423cc161af99f66bf58bd181b0d4a48af640c50", size = 50506288, upload-time = "2025-11-04T05:41:27.124Z" }, + { url = "https://files.pythonhosted.org/packages/19/29/5152da1261a628c293876917b6185538bd68f4cf1420da6265b5be79d09b/pylance-4.0.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:7310892f3089eeddb1af1fe5c398b71cc483a3015646caceaa2f62fc92b227b2", size = 54420876, upload-time = "2026-03-30T18:18:37.525Z" }, + { url = "https://files.pythonhosted.org/packages/99/ae/7edbbfc18c3be43eedb886e74a17826c09fdf35588b35912f2733779ea43/pylance-4.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57f6a521b1b4b77a62d791850213a854093719c7d76b9641e8abcd445eb73e56", size = 56752552, upload-time = "2026-03-30T18:24:21.331Z" }, + { url = "https://files.pythonhosted.org/packages/ef/88/6d8bda83224bac52806f09d3e211d8886b81500384948a753c4b24c11f35/pylance-4.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e433d6bddd66de99c58e472bc3e8ed1590c7ff4ff7948479254c1c2111a601a8", size = 60305704, upload-time = "2026-03-30T18:35:23.425Z" }, + { url = "https://files.pythonhosted.org/packages/52/f3/8d8369c756c4173ea070f6964213f9b622ac278bd04a058c48d00a549177/pylance-4.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:f36dce83c11cd5d598cb0f64bad7c51fc21ed43df868b9029184a385c6bf4d84", size = 56771233, upload-time = "2026-03-30T18:25:40.012Z" }, + { url = "https://files.pythonhosted.org/packages/66/e6/53e0713440685b1c76e20d72755eca2e531cc182ea9a612b4cb6a15abe50/pylance-4.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9ca03f97f22e0b75f06378c4006d587aba26408122fd066f0e43e2b7a019c67e", size = 60260813, upload-time = "2026-03-30T18:36:07.976Z" }, + { url = "https://files.pythonhosted.org/packages/1e/04/5f22b88c8965d3982f68f67bfe24d756e7b788e10392d2bec6f97f5eb0e3/pylance-4.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:9261c32d3bd6aaab33025a45b20c2f2554804e1bc2a1ec2bfcb06f0c9d2e59b9", size = 65137830, upload-time = "2026-03-30T18:37:33.048Z" }, ] [[package]]