Skip to content

Commit

Permalink
Patch release 2.2.5 (#19893)
Browse files Browse the repository at this point in the history
Co-authored-by: thomas chaton <thomas@grid.ai>
Co-authored-by: Luca Antiga <luca.antiga@gmail.com>
  • Loading branch information
3 people authored May 22, 2024
1 parent 2a46b0c commit ac3f1ee
Show file tree
Hide file tree
Showing 16 changed files with 125 additions and 36 deletions.
3 changes: 3 additions & 0 deletions docs/source-app/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,3 +449,6 @@ def find_source():

# ignore all links in any CHANGELOG file
linkcheck_exclude_documents = [r"^(.*\/)*CHANGELOG.*$"]

# ignore the following links (false positive errors during linkcheck)
linkcheck_ignore = ["https://openai.com/"]
3 changes: 1 addition & 2 deletions docs/source-pytorch/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,8 +343,6 @@ def _load_py_module(name: str, location: str) -> ModuleType:
"graphcore": ("https://docs.graphcore.ai/en/latest/", None),
"lightning_habana": ("https://lightning-ai.github.io/lightning-Habana/", None),
"tensorboardX": ("https://tensorboardx.readthedocs.io/en/stable/", None),
# needed for referencing App from lightning scope
"lightning.app": ("https://lightning.ai/docs/app/stable/", None),
# needed for referencing Fabric from lightning scope
"lightning.fabric": ("https://lightning.ai/docs/fabric/stable/", None),
# TODO: these are missing objects.inv
Expand Down Expand Up @@ -626,4 +624,5 @@ def package_list_from_file(file):
"https://stackoverflow.com/questions/66640705/how-can-i-install-grpcio-on-an-apple-m1-silicon-laptop",
"https://github.com/Lightning-AI/lightning/blob/master/examples/pytorch/ipu/mnist_sample.py",
"https://ngc.nvidia.com/catalog/containers/nvidia:nemo", # in ecosystem/asr_nlp_tts.rst
"https://openai.com/",
]
2 changes: 1 addition & 1 deletion requirements/app/app.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
lightning-cloud == 0.5.68 # Must be pinned to ensure compatibility
lightning-cloud == 0.5.69 # Must be pinned to ensure compatibility
packaging
typing-extensions >=4.4.0, <4.10.0
deepdiff >=5.7.0, <6.6.0
Expand Down
12 changes: 8 additions & 4 deletions src/lightning/app/core/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from lightning.app.api.request_types import _APIRequest, _CommandRequest, _DeltaRequest
from lightning.app.core.constants import (
BATCH_DELTA_COUNT,
CHECK_ERROR_QUEUE_INTERVAL,
DEBUG_ENABLED,
FLOW_DURATION_SAMPLES,
FLOW_DURATION_THRESHOLD,
Expand Down Expand Up @@ -165,6 +166,7 @@ def __init__(

self._last_run_time: float = 0.0
self._run_times: list = []
self._last_check_error_queue: float = 0.0

# Path attributes can't get properly attached during the initialization, because the full name
# is only available after all Flows and Works have been instantiated.
Expand Down Expand Up @@ -318,10 +320,12 @@ def batch_get_state_changed_from_queue(q: BaseQueue, timeout: Optional[float] =
return []

def check_error_queue(self) -> None:
exception: Exception = self.get_state_changed_from_queue(self.error_queue) # type: ignore[assignment,arg-type]
if isinstance(exception, Exception):
self.exception = exception
self.stage = AppStage.FAILED
if (time() - self._last_check_error_queue) > CHECK_ERROR_QUEUE_INTERVAL:
exception: Exception = self.get_state_changed_from_queue(self.error_queue) # type: ignore[assignment,arg-type]
if isinstance(exception, Exception):
self.exception = exception
self.stage = AppStage.FAILED
self._last_check_error_queue = time()

@property
def flows(self) -> List[Union[LightningWork, "LightningFlow"]]:
Expand Down
2 changes: 2 additions & 0 deletions src/lightning/app/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def get_lightning_cloud_url() -> str:
LIGHTNING_COMPONENT_PUBLIC_REGISTRY = "https://lightning.ai/v1/components"
LIGHTNING_APPS_PUBLIC_REGISTRY = "https://lightning.ai/v1/apps"
LIGHTNING_MODELS_PUBLIC_REGISTRY = "https://lightning.ai/v1/models"
ENABLE_ORCHESTRATOR = bool(int(os.getenv("ENABLE_ORCHESTRATOR", "1")))

LIGHTNING_CLOUDSPACE_HOST = os.getenv("LIGHTNING_CLOUDSPACE_HOST")
LIGHTNING_CLOUDSPACE_EXPOSED_PORT_COUNT = int(os.getenv("LIGHTNING_CLOUDSPACE_EXPOSED_PORT_COUNT", "0"))
Expand Down Expand Up @@ -99,6 +100,7 @@ def get_lightning_cloud_url() -> str:
SYS_CUSTOMIZATIONS_SYNC_PATH = ".sys-customizations-sync"

BATCH_DELTA_COUNT = int(os.getenv("BATCH_DELTA_COUNT", "128"))
CHECK_ERROR_QUEUE_INTERVAL = float(os.getenv("CHECK_ERROR_QUEUE_INTERVAL", "30"))


def enable_multiple_works_in_default_container() -> bool:
Expand Down
4 changes: 2 additions & 2 deletions src/lightning/app/runners/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
CloudspaceIdRunsBody,
Externalv1LightningappInstance,
Gridv1ImageSpec,
IdGetBody1,
IdGetBody,
ProjectIdCloudspacesBody,
V1BuildSpec,
V1CloudSpace,
Expand Down Expand Up @@ -1027,7 +1027,7 @@ def _api_create_run_instance(
project_id=project_id,
cloudspace_id=cloudspace_id,
id=run_id,
body=IdGetBody1(
body=IdGetBody(
cluster_id=cluster_id,
name=run_name,
desired_state=desired_state,
Expand Down
21 changes: 11 additions & 10 deletions src/lightning/app/runners/multiprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,17 @@ def dispatch(self, *args: Any, open_ui: bool = True, **kwargs: Any):

_set_flow_context()

storage_orchestrator = StorageOrchestrator(
self.app,
self.app.request_queues,
self.app.response_queues,
self.app.copy_request_queues,
self.app.copy_response_queues,
)
self.threads.append(storage_orchestrator)
storage_orchestrator.setDaemon(True)
storage_orchestrator.start()
if constants.ENABLE_ORCHESTRATOR:
storage_orchestrator = StorageOrchestrator(
self.app,
self.app.request_queues,
self.app.response_queues,
self.app.copy_request_queues,
self.app.copy_response_queues,
)
self.threads.append(storage_orchestrator)
storage_orchestrator.setDaemon(True)
storage_orchestrator.start()

if self.start_server:
self.app.should_publish_changes_to_api = True
Expand Down
10 changes: 7 additions & 3 deletions src/lightning/app/utilities/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,14 @@ def create_retry_strategy():
# are going to be alive for a very long time (~ 4 days) but retries every 120 seconds
total=_CONNECTION_RETRY_TOTAL,
backoff_factor=_CONNECTION_RETRY_BACKOFF_FACTOR,
# Any 4xx and 5xx statuses except
# 400 Bad Request
# 401 Unauthorized
# 403 Forbidden
# 404 Not Found
status_forcelist={
408, # Request Timeout
429, # Too Many Requests
*range(500, 600), # Any 5xx Server Error status
402,
*range(405, 600),
},
allowed_methods={
"POST", # Default methods are idempotent, add POST here
Expand Down
7 changes: 7 additions & 0 deletions src/lightning/fabric/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).


## [2.2.5] - 2024-05-23

### Fixed

- Fixed a matrix shape mismatch issue when running a model loaded from a quantized checkpoint (bitsandbytes) ([#19886](https://github.com/Lightning-AI/lightning/pull/19886))


## [2.2.2] - 2024-04-11

### Fixed
Expand Down
12 changes: 6 additions & 6 deletions src/lightning/fabric/plugins/precision/bitsandbytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,9 @@ def quantize_(self, weight: Optional[torch.Tensor] = None, device: Optional[torc
"""Inplace quantize."""
if weight is None:
weight = self.weight.data
if weight.data.type == torch.int8:
# already quantized
return
if weight.data.dtype == torch.int8:
# already quantized
return
assert isinstance(self.weight, bnb.nn.Int8Params)
self.weight = self.quantize(self.weight, weight, device)

Expand Down Expand Up @@ -318,9 +318,9 @@ def quantize_(self, weight: Optional[torch.Tensor] = None, device: Optional[torc
"""Inplace quantize."""
if weight is None:
weight = self.weight.data
if weight.data.type == torch.uint8:
# already quantized
return
if weight.data.dtype == torch.uint8:
# already quantized
return
assert isinstance(self.weight, bnb.nn.Params4bit)
self.weight = self.quantize(self.weight, weight, device)

Expand Down
7 changes: 7 additions & 0 deletions src/lightning/pytorch/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).


## [2.2.5] - 2024-05-23

### Fixed

- Fixed a matrix shape mismatch issue when running a model loaded from a quantized checkpoint (bitsandbytes) ([#19886](https://github.com/Lightning-AI/lightning/pull/19886))


## [2.2.3] - 2024-04-23

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion src/version.info
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.2.4
2.2.5
24 changes: 24 additions & 0 deletions tests/tests_app/core/test_lightning_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -1187,3 +1187,27 @@ def run(self):
def test_lightning_work_stopped():
    """Dispatch an app wrapping ``SimpleWork2`` through the multiprocess runtime with the server disabled.

    The test has no explicit assertions; it passes when ``dispatch()`` returns without raising.
    """
    runtime = MultiProcessRuntime(LightningApp(SimpleWork2()), start_server=False)
    runtime.dispatch()


class FailedWork(LightningWork):
    """Work whose ``run`` unconditionally raises, giving tests a component that always fails."""

    def run(self):
        # A bare ``Exception`` is enough: the test only needs *some* error to be raised by the work.
        raise Exception


class CheckErrorQueueLightningApp(LightningApp):
    """``LightningApp`` subclass whose ``check_error_queue`` simply defers to the parent implementation.

    NOTE(review): the visible ``test_error_queue_check`` instantiates ``LightningApp`` directly, so this
    subclass may be unused -- confirm before relying on it.
    """

    def check_error_queue(self):
        # Pure pass-through; no behavior is added on top of the base class.
        super().check_error_queue()


def test_error_queue_check(monkeypatch):
    """Check that an app running a failing work ends in ``AppStage.FAILED`` and records an error-queue poll."""
    import sys

    from lightning.app.core import app as app_module

    # Stub out ``sys.exit`` for the duration of the test.
    # NOTE(review): presumably the failure path calls ``sys.exit`` -- confirm against the runtime.
    exit_stub = mock.MagicMock()
    monkeypatch.setattr(sys, "exit", exit_stub)
    # Drop the polling interval to zero so the error queue is consulted without waiting out the default delay.
    monkeypatch.setattr(app_module, "CHECK_ERROR_QUEUE_INTERVAL", 0)

    app = LightningApp(FailedWork())
    runtime = MultiProcessRuntime(app, start_server=False)
    runtime.dispatch()

    assert app.stage == AppStage.FAILED
    # The timestamp starts at 0.0, so any other value means the queue was polled at least once.
    assert app._last_check_error_queue != 0.0
10 changes: 5 additions & 5 deletions tests/tests_app/runners/test_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
Externalv1Cluster,
Externalv1LightningappInstance,
Gridv1ImageSpec,
IdGetBody1,
IdGetBody,
ProjectIdProjectclustersbindingsBody,
V1BuildSpec,
V1CloudSpace,
Expand Down Expand Up @@ -508,7 +508,7 @@ def test_basic_auth_enabled(self, tmpdir, monkeypatch):
project_id="test-project-id",
cloudspace_id=mock.ANY,
id=mock.ANY,
body=IdGetBody1(
body=IdGetBody(
desired_state=mock.ANY,
name=mock.ANY,
env=mock.ANY,
Expand Down Expand Up @@ -712,7 +712,7 @@ def test_call_with_queue_server_type_specified(self, tmpdir, lightningapps, monk
cloud_runtime.dispatch()

# calling with no env variable set
body = IdGetBody1(
body = IdGetBody(
desired_state=V1LightningappInstanceState.STOPPED,
env=[],
name=mock.ANY,
Expand All @@ -727,7 +727,7 @@ def test_call_with_queue_server_type_specified(self, tmpdir, lightningapps, monk
monkeypatch.setitem(os.environ, "LIGHTNING_CLOUD_QUEUE_TYPE", "http")
cloud_runtime.backend.client.reset_mock()
cloud_runtime.dispatch()
body = IdGetBody1(
body = IdGetBody(
desired_state=V1LightningappInstanceState.STOPPED,
env=mock.ANY,
name=mock.ANY,
Expand Down Expand Up @@ -998,7 +998,7 @@ def test_call_with_work_app_and_app_comment_command_execution_set(self, lightnin
project_id="test-project-id",
cloudspace_id=mock.ANY,
id=mock.ANY,
body=IdGetBody1(
body=IdGetBody(
desired_state=V1LightningappInstanceState.STOPPED,
name=mock.ANY,
env=[V1EnvVar(name="ENABLE_APP_COMMENT_COMMAND_EXECUTION", value="1")],
Expand Down
8 changes: 6 additions & 2 deletions tests/tests_app/utilities/test_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ def test_find_free_network_port_cloudspace(_, patch_constants):
def test_http_client_retry_post(getconn_mock):
getconn_mock.return_value.getresponse.side_effect = [
mock.Mock(status=500, msg=HTTPMessage()),
mock.Mock(status=429, msg=HTTPMessage()),
mock.Mock(status=599, msg=HTTPMessage()),
mock.Mock(status=405, msg=HTTPMessage()),
mock.Mock(status=200, msg=HTTPMessage()),
]

Expand All @@ -61,14 +62,16 @@ def test_http_client_retry_post(getconn_mock):
mock.call("POST", "/test", body=None, headers=mock.ANY),
mock.call("POST", "/test", body=None, headers=mock.ANY),
mock.call("POST", "/test", body=None, headers=mock.ANY),
mock.call("POST", "/test", body=None, headers=mock.ANY),
]


@mock.patch("urllib3.connectionpool.HTTPConnectionPool._get_conn")
def test_http_client_retry_get(getconn_mock):
getconn_mock.return_value.getresponse.side_effect = [
mock.Mock(status=500, msg=HTTPMessage()),
mock.Mock(status=429, msg=HTTPMessage()),
mock.Mock(status=599, msg=HTTPMessage()),
mock.Mock(status=405, msg=HTTPMessage()),
mock.Mock(status=200, msg=HTTPMessage()),
]

Expand All @@ -80,4 +83,5 @@ def test_http_client_retry_get(getconn_mock):
mock.call("GET", "/test", body=None, headers=mock.ANY),
mock.call("GET", "/test", body=None, headers=mock.ANY),
mock.call("GET", "/test", body=None, headers=mock.ANY),
mock.call("GET", "/test", body=None, headers=mock.ANY),
]
34 changes: 34 additions & 0 deletions tests/tests_fabric/plugins/precision/test_bitsandbytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,3 +230,37 @@ def __init__(self):
assert not keys.missing_keys
assert model.l.weight.device.type == "cuda"
assert model.l.weight.dtype == expected


@RunIf(min_cuda_gpus=1, min_torch="2.1")
@pytest.mark.skipif(not _BITSANDBYTES_AVAILABLE, reason="bitsandbytes unavailable")
def test_load_quantized_checkpoint(tmp_path):
    """Round-trip a quantized state dict: save it from one quantized model, load it into a fresh one."""

    class Model(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(16, 16, bias=False)

        def forward(self, x):
            return self.linear(x)

    def make_quantized_model():
        # Each phase of the test gets its own Fabric instance and its own freshly initialized model.
        fabric = Fabric(accelerator="cuda", devices=1, plugins=BitsandbytesPrecision("nf4-dq"))
        return fabric, fabric.setup(Model())

    checkpoint_path = tmp_path / "checkpoint.pt"

    # Phase 1: run a forward pass on a quantized model, then persist its state dict.
    source_fabric, source_model = make_quantized_model()
    source_model(torch.randn(2, 16, device=source_fabric.device))
    saved_state = source_model.state_dict()
    # The checkpoint contains quantized weights
    saved_weight = saved_state["linear.weight"]
    assert saved_weight.dtype == torch.uint8
    assert saved_weight.shape == (128, 1)
    torch.save(saved_state, checkpoint_path)

    # Phase 2: load the quantized checkpoint into a second, independently set-up model.
    target_fabric, target_model = make_quantized_model()
    loaded_state = torch.load(checkpoint_path)
    target_model.load_state_dict(loaded_state)
    loaded_weight = target_model.linear.weight
    assert loaded_weight.dtype == torch.uint8
    assert loaded_weight.shape == (128, 1)
    # Shapes match during forward (weight is being dequantized during forward)
    target_model(torch.randn(2, 16, device=target_fabric.device))

0 comments on commit ac3f1ee

Please sign in to comment.