"Attempted to access the data pointer on an invalid python storage" when saving model in TPU mode (Kaggle) · Issue #27578 · huggingface/transformers (original) (raw)

System Info

This error occurs every time I call `trainer.save_model` after fine-tuning a BERT model for sentiment analysis in TPU mode. Everything works fine in GPU mode. I also tried downgrading and upgrading TensorFlow and safetensors, but that did not help. Do you have any suggestions?

Link to that notebook: https://www.kaggle.com/code/phttrnnguyngia/final

trainer.save_model('final-result')

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
File /kaggle/working/env/safetensors/torch.py:13, in storage_ptr(tensor)
     12 try:
---> 13     return tensor.untyped_storage().data_ptr()
     14 except Exception:
     15     # Fallback for torch==1.10

RuntimeError: Attempted to access the data pointer on an invalid python storage.

During handling of the above exception, another exception occurred:

RuntimeError                              Traceback (most recent call last)
Cell In[21], line 2
      1 # save the model
----> 2 trainer.save_model('final-result')

File /kaggle/working/env/transformers/trainer.py:2804, in Trainer.save_model(self, output_dir, _internal_call)
   2801     output_dir = self.args.output_dir
   2803 if is_torch_tpu_available():
-> 2804     self._save_tpu(output_dir)
   2805 elif is_sagemaker_mp_enabled():
   2806     # Calling the state_dict needs to be done on the wrapped model and on all processes.
   2807     os.makedirs(output_dir, exist_ok=True)

File /kaggle/working/env/transformers/trainer.py:2873, in Trainer._save_tpu(self, output_dir)
   2871         xm.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
   2872 else:
-> 2873     self.model.save_pretrained(output_dir, is_main_process=self.args.should_save, save_function=xm.save)
   2874 if self.tokenizer is not None and self.args.should_save:
   2875     self.tokenizer.save_pretrained(output_dir)

File /kaggle/working/env/transformers/modeling_utils.py:2187, in PreTrainedModel.save_pretrained(self, save_directory, is_main_process, state_dict, save_function, push_to_hub, max_shard_size, safe_serialization, variant, token, save_peft_format, **kwargs)
   2183 for shard_file, shard in shards.items():
   2184     if safe_serialization:
   2185         # At some point we will need to deal better with save_function (used for TPU and other distributed
   2186         # joyfulness), but for now this enough.
-> 2187         safe_save_file(shard, os.path.join(save_directory, shard_file), metadata={"format": "pt"})
   2188     else:
   2189         save_function(shard, os.path.join(save_directory, shard_file))

File /kaggle/working/env/safetensors/torch.py:281, in save_file(tensors, filename, metadata)
    250 def save_file(
    251     tensors: Dict[str, torch.Tensor],
    252     filename: Union[str, os.PathLike],
    253     metadata: Optional[Dict[str, str]] = None,
    254 ):
    255     """
    256     Saves a dictionary of tensors into raw bytes in safetensors format.
    257 
   (...)
    279     ```
    280     """
--> 281     serialize_file(_flatten(tensors), filename, metadata=metadata)

File /kaggle/working/env/safetensors/torch.py:460, in _flatten(tensors)
    453 if invalid_tensors:
    454     raise ValueError(
    455         f"You are trying to save a sparse tensors: `{invalid_tensors}` which this library does not support."
    456         " You can make it a dense tensor before saving with `.to_dense()` but be aware this might"
    457         " make a much larger file than needed."
    458     )
--> 460 shared_pointers = _find_shared_tensors(tensors)
    461 failing = []
    462 for names in shared_pointers:

File /kaggle/working/env/safetensors/torch.py:72, in _find_shared_tensors(state_dict)
     70 tensors = defaultdict(set)
     71 for k, v in state_dict.items():
---> 72     if v.device != torch.device("meta") and storage_ptr(v) != 0 and storage_size(v) != 0:
     73         # Need to add device as key because of multiple GPU.
     74         tensors[(v.device, storage_ptr(v), storage_size(v))].add(k)
     75 tensors = list(sorted(tensors.values()))

File /kaggle/working/env/safetensors/torch.py:17, in storage_ptr(tensor)
     14 except Exception:
     15     # Fallback for torch==1.10
     16     try:
---> 17         return tensor.storage().data_ptr()
     18     except NotImplementedError:
     19         # Fallback for meta storage
     20         return 0

File /kaggle/working/env/torch/storage.py:909, in TypedStorage.data_ptr(self)
    907 def data_ptr(self):
    908     _warn_typed_storage_removal()
--> 909     return self._data_ptr()

File /kaggle/working/env/torch/storage.py:913, in TypedStorage._data_ptr(self)
    912 def _data_ptr(self):
--> 913     return self._untyped_storage.data_ptr()

RuntimeError: Attempted to access the data pointer on an invalid python storage.

Who can help?

No response

Information

Tasks

Reproduction

Run the notebook on Kaggle with the TPU accelerator, with the Environment setting set to "Always use latest environment". The input data is included in the notebook.

Expected behavior

The model should be saved successfully, as it is when using a GPU.