Как сохранить файл контрольной точки tensorflow из google colaboratory в режиме tpu?

Когда я использую saver = tf.train.Saver() и save_path = saver.save(session, "checkpointsFolder/checkpoint.ckpt")

Я получаю ошибку UnimplementedError (see above for traceback): File system scheme '[local]' not implemented

Вот полная ошибка

---------------------------------------------------------------------------
UnimplementedError                        Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1333     try:
-> 1334       return fn(*args)
   1335     except errors.OpError as e:

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
   1318       return self._call_tf_sessionrun(
-> 1319           options, feed_dict, fetch_list, target_list, run_metadata)
   1320 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
   1406         self._session, options, feed_dict, fetch_list, target_list,
-> 1407         run_metadata)
   1408 

UnimplementedError: File system scheme '[local]' not implemented (file: 'checkpointsBook2Vec5Inputs')
     [[{{node save/SaveV2}} = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_INT32, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:tpu_worker/replica:0/task:0/device:CPU:0"](_recv_save/Const_0, save/SaveV2/tensor_names, save/SaveV2/shape_and_slices, embeddings, embeddings/Shampoo, embeddings/Shampoo_1, embeddings/Shampoo_2, epochCount, softmax_biases, softmax_weights, softmax_weights/Shampoo, softmax_weights/Shampoo_1, softmax_weights/Shampoo_2)]]

During handling of the above exception, another exception occurred:

UnimplementedError                        Traceback (most recent call last)
<ipython-input-22-ca87cd5e5739> in <module>()
     48             print('recEpoch_indexA is', recEpoch_indexA)
     49 
---> 50             save_path = saver.save(session, "checkpointsBook2Vec5Inputs/Research2VecCS4.ckpt") #Save checkpoint
     51             print( 'epochCount.eval() is ', epochCount.eval() )
     52 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py in save(self, sess, save_path, global_step, latest_filename, meta_graph_suffix, write_meta_graph, write_state, strip_default_attrs)
   1439           model_checkpoint_path = sess.run(
   1440               self.saver_def.save_tensor_name,
-> 1441               {self.saver_def.filename_tensor_name: checkpoint_file})
   1442 
   1443         model_checkpoint_path = compat.as_str(model_checkpoint_path)

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    927     try:
    928       result = self._run(None, fetches, feed_dict, options_ptr,
--> 929                          run_metadata_ptr)
    930       if run_metadata:
    931         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1150     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1151       results = self._do_run(handle, final_targets, final_fetches,
-> 1152                              feed_dict_tensor, options, run_metadata)
   1153     else:
   1154       results = []

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1326     if handle is None:
   1327       return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1328                            run_metadata)
   1329     else:
   1330       return self._do_call(_prun_fn, handle, feeds, fetches)

/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1346           pass
   1347       message = error_interpolation.interpolate(message, self._graph)
-> 1348       raise type(e)(node_def, op, message)
   1349 
   1350   def _extend_graph(self):

UnimplementedError: File system scheme '[local]' not implemented (file: 'checkpointsBook2Vec5Inputs')
     [[node save/SaveV2 (defined at <ipython-input-15-c14caac2081d>:45)  = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_INT32, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:tpu_worker/replica:0/task:0/device:CPU:0"](_recv_save/Const_0, save/SaveV2/tensor_names, save/SaveV2/shape_and_slices, embeddings, embeddings/Shampoo, embeddings/Shampoo_1, embeddings/Shampoo_2, epochCount, softmax_biases, softmax_weights, softmax_weights/Shampoo, softmax_weights/Shampoo_1, softmax_weights/Shampoo_2)]]

Caused by op 'save/SaveV2', defined at:
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python3.6/dist-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2718, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2822, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-c14caac2081d>", line 45, in <module>
    saver = tf.train.Saver()
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1102, in __init__
    self.build()
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1114, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1151, in _build
    build_save=build_save, build_restore=build_restore)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 792, in _build_internal
    save_tensor = self._AddSaveOps(filename_tensor, saveables)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 284, in _AddSaveOps
    save = self.save_op(filename_tensor, saveables)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 202, in save_op
    tensors)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_io_ops.py", line 1690, in save_v2
    shape_and_slices=shape_and_slices, tensors=tensors, name=name)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

UnimplementedError (see above for traceback): File system scheme '[local]' not implemented (file: 'checkpointsBook2Vec5Inputs')
     [[node save/SaveV2 (defined at <ipython-input-15-c14caac2081d>:45)  = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_INT32, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:tpu_worker/replica:0/task:0/device:CPU:0"](_recv_save/Const_0, save/SaveV2/tensor_names, save/SaveV2/shape_and_slices, embeddings, embeddings/Shampoo, embeddings/Shampoo_1, embeddings/Shampoo_2, epochCount, softmax_biases, softmax_weights, softmax_weights/Shampoo, softmax_weights/Shampoo_1, softmax_weights/Shampoo_2)]]

При поиске этой ошибки я обнаружил следующее:

Из официального руководства по отладке TPU от Google

https://cloud.google.com/tpu/docs/troubleshooting

Error Message

InvalidArgumentError: Unimplemented: File system scheme '[local]' not implemented

Details

All input files and the model directory must use a cloud storage bucket path (gs://bucket-name/...), and this bucket must be accessible from the TPU server. Note that all data processing and model checkpointing is performed on the TPU server, not the local machine. For information on how to properly configure cloud storage for use with the TPU, see the guide Connecting to Cloud Storage Buckets.

Кто-то еще с похожей проблемой

Локальная файловая система TPU не существует?

The local filesystem is not available on Cloud TPU's. Model directories (checkpoints etc) and input data should be stored in Google Cloud Storage (and prefixed with "gs://").

More details here

https://cloud.google.com/tpu/docs/storage-buckets

Однако у меня нет сервиса Google Cloud, я просто использую Google Colab. Есть ли способ сохранить контрольную точку Tensorflow в режиме TPU?

3
0
4 564
2
Перейти к ответу Данный вопрос помечен как решенный

Ответы 2

Вы можете создать учетную запись Google Cloud под бесплатный уровень, а затем создать Сегмент GCS. После этого вы можете аутентифицироваться в Colab, чтобы получить доступ на запись к вашей корзине GCS из Colab, выполнив следующие действия:

from google.colab import auth
auth.authenticate_user()

Вот образец записной книжки Colab, который использует Cloud TPU и GCS.

Похоже, это бесплатная пробная версия на 12 месяцев / 300 долларов. Сохраняется ли возможность создавать / использовать корзины после завершения пробной версии?

SantoshGupta7 27.10.2018 01:35

Да, но я считаю, что не бесплатно.

jysohn 27.10.2018 01:41
Ответ принят как подходящий

Другой способ сделать это - переписать модель с помощью Keras и использовать tf.contrib.tpu.keras_to_tpu_model (..) с tf.contrib.tpu.TPUDistributionStrategy (...). Вот небольшой фрагмент кода для этого:

def get_model():
  return keras.Sequential([
    keras.layers.Dense(10, input_shape=(4,), activation=tf.nn.relu, name = "Dense_1"),
    keras.layers.Dense(10, activation=tf.nn.relu, name = "Dense_2"),
    keras.layers.Dense(3, activation=None, name = "logits"),
    keras.layers.Dense(3, activation=tf.nn.softmax, name = "softmax")
  ])

dnn_model = get_model()

dnn_model.compile(optimizer=tf.train.AdagradOptimizer(learning_rate=0.1), 
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_crossentropy'])

tpu_model = tf.contrib.tpu.keras_to_tpu_model(
    dnn_model,
    strategy=tf.contrib.tpu.TPUDistributionStrategy(
        tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)))

# Train the model
tpu_model.fit(
  train_x, train_y,
  steps_per_epoch = steps_per_epoch,
  epochs=epochs,
)

tpu_model.save_weights('./saved_weights.h5', overwrite=True)

Сохраняет ли это также настройку оптимизатора?

SantoshGupta7 30.10.2018 05:49

Нет. Он просто сохранил окончательные изученные веса, которые можно загрузить в ту же модель, используя: prediction_model = get_model () prediction_model.load_weights ('./ saved_weights.h5')

aman2930 30.10.2018 23:16

Другие вопросы по теме