
Tensor Implementation (CPU)

Structure

(Figure: tensor_cpu)

In general, to create an all-zeros tensor:
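A minimal sketch using the libtorch factory functions (the shapes and options shown here are just illustrative):

#include <torch/torch.h>
#include <iostream>

int main() {
  // Factory function: a 2x3 CPU tensor filled with zeros.
  torch::Tensor z = torch::zeros({2, 3});

  // dtype, device and requires_grad can be passed through TensorOptions.
  torch::Tensor z64 = torch::zeros({2, 3}, torch::TensorOptions().dtype(torch::kFloat64));

  std::cout << z << '\n' << z64 << '\n';
}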

Ref:

philosophy

memory internal

OS related

Autograd

Forward

(Figure: forward)

Key Files

Use Case

Take the following block as an example:

	torch::Tensor t0 = torch::rand({2, 2}, torch::TensorOptions().requires_grad(true));
	torch::Tensor t1 = torch::rand({2, 2}, torch::TensorOptions().requires_grad(true));

	torch::Tensor a = torch::mm(t0, t1);
	torch::Tensor b = a + t1;
	torch::Tensor c = b + t0;
	torch::Tensor d = torch::sin(c);
	torch::Tensor e = d.mean();

	e.backward();

Forward Phase

(Figure: usecase0_forward)

In the forward phase, the Variable produced by an operation remembers the corresponding backward function, and the input Variables of the forward operation are recorded as its next edges for the backward phase.
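As a quick check, the following sketch prints the recorded backward functions and their next edges (assuming a recent libtorch where Tensor::grad_fn() and Node::name() are exposed; printed names such as MmBackward0 may differ between versions):

#include <torch/torch.h>
#include <iostream>

int main() {
  torch::Tensor t0 = torch::rand({2, 2}, torch::TensorOptions().requires_grad(true));
  torch::Tensor t1 = torch::rand({2, 2}, torch::TensorOptions().requires_grad(true));

  torch::Tensor a = torch::mm(t0, t1);
  torch::Tensor b = a + t1;

  // Each result remembers the backward function of the op that produced it.
  std::cout << a.grad_fn()->name() << '\n';  // e.g. MmBackward0
  std::cout << b.grad_fn()->name() << '\n';  // e.g. AddBackward0

  // The next edges point at the backward functions of the forward inputs
  // (AccumulateGrad for leaf Variables, another grad_fn otherwise).
  for (const auto& edge : b.grad_fn()->next_edges()) {
    if (edge.function) {
      std::cout << "  next: " << edge.function->name() << '\n';
    }
  }
}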

Backward Phase

Initialization

Prepare the GraphTask and GraphRoot for task execution.

auto Engine::execute(const edge_list& roots,
                     const variable_list& inputs,
                     bool keep_graph,
                     bool create_graph,
                     const edge_list& outputs) -> variable_list {
// ...

  auto graph_task = std::make_shared<GraphTask>(
      keep_graph,
      create_graph,
      worker_device == NO_DEVICE ? 0 : total_depth + 1);

  auto graph_root = std::make_shared<GraphRoot>(roots, inputs);
 
// ...
}

After initialization:

(Figure: initiation)

Compute Dependencies

Starting from the root, the engine counts for each Node how many edges feed into it; a node is not ready to run until all of those incoming gradient contributions have arrived:
auto Engine::compute_dependencies(Node* root, GraphTask& task) -> void {
  std::unordered_set<Node*> seen;
  std::vector<Node*> queue { root };

  auto& dependencies = task.dependencies_;
  while (!queue.empty()) {
    auto fn = queue.back(); queue.pop_back();
    for (const auto& edge : fn->next_edges()) {
      if (auto next_ptr = edge.function.get()) {
        dependencies[next_ptr] += 1;
        const bool was_inserted = seen.insert(next_ptr).second;
        if (was_inserted) queue.push_back(next_ptr);
      }
    }
  }
}

The traversal process (note that, despite the variable name queue, nodes are pushed and popped at the back of a vector, so this is effectively a depth-first walk):

(Figure: compute_dependencies)

Execution

evaluate_function() calls the backward function of a node and routes each output gradient to the corresponding next edge, accumulating it into that node's InputBuffer; once a node's dependency count drops to zero, its buffer is pushed onto the ready queue:
void Engine::evaluate_function(
    std::shared_ptr<GraphTask>& graph_task,
    Node* func,
    InputBuffer& inputs) { 
  const auto opt_parent_stream = (*func).stream(c10::DeviceType::CUDA);
  c10::OptionalStreamGuard parent_stream_guard{opt_parent_stream};

  auto outputs = call_function(graph_task, func, inputs);
  auto& fn = *func;

  int num_outputs = outputs.size();
  if (num_outputs == 0) { 
    return;
  }

  std::lock_guard<std::mutex> lock(graph_task->mutex_);
  for (int i = 0; i < num_outputs; ++i) {
    auto& output = outputs[i];
    const auto& next = fn.next_edge(i);

    bool is_ready = false;
    auto& dependencies = graph_task->dependencies_;
    auto it = dependencies.find(next.function.get());

    if (--it->second == 0) {
      dependencies.erase(it);
      is_ready = true;
    }

    auto& not_ready = graph_task->not_ready_;
    auto not_ready_it = not_ready.find(next.function.get());
    if (not_ready_it == not_ready.end()) {
      InputBuffer input_buffer(next.function->num_inputs());

      // Accumulates into buffer
      const auto opt_next_stream = next.function->stream(c10::DeviceType::CUDA);
      input_buffer.add(next.input_nr,
                       std::move(output),
                       opt_parent_stream,
                       opt_next_stream);

      if (is_ready) {
        auto& queue = ready_queue(input_buffer.device());
        queue.push(
            NodeTask(graph_task, next.function, std::move(input_buffer)));
      } else {
        not_ready.emplace(next.function.get(), std::move(input_buffer));
      }
    } else {
      auto &input_buffer = not_ready_it->second;

      // Accumulates into buffer
      const auto opt_next_stream = next.function->stream(c10::DeviceType::CUDA);
      input_buffer.add(next.input_nr,
                       std::move(output),
                       opt_parent_stream,
                       opt_next_stream);
      if (is_ready) {
        auto& queue = ready_queue(input_buffer.device());
        queue.push(
            NodeTask(graph_task, next.function, std::move(input_buffer)));
        not_ready.erase(not_ready_it);
      }
    }
  }
}

The process of the topological traversal of the DAG:

(Figure: backward_threadmain)
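The scheduling idea boils down to the following toy sketch (the ToyNode and toy_execute names are hypothetical, not PyTorch types): count the dependencies of every reachable node first, then repeatedly pop a ready node, run it, and decrement the counts of its next edges.

#include <iostream>
#include <queue>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

// Every node remembers its next edges; a node becomes ready once all of its
// producers have delivered their contributions, i.e. its count hits zero.
struct ToyNode {
  std::string name;
  std::vector<ToyNode*> next_edges;
};

void toy_execute(ToyNode* root) {
  // Phase 1: count dependencies, like compute_dependencies().
  std::unordered_map<ToyNode*, int> dependencies;
  std::unordered_set<ToyNode*> seen;
  std::vector<ToyNode*> stack{root};
  while (!stack.empty()) {
    ToyNode* fn = stack.back();
    stack.pop_back();
    for (ToyNode* next : fn->next_edges) {
      ++dependencies[next];
      if (seen.insert(next).second) stack.push_back(next);
    }
  }

  // Phase 2: run nodes from a ready queue; finishing a node decrements the
  // dependency count of its next edges and enqueues the ones that reach zero.
  std::queue<ToyNode*> ready;
  ready.push(root);
  while (!ready.empty()) {
    ToyNode* fn = ready.front();
    ready.pop();
    std::cout << "run " << fn->name << '\n';
    for (ToyNode* next : fn->next_edges) {
      if (--dependencies[next] == 0) ready.push(next);
    }
  }
}

int main() {
  // d -> {b, c}, b -> {a}, c -> {a}: a has to wait for both b and c.
  ToyNode a{"a", {}}, b{"b", {&a}}, c{"c", {&a}}, d{"d", {&b, &c}};
  toy_execute(&d);  // prints: run d, run b, run c, run a
}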

Detach

(Figure: detach)

Tensor.detach() performs a shallow copy of the original Tensor: the returned Tensor shares the same storage, but it no longer takes part in gradient computation because its AutogradMeta is set to nullptr.
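A minimal usage sketch:

#include <torch/torch.h>
#include <cassert>

int main() {
  torch::Tensor x = torch::rand({2, 2}, torch::TensorOptions().requires_grad(true));
  torch::Tensor y = x.detach();

  // The detached tensor shares storage with x, but it carries no
  // AutogradMeta, so it does not require grad.
  assert(y.data_ptr() == x.data_ptr());
  assert(!y.requires_grad());
}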

Some Code

intrusive_ptr.h

PyTorch defines intrusive_ptr as an alternative to shared_ptr: the reference count is embedded in the managed object itself, so no separate control block has to be allocated. The idea is the same as Boost's intrusive_ptr:

boost intrusive_ptr

intrusive_ptr_target

A type that is to be wrapped in an intrusive_ptr must inherit from intrusive_ptr_target.

class C10_API intrusive_ptr_target {

It provides mutable std::atomic<size_t> members so that reference counting is thread-safe.

  mutable std::atomic<size_t> refcount_;
  mutable std::atomic<size_t> weakcount_;

The counters are not public members; friend declarations grant access to them.

  template <typename T, typename NullType>
  friend class intrusive_ptr;
  friend inline void raw::intrusive_ptr::incref(intrusive_ptr_target* self);

The destructor is declared protected to prevent end users from deleting the target directly.

 protected:
  // protected destructor. We never want to destruct intrusive_ptr_target*
  // directly.
  virtual ~intrusive_ptr_target() {

The function that performs the actual cleanup work is declared as a private virtual member:

 private:
  virtual void release_resources() {}

extra cleanup virtual function
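Putting this together, a user-defined target type could look like the following sketch (the MyBuffer name is hypothetical; release_resources() only needs to be overridden when there is extra cleanup to do):

#include <c10/util/intrusive_ptr.h>

#include <cstddef>
#include <vector>

struct MyBuffer : public c10::intrusive_ptr_target {
  explicit MyBuffer(std::size_t n) : data(n) {}

  // Optional hook for cleanup that should happen before destruction.
  void release_resources() override {
    data.clear();
    data.shrink_to_fit();
  }

  std::vector<float> data;
};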

intrusive_ptr

A null type is defined to stand in for nullptr; it exposes a constexpr noexcept static singleton() function:

namespace detail {
template <class TTarget>
struct intrusive_target_default_null_type final {
  static constexpr TTarget* singleton() noexcept {
    return nullptr;
  }
};
} // namespace detail

Why is an extra null type defined? It lets the null value be customized per target type: for example, intrusive_ptr<TensorImpl, UndefinedTensorImpl> uses UndefinedTensorImpl::singleton() rather than nullptr as its null value.

The intrusive_ptr class is defined as:

template <
    class TTarget,
    class NullType = detail::intrusive_target_default_null_type<TTarget>>
class intrusive_ptr final {

The inner resource is declared as a raw pointer:

  TTarget* target_;

Other specializations of intrusive_ptr are declared as friends, which enables implicit conversion between intrusive_ptr types whenever the wrapped pointer types are convertible.

  template <class TTarget2, class NullType2>
  friend class intrusive_ptr;

retain_() increments the reference counter. If the counter had already reached zero, the object is already being destroyed, so a debug assertion reports the error. This is possible because intrusive_ptr is declared as a friend of intrusive_ptr_target.

  void retain_() {
    if (target_ != NullType::singleton()) {
      size_t new_refcount = ++target_->refcount_;
      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
          new_refcount != 1,
          "intrusive_ptr: Cannot increase refcount after it reached zero.");
    }
  }

reset_() removes the reference to the current object; note that NullType::singleton() is used rather than a literal nullptr:

  void reset_() noexcept {
    if (target_ != NullType::singleton() && --target_->refcount_ == 0) {
      const_cast<std::remove_const_t<TTarget>*>(target_)->release_resources();

      if (--target_->weakcount_ == 0) {
        delete target_;
      }
    }
    target_ = NullType::singleton();
  }

The raw-pointer constructor is declared private and explicit, since an intrusive_ptr is only supposed to be created through make_intrusive, which also manages the counters.

  explicit intrusive_ptr(TTarget* target) noexcept : target_(target) {}

Type alias:

 public:
  using element_type = TTarget;

The default constructor sets the target to NullType::singleton(), presumably so that intrusive_ptr can be default-constructed, e.g. when stored in containers.

  intrusive_ptr() noexcept : intrusive_ptr(NullType::singleton()) {}

The move constructor takes over rhs.target_ and resets rhs.target_ to NullType::singleton(). (Why not just swap? In a constructor there is no previously held target_ to give back to rhs; the member initializer already copies rhs.target_, so nulling rhs out is all that is left to do.)

  intrusive_ptr(intrusive_ptr&& rhs) noexcept : target_(rhs.target_) {
    rhs.target_ = NullType::singleton();
  }

If the target pointer types are convertible, the corresponding intrusive_ptr types are convertible as well. The moved-from rhs is reset to its null value.

  template <class From, class FromNullType>
  /* implicit */ intrusive_ptr(intrusive_ptr<From, FromNullType>&& rhs) noexcept
      : target_(detail::assign_ptr_<TTarget, NullType, FromNullType>(rhs.target_)) {
    static_assert(
        std::is_convertible<From*, TTarget*>::value,
        "Type mismatch. intrusive_ptr move constructor got pointer of wrong type.");
    rhs.target_ = FromNullType::singleton();
  }

The copy constructor increases the reference counter:

  intrusive_ptr(const intrusive_ptr& rhs) : target_(rhs.target_) {
    retain_();
  }

The destructor decreases the reference counter and releases the resources once no references remain:

  ~intrusive_ptr() noexcept {
    reset_();
  }

The assignment operators are not reproduced here.

Safe Bool Idiom

  operator bool() const noexcept {
    return target_ != NullType::singleton();
  }

To construct an intrusive_ptr object, the end user must go through the static make() function (exposed as c10::make_intrusive). That is why the raw-pointer constructor is declared private.

  template <class... Args>
  static intrusive_ptr make(Args&&... args) {
    auto result = intrusive_ptr(new TTarget(std::forward<Args>(args)...));
    // We can't use retain_(), because we also have to increase weakcount
    // and because we allow raising these values from 0, which retain_()
    // has an assertion against.
    ++result.target_->refcount_;
    ++result.target_->weakcount_;

    return result;
  }

Other operators have been defined as non-member functions.
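An end-to-end usage sketch, reusing the hypothetical MyBuffer type defined above (use_count() and reset() are part of the public intrusive_ptr interface):

void example() {
  // make_intrusive forwards its arguments to intrusive_ptr<MyBuffer>::make,
  // which sets refcount_ and weakcount_ to 1.
  c10::intrusive_ptr<MyBuffer> p = c10::make_intrusive<MyBuffer>(16);

  {
    c10::intrusive_ptr<MyBuffer> q = p;  // copy constructor -> retain_()
    // p.use_count() == 2 here
  }                                      // ~intrusive_ptr() -> reset_()

  // p.use_count() == 1 again; once p goes out of scope the buffer is freed.
}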

Tensor.h Template

Background Knowledge

Anonymous Namespace

cppreference

stackoverflow

internal linkage — names in an unnamed namespace are visible only within their translation unit (see the sketch below)
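A minimal sketch of the effect (the names are illustrative):

#include <iostream>

namespace {
// Everything in an unnamed namespace has internal linkage: the names are
// visible only inside this translation unit, so they cannot collide with
// identically named symbols defined in other .cpp files.
int counter = 0;

int next_id() {
  return ++counter;
}
} // namespace

int main() {
  std::cout << next_id() << '\n';  // prints 1
}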

template keyword

When a member template is called on an object whose type depends on a template parameter, the call must be prefixed with the template keyword; otherwise the < that follows would be parsed as a less-than operator:

template<class Key, class Value, class Iterator>
class DictEntryRef final {
public:
  explicit DictEntryRef(Iterator iterator)
  : iterator_(std::move(iterator)) {}

  Key key() const {
    return iterator_->first.template to<Key>();
  }
//...
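A self-contained sketch of the same rule (the Box and as_double names are made up for illustration):

template <class T>
struct Box {
  T value;

  template <class U>
  U as() const {
    return static_cast<U>(value);
  }
};

template <class T>
double as_double(const Box<T>& b) {
  // Box<T> is a dependent type here, so the compiler cannot know that 'as'
  // names a member template; without 'template' the '<' would parse as a
  // less-than operator. Same reason as 'first.template to<Key>()' above.
  return b.template as<double>();
}

int main() {
  Box<int> b{42};
  return as_double(b) == 42.0 ? 0 : 1;
}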

const

  const DictEntryRef<Key, Value, Iterator>& operator*() const {
      return entryRef_;
  }

entryRef_ is not itself a const object, but inside a const member function it is accessed through a const this pointer and is therefore treated as const. Returning a const reference is what allows operator*() to be declared const.
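A minimal illustration with a made-up Holder type:

#include <string>

struct Holder {
  std::string name_;

  // Inside a const member function, name_ is seen as a const std::string,
  // so the returned reference has to be const as well.
  const std::string& name() const {
    return name_;
  }

  // std::string& bad() const { return name_; }  // would not compile
};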

copy-and-swap

class  Blob final : public c10::intrusive_ptr_target {
 public:
  Blob() noexcept : meta_(), pointer_(nullptr), has_ownership_(false) {}
  ~Blob() {
    Reset();
  }

  Blob(Blob&& other) noexcept : Blob() {
    swap(other);
  }

  void Reset() {
    free_();
    pointer_ = nullptr;
    meta_ = TypeMeta();
    has_ownership_ = false;
  }
 
  void swap(Blob& rhs) {
    using std::swap;
    swap(meta_, rhs.meta_);
    swap(pointer_, rhs.pointer_);
    swap(has_ownership_, rhs.has_ownership_);
  }
};

An application of the copy-and-swap idiom (here in its move-and-swap form, for the move assignment operator):

  Blob& operator=(Blob&& other) noexcept {
    Blob(std::move(other)).swap(*this);
    return *this;
  }

The object other is moved into operator=(), so the function takes responsibility for both other's resources and its own. Constructing a temporary Blob from other and swapping it with *this hands the old resources of *this to the temporary, which releases them as soon as it is destroyed; since only pointers and flags are exchanged, the operator can be declared noexcept.

copy-and-swap

stackoverflow

Variadic Templates

An example (torch/aten/src/ATen/core/Variadic.h):

  template <typename... Args>
  inline F& apply() {
    return self();
  }

  template <typename T, typename... Args>
  inline F& apply(T&& arg, Args&&... args) {
    self()(std::forward<T>(arg));
    if (self().short_circuit()) {
      return self();
    } else {
      return apply(std::forward<Args>(args)...);
    }
  }
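The same head-and-tail recursion outside of the CRTP context, as a standalone sketch (print_all is a made-up name):

#include <iostream>
#include <utility>

// Base case: no arguments left to process.
inline void print_all() {}

// Recursive case: consume the first argument, then recurse on the rest,
// mirroring the two apply() overloads above.
template <typename T, typename... Args>
void print_all(T&& first, Args&&... rest) {
  std::cout << std::forward<T>(first) << '\n';
  print_all(std::forward<Args>(rest)...);
}

int main() {
  print_all(1, 2.5, "three");
}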

variadic template