(learn&think)

C++11 Memory Model and Atomic

2017-06-11T23:47:41+08:00

C++11 Atomic
Memory Model and Order
More

C++11 Atomic¹

C++11 Atomic 可简单分为 4 部分:

atomic 类
对 atomic 类型的操作函数
atomic_flag 类
内存序列同步相关操作

`atomic` 类

主要分为四种模板类:

基本 std::atomic
```
template< class T >
struct atomic;
```
整形(Integral)的特化
```
template<>
struct atomic;
```
bool 的特化
```
template<>
struct atomic<bool>;
```
指针的特化
```
template< class T >
struct atomic<T*>;
```

bool 和 integral 类型:

std::atomic_bool        std::atomic<bool>

std::atomic_char        std::atomic<char>
std::atomic_schar       std::atomic<signed char>
std::atomic_uchar       std::atomic<unsigned char>
std::atomic_short       std::atomic<short>
std::atomic_ushort      std::atomic<unsigned short>
std::atomic_int std::atomic<int>
std::atomic_uint        std::atomic<unsigned int>
std::atomic_long        std::atomic<long>
std::atomic_ulong       std::atomic<unsigned long>
std::atomic_llong       std::atomic<long long>
std::atomic_ullong      std::atomic<unsigned long long>
std::atomic_char16_t    std::atomic
std::atomic_char32_t    std::atomic
std::atomic_wchar_t     std::atomic<wchar_t>
std::atomic_int8_t      std::atomic<std::int8_t>
std::atomic_uint8_t     std::atomic<std::uint8_t>
std::atomic_int16_t     std::atomic<std::int16_t>
std::atomic_uint16_t    std::atomic<std::uint16_t>
std::atomic_int32_t     std::atomic<std::int32_t>
std::atomic_uint32_t    std::atomic<std::uint32_t>
std::atomic_int64_t     std::atomic<std::int64_t>
std::atomic_uint64_t    std::atomic<std::uint64_t>
std::atomic_int_least8_t        std::atomic<std::int_least8_t>
std::atomic_uint_least8_t       std::atomic<std::uint_least8_t>
std::atomic_int_least16_t       std::atomic<std::int_least16_t>
std::atomic_uint_least16_t      std::atomic<std::uint_least16_t>
std::atomic_int_least32_t       std::atomic<std::int_least32_t>
std::atomic_uint_least32_t      std::atomic<std::uint_least32_t>
std::atomic_int_least64_t       std::atomic<std::int_least64_t>
std::atomic_uint_least64_t      std::atomic<std::uint_least64_t>
std::atomic_int_fast8_t std::atomic<std::int_fast8_t>
std::atomic_uint_fast8_t        std::atomic<std::uint_fast8_t>
std::atomic_int_fast16_t        std::atomic<std::int_fast16_t>
std::atomic_uint_fast16_t       std::atomic<std::uint_fast16_t>
std::atomic_int_fast32_t        std::atomic<std::int_fast32_t>
std::atomic_uint_fast32_t       std::atomic<std::uint_fast32_t>
std::atomic_int_fast64_t        std::atomic<std::int_fast64_t>
std::atomic_uint_fast64_t       std::atomic<std::uint_fast64_t>
std::atomic_intptr_t    std::atomic<std::intptr_t>
std::atomic_uintptr_t   std::atomic<std::uintptr_t>
std::atomic_size_t      std::atomic<std::size_t>
std::atomic_ptrdiff_t   std::atomic<std::ptrdiff_t>
std::atomic_intmax_t    std::atomic<std::intmax_t>
std::atomic_uintmax_t   std::atomic<std::uintmax_t>

基本模板类定义:

template < class T > struct atomic {
    bool is_lock_free() const volatile;
    bool is_lock_free() const;
    void store(T, memory_order = memory_order_seq_cst) volatile;
    void store(T, memory_order = memory_order_seq_cst);
    T load(memory_order = memory_order_seq_cst) const volatile;
    T load(memory_order = memory_order_seq_cst) const;
    operator  T() const volatile;
    operator  T() const;
    T exchange(T, memory_order = memory_order_seq_cst) volatile;
    T exchange(T, memory_order = memory_order_seq_cst);
    bool compare_exchange_weak(T &, T, memory_order, memory_order) volatile;
    bool compare_exchange_weak(T &, T, memory_order, memory_order);
    bool compare_exchange_strong(T &, T, memory_order, memory_order) volatile;
    bool compare_exchange_strong(T &, T, memory_order, memory_order);
    bool compare_exchange_weak(T &, T, memory_order = memory_order_seq_cst) volatile;
    bool compare_exchange_weak(T &, T, memory_order = memory_order_seq_cst);
    bool compare_exchange_strong(T &, T, memory_order = memory_order_seq_cst) volatile;
    bool compare_exchange_strong(T &, T, memory_order = memory_order_seq_cst);
    atomic() = default;
    constexpr atomic(T);
    atomic(const atomic &) = delete;
    atomic & operator=(const atomic &) = delete;
    atomic & operator=(const atomic &) volatile = delete;
    T operator=(T) volatile;
    T operator=(T);
};

Integral 特有的函数:

integral fetch_add(integral, memory_order = memory_order_seq_cst) volatile;
integral fetch_add(integral, memory_order = memory_order_seq_cst);

integral fetch_sub(integral, memory_order = memory_order_seq_cst) volatile;
integral fetch_sub(integral, memory_order = memory_order_seq_cst);

integral fetch_and(integral, memory_order = memory_order_seq_cst) volatile;
integral fetch_and(integral, memory_order = memory_order_seq_cst);

integral fetch_or(integral, memory_order = memory_order_seq_cst) volatile;
integral fetch_or(integral, memory_order = memory_order_seq_cst);

integral fetch_xor(integral, memory_order = memory_order_seq_cst) volatile;
integral fetch_xor(integral, memory_order = memory_order_seq_cst);

integral operator++(int) volatile;
integral operator++(int);
integral operator--(int) volatile;
integral operator--(int);
integral operator++() volatile;
integral operator++();
integral operator--() volatile;
integral operator--();
integral operator+=(integral) volatile;
integral operator+=(integral);
integral operator-=(integral) volatile;
integral operator-=(integral);
integral operator&=(integral) volatile;
integral operator&=(integral);
integral operator|=(integral) volatile;
integral operator|=(integral);
integral operator^=(integral) volatile;
integral operator^=(integral);

指针特有的函数

T* fetch_add(ptrdiff_t, memory_order = memory_order_seq_cst) volatile;
T* fetch_add(ptrdiff_t, memory_order = memory_order_seq_cst);

T* fetch_sub(ptrdiff_t, memory_order = memory_order_seq_cst) volatile;
T* fetch_sub(ptrdiff_t, memory_order = memory_order_seq_cst);

  T* operator=(T*) volatile;
  T* operator=(T*);
  T* operator++(int) volatile;
  T* operator++(int);
  T* operator--(int) volatile;
  T* operator--(int);
  T* operator++() volatile;
  T* operator++();
  T* operator--() volatile;
  T* operator--();
  T* operator+=(ptrdiff_t) volatile;
  T* operator+=(ptrdiff_t);
  T* operator-=(ptrdiff_t) volatile;
  T* operator-=(ptrdiff_t);

`atomic` 类型的操作函数

除了 atomic 类的成员函数,也提供了对其操作的函数:

atomic_is_lock_free: checks if the atomic type’s operations are lock-free
atomic_store and atomic_store_explicit: atomically replaces the value of the atomic object with a non-atomic argument
atomic_load and atomic_load_explicit: atomically obtains the value stored in an atomic object
atomic_exchange and atomic_exchange_explicit: atomically replaces the value of the atomic object with non-atomic argument and returns the old value of the atomic
atomic_compare_exchange_weak atomic_compare_exchange_weak_explicit atomic_compare_exchange_strong atomic_compare_exchange_strong_explicit: atomically compares the value of the atomic object with non-atomic argument and performs atomic exchange if equal or atomic load if not
atomic_fetch_add atomic_fetch_add_explicit: adds a non-atomic value to an atomic object and obtains the previous value of the atomic
atomic_fetch_sub atomic_fetch_sub_explicit: subtracts a non-atomic value from an atomic object and obtains the previous value of the atomic
atomic_fetch_and atomic_fetch_and_explicit: replaces the atomic object with the result of logical AND with a non-atomic argument and obtains the previous value of the atomic
atomic_fetch_or atomic_fetch_or_explicit: replaces the atomic object with the result of logical OR with a non-atomic argument and obtains the previous value of the atomic
atomic_fetch_xor atomic_fetch_xor_explicit: replaces the atomic object with the result of logical XOR with a non-atomic argument and obtains the previous value of the atomic

`atomic_flag` 类

atomic_flag 是一种原子布尔类型，不同于 std::atomic, 不提供 load 或 store 操作,只支持两种操作， test_and_set 和 clear 。

atomic_flag() noexcept = default;
atomic_flag (const atomic_flag&T) = delete;

std::atomic_flag 只有默认构造函数，拷贝构造函数已被禁用. 一般使用 ATOMIC_FLAG_INIT 初始化为 clear 状态.

内存序列同步相关操作

memory_order: defines memory ordering constraints for the given atomic operation

enum memory_order {
    memory_order_relaxed,
    memory_order_consume,
    memory_order_acquire,
    memory_order_release,
    memory_order_acq_rel,
    memory_order_seq_cst
};

kill_dependency: removes the specified object from the std::memory_order_consume dependency tree
atomic_thread_fence: Establishes memory synchronization ordering of non-atomic and relaxed atomic accesses, as instructed by order, without an associated atomic operation.
atomic_signal_fence: Establishes memory synchronization ordering of non-atomic and relaxed atomic accesses, as instructed by order, between a thread and a signal handler executed on the same thread. This is equivalent to std::atomic_thread_fence, except no CPU instructions for memory ordering are issued. Only reordering of the instructions by the compiler is suppressed as order instructs.

Memory Model and Order

在浅谈 Memory Reordering中提及编译开发者和处理器制造商遵循的中心内存排序准则是: 不能改变单线程程序的行为. 从而产生了:

Memory ordering at compile time: 编译优化造成
Memory ordering at processor time: CPU 允许乱序机器指令优化造成

在多核多线程时代，当多线程共享某一变量时，不同线程对共享变量的读写就应该格外小心，不适当的乱序执行可能导致程序运行错误。所以必须对编译器和 CPU 作出一定的约束才能合理正确地优化你的程序，这个约束就是 内存模型 (Memory Model) .

或者说,程序转化成机器指令执行时并不按照之前的原始代码顺序执行,所以内存模型是程序员、编译器，CPU 之间的准则约束,遵守这一准则约束后,大家各自做优化, 从而尽可能提高程序的性能。

wiki 上的 Memory model给出一个比较抽象的描述: In computing, a memory model describes the interactions of threads through memory and their shared use of the data.

C++11 中规定了 6 种访存次序(Memory Order)，如下：

enum memory_order {
    memory_order_relaxed,
    memory_order_consume,
    memory_order_acquire,
    memory_order_release,
    memory_order_acq_rel,
    memory_order_seq_cst
};

上面 C++11 Atomic 涉及 memory_order 的接口, 默认值是 std::memory_order_seq_cst .

可以把上述 6 种访存次序(内存序)分为 3 类，顺序一致性模型 (memory_order_seq_cst)，Acquire-Release 模型 (memory_order_consume, memory_order_acquire, memory_order_release, memory_order_acq_rel) 和 Relax 模型 (memory_order_relaxed).

memory_order_relaxed: all reorderings are okay²
memory_order_acquire: guarantees that subsequent loads are not moved before the current load or any preceding loads.
memory_order_release: preceding stores are not moved past the current store or any subsequent stores.
memory_order_acq_rel: combines the two previous guarantees.
memory_order_consume: potentially weaker form of memory_order_acquire that enforces ordering of the current load before other operations that are data-dependent on it (for instance, when a load of a pointer is marked memory_order_consume, subsequent operations that dereference this pointer won’t be moved before it (yes, even that is not guaranteed on all platforms!).
memory_order_scq_cst: 是 memory_order_acq_rel 的加强版，除了有 acq_rel 语义，还保证是sequencially-consistent.

C++ 多线程与内存模型资料汇总
Herb Sutter 的 talk
- Atomic Weapons 1
- Atomic Weapon 2
C++ atomics and memory ordering

Footnotes:

http://en.cppreference.com/w/cpp/atomic

https://bartoszmilewski.com/2008/12/01/c-atomics-and-memory-ordering/

Make Colorful Equations With Mathjax

2014-12-15T22:18:35+08:00

$\begin{align} \definecolor{x}{RGB}{114,0,172} \definecolor{mean}{RGB}{45,177,93} \definecolor{var}{RGB}{251,0,29} \definecolor{e}{RGB}{18,110,213} f(\color{x} x,\color{mean} \mu,\color{var} \sigma \color{black} ) = \frac{1}{\color{var}\sigma \color{black} \sqrt{2\pi}} \color{e} e \color{black} ^{- \frac{( {\color{x}x} - {\color{mean} \mu})^2} {2{ \color{var} \sigma}^2}} \end{align}$

其中, $\mu$ 是分布的均值或期望值, 而 $\sigma$ 是它的标准差, $\sigma^2$ 则是方差.

加载 Mathjax Color extension

--- a/head.html
+++ b/head.html
@@ -13,7 +13,8 @@
     skipTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code']
   },
   messageStyle: "none",
-  "HTML-CSS": { preferredFont: "TeX", availableFonts: ["STIX","TeX"] }
+  "HTML-CSS": { preferredFont: "TeX", availableFonts: ["STIX","TeX"] },
+  TeX: { extensions: ["color.js"] }
 });
 
 

使用

使用 \definecolor 和 \color 来为公式添加颜色如下:

\begin{align}
\definecolor{x}{RGB}{114,0,172}
\definecolor{mean}{RGB}{45,177,93}
\definecolor{var}{RGB}{251,0,29}
\definecolor{e}{RGB}{18,110,213}

f(\color{x} x,\color{mean} \mu,\color{var} \sigma \color{black} ) =
\frac{1}{\color{var}\sigma \color{black} \sqrt{2\pi}} \color{e}
e \color{black} ^{- \frac{( {\color{x}x} - {\color{mean} \mu})^2}
{2{ \color{var} \sigma}^2}}
\end{align}

MathJax 的 \definecolor 不支持 HTML 的颜色颜色空间,所以手动在它们之间转换颜色,文字部分如下:

其中, color="2DB15D"> $\mu$  是分布的均值或期望值,
而 color="FB001D"> $\sigma$  是它的标准差,
 color="FB001D"> $\sigma^2$ 则是方差.

浅谈C++11 Multithreading Programming

2014-08-25T00:00:00+08:00

Overview

上一篇浅谈 C++ Multithreading Programming主要介绍时下规范好的 C++使用 Pthread 库和 Boost Thread 库实现 C++多线程编程.这里主要谈谈正在规范的 C++11 引入的 Thread 库和 Atomic 库,终于自带的 C++库能支持高效并可移植的 Multithreading 编程.分为 2 篇,这里先谈谈 C++11 的Thread 的库 (并包含对 C 的支持), 后一篇谈谈 C++11 的Atomic 操作的库.

C++11(之前被成为 C++0x)是编程语言 C++最新版本的标准.它由 ISO 在 2011 年 8 月 12 日被批准替代C++03. C++11 标准正在规范中,从ISO 页面可以知道如何获得进行中的草稿:

所以本文:

标准内容主要参考如上的 N3690 版本的 C++11 标准.
使用的编译器是 GCC4.8,关于 GCC4.8 支持 C+11 的情况.
源代码之类主要参考cplusplus 和 cppreference.

更多有关 C++参考最后的其他资料.

Compile

GCC 编译支持 C++11,使用编译选项 -std=c++11 或 -std=gnu++11, 前者关闭 GNU 扩张支持.并加上 -pthread 选项.

g++ program.o -o program -std=c++11 -pthread

如果漏掉 -phtread 选项,编译能通过,当运行出现如下错误:

terminate called after throwing an instance of 'std::system_error'
  what():  Enable multithreading to use std::thread: Operation not permitted

Threads

概要

头文件是 , 分为两部分: thread 类和在 namespace this_thread 用来管理当前 thread 的函数.具体见之后的Header synopsis.

`thread::id` 类

thread::id 类型的对象为每个执行的线程提供唯一的标识,并为所有并不表示线程执行(默认构造的线程对象)的所有线程对象提供一个唯一的值.

thread::id 类没有特别的东西,主要提供方便比较或打印等运算符重载.

namespace std {
class thread::id {
 public:
  id() noexcept;
};
bool operator==(thread::id x, thread::id y) noexcept;
bool operator!=(thread::id x, thread::id y) noexcept;
bool operator<(thread::id x, thread::id y) noexcept;
bool operator<=(thread::id x, thread::id y) noexcept;
bool operator>(thread::id x, thread::id y) noexcept;
bool operator>=(thread::id x, thread::id y) noexcept;
template<class charT, class traits>
basic_ostream<charT, traits>&
operator<< (basic_ostream<charT, traits>& out, thread::id id);
// Hash support
template <class T> struct hash;
template <typename T> <> struct hash<thread::id>;
}

`thread` 类

namespace std {
class thread {
 public:
  // types:
  class id;
  typedef implementation-defined native_handle_type; // See 30.2.3
  // construct/copy/destroy:
  thread() noexcept;
  template <class F, class ...Args> explicit thread(F&& f, Args&&... args);
  ~thread();
  thread(const thread&) = delete;
  thread(thread&&) noexcept;
  thread& operator=(const thread&) = delete;
  thread& operator=(thread&&) noexcept;
  // members:
  void swap(thread&) noexcept;
  bool joinable() const noexcept;
  void join();
  void detach();
  id get_id() const noexcept;
  native_handle_type native_handle(); // See 30.2.3
  // static members:
  static unsigned hardware_concurrency() noexcept;
};
}

Constructs a thread object

从如上的 thread 类知道, 构造 thread 对象:

默认构造构造一个线程对象,但并不代表任何执行线程.
移动构造从其他线程构造一个 thread 对象,并设置其他线程为默认构造状态.
初始化构造创建一个新的 thread 对象并把它与执行线程相关联.复制/移动所有参数 args.. 到 thread 可访问的内存通过如下函数:

template <class T>
typename decay<T>::type decay_copy(T&& v) {
    return std::forward(v);
}

求值和复制/移动参数过程丢出的任何 exceptions 仅在当前线程丢出,不在新线程中.

复制构造复制构造被删除.线程不可被复制.

实例:

#include   // NOLINT
#include 
#include 
#include 

using std::cout;
using std::endl;

void Thread1Fun(int n) {
  for (int i = 0; i < n; ++i) {
    cout << "Thread 1 executing" << endl;
  }
}

void Thread2Fun(const int& n) {
  for (int i = 0; i < n; ++i) {
    std::cout << "Thread 2 executing\n";
  }
}

int main() {
  const int kLoops = 5;
  std::thread t1;  // t1 is not a thread
  std::thread t2(Thread1Fun, kLoops + 1);  // pass by value
  std::thread t3(Thread2Fun, std::ref(kLoops));  // pass by reference
  std::thread t4(std::move(t3));
  // t4 is now running f2(). t3 is no longer a thread
  t2.join();
  t4.join();
  return 0;
}

joinable

用来检查一个线程对象是否是正在执行的线程.若是,返回 true. 所以默认构造 thread 对象是不可 joinable.

实例:

#include   // NOLINT
#include 
#include 
using std::cout;
using std::endl;

void ThreadFun() {
  std::this_thread::sleep_for(std::chrono::seconds(1));
}

int main() {
  std::thread t;
  cout << "default construct, joinable: " << t.joinable() << endl;

  t = std::thread(ThreadFun);
  cout << "initial construct, joinable: " << t.joinable() << endl;
  t.join();
  return 0;
}

结果:

default construct, joinable: 0
initial construct, joinable: 1

`get_id`

返回 thread 对象的 std::thread::id 值. 实例:

#include   // NOLINT
#include 
#include 

using std::cout;
using std::endl;

void ThreadFun() {
  std::this_thread::sleep_for(std::chrono::seconds(1));
}

int main() {
  std::thread t1(ThreadFun);
  std::thread::id id_t1 = t1.get_id();
  cout << "thread1's id: " << id_t1 << endl;
  t1.join();
  return 0;
}

`native_handle`

这个函数是 implementation-defined. 它允许提供底层实现细节的访问.但实际使用它是 non-portable.

实例: 使用 native_handle 打开在 POSIX 系统上 C++线程的实时调度.

#include 
#include 
#include 
#include   // NOLINT
#include 
#include 
using std::cout;
using std::endl;

std::mutex iomutex;
void ThreadFun(int thread_id) {
  std::this_thread::sleep_for(std::chrono::seconds(1));
  sched_param sch;
  int policy;
  pthread_getschedparam(pthread_self(), &policy, &sch);
  std::lock_guard<std::mutex> lk(iomutex);
  cout << "Thread " << thread_id << " is executing at priority "
       << sch.sched_priority << endl;
}

int main() {
  std::thread t1(ThreadFun, 1), t2(ThreadFun, 2);
  sched_param sch;
  int policy;
  pthread_getschedparam(t1.native_handle(), &policy, &sch);
  sch.sched_priority = 20;
  if (pthread_setschedparam(t1.native_handle(), SCHED_FIFO, &sch)) {
    cout << "Failed to setschedparam: " << std::strerror(errno) << endl;
  }
  t1.join();
  t2.join();
  return 0;
}

使用 Super User,结果:

$ sudo ./test
Thread 1 is executing at priority 20
Thread 2 is executing at priority 0

`hardware_concurrency` (static)

返回硬件支持的 thread 数.这个值仅作为参考.如果这个值不可计算或没有很多的定义,那么实现返回 0.

#include   // NOLINT
#include 

int main() {
    unsigned int num = std::thread::hardware_concurrency();
    std::cout << num << " concurrent threads are supported." << std::endl;
}

swap

swap 操作用来交换 2 个线程对象的底层句柄.有 2 种可选,thread 类的成员函数和在 std 下的全局函数.

实例:

#include   // NOLINT
#include 
#include 

void Thread1Fun() {
  std::this_thread::sleep_for(std::chrono::seconds(1));
}

void Thread2Fun() {
  std::this_thread::sleep_for(std::chrono::seconds(1));
}

int main() {
    std::thread t1(Thread1Fun);
    std::thread t2(Thread2Fun);
    std::cout << "thread 1 id: " << t1.get_id() << std::endl;
    std::cout << "thread 2 id: " << t2.get_id() << std::endl;

    std::swap(t1, t2);
    std::cout << "after std::swap(t1, t2):" << std::endl;
    std::cout << "thread 1 id: " << t1.get_id() << std::endl;
    std::cout << "thread 2 id: " << t2.get_id() << std::endl;

    t1.swap(t2);
    std::cout << "after t1.swap(t2):" << std::endl;
    std::cout << "thread 1 id: " << t1.get_id() << std::endl;
    std::cout << "thread 2 id: " << t2.get_id() << std::endl;
    t1.join();
    t2.join();
    return 0;
}

管理当前 thread 的函数

在 thread 的头文件中,加了一个新的 namespace this_thread 用来包含一些管理操作当前 thread 的一些函数.

void yield();

重新调度线程的执行,让其他线程运行.具体行为依赖于实现,与 OS 的调度机制有关.

std::thread::id get_id();

返回当前线程的 thread::id 类型的对象.

template< class Rep, class Period >
void sleep_for( const std::chrono::duration<Rep, Period>& sleep_duration );

阻塞当前线程的执行至少相对时间 sleep_duration.

template< class Clock, class Duration >
void sleep_until( const std::chrono::time_point<Clock,Duration>& sleep_time );

阻塞当前线程的执行直到绝对时间 sleep_time 到达.

实例:

#include   // NOLINT
#include 
#include 
#include 
#include 
#include 
using std::cout;
using std::endl;
using std::chrono::system_clock;

std::atomic<bool> ready(false);

void Thread1Fun() {
  while (!ready) {
    std::this_thread::yield();
  }
  std::thread::id id = std::this_thread::get_id();
  cout << "thread " << id << "go to sleep" << endl;
  std::this_thread::sleep_for(std::chrono::seconds(1));
}

void Thread2Fun() {
  std::thread::id id = std::this_thread::get_id();
  cout << "thread " << id << "is running" << endl;
  ready = true;

  std::time_t tt = system_clock::to_time_t(system_clock::now());
  struct std::tm *ptm = std::localtime(&tt);
  ptm->tm_sec += 2;
  std::this_thread::sleep_until(system_clock::from_time_t(mktime(ptm)));
}

int main() {
  std::thread t1(Thread1Fun);
  std::thread t2(Thread2Fun);
  t1.join();
  t2.join();
  return 0;
}

Mutual exclusion

概要

头文件分为: mutexes,locks 和一些特殊函数. 具体见之后的Header synopsis.

Mutexes 是lockable types,用来对关键区域代码访问保护: mutex, recursive_mutex, timed_mutex, recursive_timed_mutex.
Locks 是用来管理 mutex 的对象,并对 mutex 的 lifetime 自我管理:lock_guard, unique_lock.
Functions 可以同时锁多个 mutexes(try_lock, lock),并使某个函数只被调用一次(call_once).

Lockable types

C++11 为 mutex 定义了不同类型的要求,如上图的层次,往右要求逐渐加强.

BasicLockable

BasicLockable 概念描述了最少特性类型,也就是满足(若 m 是 BasicLockable 类型 ):

m.lock()
m.unlock()

所以所有 mutex 都满足 BasicLockable 类型: mutex, recursive_mutex, timed_mutex, recursive_timed_mutex, unique_lock.

Lockable

Lockable 概念扩展了 BasicLockable 概念,并支持 try_lock.

所以这些 mutex 满足 Lockable 类型: mutex, recursive_mutex, timed_mutex, recursive_timed_mutex.

TimedLockable

TimedLockable 概念扩展了 Lockable 概念,并支持 try_lock_for 和 try_lock_until.

所以这些 mutex 满足 TimedLockable 类型: timed_mutex, recursive_timed_mutex.

`mutex` 类

mutex 类提供了一个不可递归的排它锁.基本接口可以从如下类中参考.

namespace std {
class mutex {
 public:
  constexpr mutex() noexcept;
  ~mutex();
  mutex(const mutex&) = delete;
  mutex& operator=(const mutex&) = delete;
  void lock();
  bool try_lock();
  void unlock();
  typedef implementation-defined native_handle_type; // See 30.2.3
  native_handle_type native_handle(); // See 30.2.3
};
}

实例:

#include   // NOLINT
#include 
#include 
#include 
using std::cout;
using std::endl;
using std::vector;

int g_value = 0;
std::mutex count_mutex;

void Increase() {
  const int kLoops = 100;
  for (int i = 0; i < kLoops; ++i) {
    count_mutex.lock();
    g_value++;
    count_mutex.unlock();
  }
}

int main(int argc, char *argv[]) {
  const int kNumThreads = 5;
  vector<std::thread> threads;
  for (int i = 0; i < kNumThreads; ++i) {
    threads.push_back(std::thread(Increase));
  }
  for (auto &thread : threads) {
    thread.join();
  }
  cout << "value = " << g_value << endl;
  return 0;
}

`recursive_mutex` 类

可递归的排它锁.如下基本接口如 mutex 基本一样.

namespace std {
class recursive_mutex {
 public:
  recursive_mutex();
  ~recursive_mutex();
  recursive_mutex(const recursive_mutex&) = delete;
  recursive_mutex& operator=(const recursive_mutex&) = delete;
  void lock();
  bool try_lock() noexcept;
  void unlock();
  typedef implementation-defined native_handle_type; // See 30.2.3
  native_handle_type native_handle(); // See 30.2.3
};
}

`timed_mutex` 类

namespace std {
class timed_mutex {
 public:
  timed_mutex();
  ~timed_mutex();
  timed_mutex(const timed_mutex&) = delete;
  timed_mutex& operator=(const timed_mutex&) = delete;
  void lock();
  bool try_lock();
  template <class Rep, class Period>
  bool try_lock_for(const chrono::duration<Rep, Period>& rel_time);
  template <class Clock, class Duration>
  bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time);
  void unlock();
  typedef implementation-defined native_handle_type; // See 30.2.3
  native_handle_type native_handle(); // See 30.2.3
};
}

`recursive_timed_mutex` 类

namespace std {
class recursive_timed_mutex {
 public:
  recursive_timed_mutex();
  ~recursive_timed_mutex();
  recursive_timed_mutex(const recursive_timed_mutex&) = delete;
  recursive_timed_mutex& operator=(const recursive_timed_mutex&) = delete;
  void lock();
  bool try_lock() noexcept;
  template <class Rep, class Period>
  bool try_lock_for(const chrono::duration<Rep, Period>& rel_time);
  template <class Clock, class Duration>
  bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time);
  void unlock();
  typedef implementation-defined native_handle_type; // See 30.2.3
  native_handle_type native_handle(); // See 30.2.3
};
}

Mutex Exception safety

基本保证: 当 exception 被以上 mutex 的成员函数抛出时,这些 mutex 对象保持有效状态. 如果是 lock 操作被 exception, lock 不会被抛出 exception 的线程所拥有.

抛出的是一个 system_error exception, 导致的基本情况是:

exception 类型	error 情况	描述
`system_error`	`errc::resource_deadlock_would_occur`	deadlock 被检测到
`system_error`	`errc::operation_not_permitted`	线程没有权利做这个操作
`system_error`	`errc::device_or_resource_busy`	native handle 已经被锁

`lock_guard` 类

之前的 mutex 必须写明 lock 和 unlock 调用,如果在 lock 和 unlock 之间产生 exception,那么必须在 exception 处理中不能忘记处理 unlock.当只是在一个关键区域内需要 mutex 保护,使用这样的 mutex 既不方便也容易忘记 unlock 而造成死锁.

引入对之前的 mutex 的封装后的 lock_guard 和 unique_lock ,提供易用性的 RAII-style 机制来获取锁在一段区域内.

lock guard 是一个用来管理一个 mutex 对象,并保持锁住它的对象.

在构造时,mutex 对象被调用的线程锁住,然后在析构时,mutex 被解锁.它是最简单的 lock,并且作为自动作用范围直到它的作用区域结束时特别有用.通过这种方法,它保证 mutex 对象得到解锁即使在 exception 被抛出时.

namespace std {
template <class Mutex>
class lock_guard {
 public:
  typedef Mutex mutex_type;
  explicit lock_guard(mutex_type& m);
  lock_guard(mutex_type& m, adopt_lock_t);
  ~lock_guard();
  lock_guard(lock_guard const&) = delete;
  lock_guard& operator=(lock_guard const&) = delete;
 private:
  mutex_type& pm; // exposition only
};
}

实例:

#include   // NOLINT
#include 
#include 
#include 

std::mutex mtx;

void PrintEven(int x) {
  if (x % 2 == 0) {
    std::cout << x << " is even\n";
  } else {
    throw(std::logic_error("not even"));
  }
}

void PrintThreadEvenId(int id) {
  try {
    std::lock_guard<std::mutex> lck(mtx);
    PrintEven(id);
  } catch (std::logic_error&) {
    std::cout << "[exception caught]" << std::endl;
  }
}

int main() {
  std::thread threads[10];
  for (int i = 0; i < 10; ++i) {
    threads[i] = std::thread(PrintThreadEvenId, i+1);
  }
  for (auto& th : threads) {
    th.join();
  }
  return 0;
}

`unique_lock` 类

unique_lock 与上面的 lock_guard 基本差不多,同样是 RAII-style 机制来获取锁在一段区域内的对象.

但 lock_guard 非常简单,只提供构造自动拥有锁和析构释放锁,如果需要一些其他的操作,那么就需要更复杂和接口更多的类来处理, lock_guard 能满足如此要求. 它类基本接口如下.

class

namespace std {
template <class Mutex>
class unique_lock {
 public:
  typedef Mutex mutex_type;
  // 30.4.2.2.1, construct/copy/destroy:
  unique_lock() noexcept;
  explicit unique_lock(mutex_type& m);
  unique_lock(mutex_type& m, defer_lock_t) noexcept;
  unique_lock(mutex_type& m, try_to_lock_t);
  unique_lock(mutex_type& m, adopt_lock_t);
  template <class Clock, class Duration>
  unique_lock(mutex_type& m, const chrono::time_point<Clock, Duration>& abs_time);
  template <class Rep, class Period>
  unique_lock(mutex_type& m, const chrono::duration<Rep, Period>& rel_time);
  ~unique_lock();
  unique_lock(unique_lock const&) = delete;
  unique_lock& operator=(unique_lock const&) = delete;
  unique_lock(unique_lock&& u) noexcept;
  unique_lock& operator=(unique_lock&& u) noexcept;
  // 30.4.2.2.2, locking:
  void lock();
  bool try_lock();
  template <class Rep, class Period>
  bool try_lock_for(const chrono::duration<Rep, Period>& rel_time);
  template <class Clock, class Duration>
  bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time);
  void unlock();
  // 30.4.2.2.3, modifiers:
  void swap(unique_lock& u) noexcept;
  mutex_type *release() noexcept;
  // 30.4.2.2.4, observers:
  bool owns_lock() const noexcept;
  explicit operator bool () const noexcept;
  mutex_type* mutex() const noexcept;
 private:
  mutex_type *pm; // exposition only
  bool owns; // exposition only
};
template <class Mutex>
void swap(unique_lock<Mutex>& x, unique_lock<Mutex>& y) noexcept;
}

Constructor

在mutex header 概要中可以看到有不同的构造函数,其中一类 unique_lock 构造传入不同的类型:

defer_lock : 不去获取 mutex,只有要和 mutex 一样,手动去 lock 它.
try_to_lock : 相当于在构造时,调用 try_lock, 不阻塞,之后可通过成员函数 bool owns_lock() 或直接操作符 explicit operator bool() const 判断是否获取锁成功.
adopt_lock_t : 认为调用的线程已经占有这个锁 m.已经占有这个锁了,为什么要去创建一个 unique_lock 去包含它呢? 因为可以利用 unique_lock 中途接手管理这个锁 m, 比如想用 RAII-style 机制管理它,使它 exception safe 等.

这些类型在源代码定义基本如下:

struct defer_lock_t { };
struct try_to_lock_t { };
struct adopt_lock_t { };
constexpr std::defer_lock_t defer_lock = std::defer_lock_t();
constexpr std::try_to_lock_t try_to_lock = std::try_to_lock_t();
constexpr std::adopt_lock_t adopt_lock = std::adopt_lock_t();

余下的构造:

unique_lock(); :仅仅创建一个 nique_lock 对象,不和任何 mutex 相关联.
nique_lock(unique_lock&& other); : 通过 other 的内容来构造 nique_lock 对像,使得 other 不和任何 mutex 相关连联.
explicit unique_lock(mutex_type& m); : 通过 m.lock() 来构造与 m 相关联的 unique_lock 对象.
unique_lock(mutex_type& m, const std::chrono::duration& timeout_duration); : 通过 m.try_lock_for(timeout_duration) 来构造与 m 相关联的 unique_lock 对象.
unique_lock( mutex_type& m, const std::chrono::time_point& timeout_time); : 通过 m.try_lock_until(timeout_time) 来构造与 m 相关联的 unique_lock 对象.

实例

利用 defer_lock, 不去获取 mutex, 只创建与它相关联的 unique_lock 对象,之后用 lock() 同时去获取两个锁,防止死锁.

#include   // NOLINT
#include 
#include 
#include 
using std::cout;
using std::endl;

struct Box {
  explicit Box(int num) : num_things{num} {}
  int num_things;
  std::mutex m;
};

void Transfer(Box *from, Box *to, int num) {
  // don't actually take the locks yet
  std::unique_lock<std::mutex> lock1(from->m, std::defer_lock);
  std::unique_lock<std::mutex> lock2(to->m, std::defer_lock);
  // lock both unique_locks without deadlock
  std::lock(lock1, lock2);
  from->num_things -= num;
  to->num_things += num;
  // 'from.m' and 'to.m' mutexes unlocked in 'unique_lock' dtors
}

int main() {
  Box acc1(100);
  Box acc2(50);
  cout << "acc1 num = " << acc1.num_things <<
      " ,acc2 num = " << acc2.num_things << endl;
  std::thread t1(Transfer, &acc1, &acc2, 10);
  std::thread t2(Transfer, &acc2, &acc1, 5);
  t1.join();
  t2.join();
  cout << "after transfer: " << "acc1 num = " << acc1.num_things <<
      " ,acc2 num = " << acc2.num_things << endl;
  return 0;
}

`lock_guard` VS `unique_lock`

lock_guard 和 unique_lock 很大程序上很相似,都是 RAII-style 机制来封装一个 mutex 的锁, lock_guard 可以说是 unique_lock 更严格并拥有限制的接口的版本.

如何合适的选择两者的使用呢? 如果 lock_guard 对于情况 A 足够,那么就使用它. 不仅仅是从效率(efficiency)考虑,更是从想要表达的功能(functionality) 考虑. 使用 lock_guard 不仅避免了不需要的其他接口的开销,更是对读代码者表达它的意图,你将永远都不需要解锁这个 guard.

所以你先考虑使用 lock_guard, 除非你需要 unique_lock 的功能. 比如 condition_variable 就需要传入一个 unique_lock 对象.

`try_lock` 和 `lock`

template< class Lockable1, class Lockable2, class LockableN... >
int try_lock(Lockable1& lock1, Lockable2& lock2, LockableN& lockn... );

按对象 lock1, lock2, …, lockn 从头到尾的顺序尝试去获取每个锁. 如果某个 try_lock 失败, unlock 所有对象并返回. 返回值:

成功: -1.
失败: 以 0 为起始点的获取锁失败的对象次序数(0 对于 lock1, 1 对于 lock2, ..).

template< class Lockable1, class Lockable2, class LockableN... >
void lock( Lockable1& lock1, Lockable2& lock2, LockableN& lockn... );

占有传入的锁 lock1, lock2, …, lockn,使用 防止死锁算饭 来防止死锁.

对于传入对象按照不特定的顺序调用它们的成员函数 lock , try_lock, unlock ,确保最后所有的锁被获取成功在函数返回时.

`call_once`

class once_flag;
template< class Callable, class... Args >
void call_once( std::once_flag& flag, Callable&& f, Args&&... args );

为了让一段代码只被多个线程只执行一次, mutex 文件中中包含了这个保证只调用一次的接口.

once_flag 对象是辅助 call_once 的,作为多个线程共同执行这段的标识, 所以这些个线程必须传入同一个 once_flag 对象.

它并对 exception 做一定的处理,如果 call_once 执行的函数以 exception 退出,那么 exception 会抛给调用者.这次已 exception 退出的执行并不算一次,之后其他函数仍可以继续调用它一次.

如下的实例, t1 和 t2 线程抛出 exception, t3 仍然运行一次, t4 无论是怎样,都得不到运行.

#include   // NOLINT
#include 
#include 
using std::cout;
using std::endl;

std::once_flag flag;

inline void MayThrowFunction(bool do_throw) {
  // only one instance of this function can be run simultaneously
  if (do_throw) {
    cout << "throw" << endl;  // this message may be printed from 0 to 3 times
    // if function exits via exception, another function selected
    throw std::exception();
  }
  cout << "once" << endl;  // printed exactly once, it's guaranteed that
  // there are no messages after it
}

inline void DoOnce(bool do_throw) {
  try {
    std::call_once(flag, MayThrowFunction, do_throw);
  }
  catch (...) {
  }
}

int main() {
  std::thread t1(DoOnce, true);
  std::thread t2(DoOnce, true);
  std::thread t3(DoOnce, false);
  std::thread t4(DoOnce, true);
  t1.join();
  t2.join();
  t3.join();
  t4.join();
  return 0;
}

Condition variables

概要

头文件主要包含两个 condition_variable 类, 一个全局函数.

namespace std {
class condition_variable;
class condition_variable_any;
void notify_all_at_thread_exit(condition_variable& cond, unique_lock lk);
enum class cv_status { no_timeout, timeout };
}

`cv_status`

Condition variables 与 mutex 之类在等待 timeout 时,返回的不一样,mutex 之类放回 bool 类型, 而 Condition variables 特意为它定义了 enum 类型: no_timeout 和 timeout, 来判断等待是否成功.

enum class cv_status { no_timeout, timeout };

cv_status::no_timeout The function returned without a timeout (i.e., it was notified).
cv_status::timeout The function returned because it reached its time limit (timeout).

`notify_all_at_thread_exit`

void notify_all_at_thread_exit(std::condition_variable& cond,
                                std::unique_lock<std::mutex> lk);

头文件中有这个函数,它提供机制 notify 其他线程在调用这个函数的线程退出时. 它相当于操作(并包括清理所有 thread_local 对象):

lk.unlock();
cond.notify_all();

虽然可以在调用线程的最后同样调用如上两句代码,但意图没有表现出来,表明 cond 的 notify 必须在线程退出时调用,后面维护者可能会在这之后继续添加代码. notify_all_at_thread_exit 用一句调用替代两个调用,既不用在函数最后去调用它,而且表明它的意图.

它的操作流程如下:

之前获取的锁 lk 的拥有权被转移到 cond 的内部.
当此线程退出时, cond 被 notified 通过:

lk.unlock();
cond.notify_all();

Notes

如果 lk.mutex() 没有被当前线程锁住,调用此函数导致 undefined behavior.
如果 lk.mutex() 的 mutex 不是其他线程使用来等待 condition variable 的同一个的话, 调用此函数导致 undefined behavior.

`condition_variable` 类

namespace std {
class condition_variable {
 public:
  condition_variable();
  ~condition_variable();
  condition_variable(const condition_variable&) = delete;
  condition_variable& operator=(const condition_variable&) = delete;
  void notify_one() noexcept;
  void notify_all() noexcept;
  void wait(unique_lock& lock);
  template <class Predicate>
  void wait(unique_lock& lock, Predicate pred);
  template <class Clock, class Duration>
  cv_status wait_until(unique_lock& lock,
                       const chrono::time_point<Clock, Duration>& abs_time);
  template <class Clock, class Duration, class Predicate>
  bool wait_until(unique_lock& lock,
                  const chrono::time_point<Clock, Duration>& abs_time,
                  Predicate pred);
  template <class Rep, class Period>
  cv_status wait_for(unique_lock& lock,
                     const chrono::duration<Rep, Period>& rel_time);
  template <class Rep, class Period, class Predicate>
  bool wait_for(unique_lock& lock,
                const chrono::duration<Rep, Period>& rel_time,
                Predicate pred);
  typedef implementation-defined native_handle_type; // See 30.2.3
  native_handle_type native_handle(); // See 30.2.3
};
}

Condition Variable 的基本概念可以从之前篇浅谈 C++ Multithreading Programming中获取.

condition_variable 类的 void wait(unique_lock& lock, Predicate pred); 接口:

需要传入 unique_lock.
pred 函数, 如果 predicate 返回 false ,等待. 相当于:

while (!pred()) {
    wait(lock);
}

实例:

#include   // NOLINT
#include 
#include 
#include 
#include 
using std::string;
using std::cout;
using std::endl;

std::mutex m;
std::condition_variable cv;
string data;
bool g_ready = false;
bool g_processed = false;

void WorkerThread() {
    // Wait until main() sends data
    std::unique_lock<std::mutex> lk(m);
    cv.wait(lk, []{return g_ready;});

    // after the wait, we own the lock.
    cout << "Worker thread is processing data" << endl;
    data += " after processing";

    // Send data back to main()
    g_processed = true;
    cout << "Worker thread signals data processing completed" << endl;

    // Manual unlocking is done before notifying, to avoid
    // that the waiting thread gets blocked again.
    lk.unlock();
    cv.notify_one();
}

int main() {
    std::thread worker(WorkerThread);
     data = "Example data";
    // send data to the worker thread
    {
        std::lock_guard<std::mutex> lk(m);
        g_ready = true;
        cout << "main() signals data ready for processing" << endl;
    }
    cv.notify_one();

    // wait for the worker
    {
        std::unique_lock<std::mutex> lk(m);
        cv.wait(lk, []{return g_processed;});
    }
    cout << "Back in main(), data = " << data << '\n';
    worker.join();
    return 0;
}

`condition_variable_any` 类

namespace std {
class condition_variable_any {
 public:
  condition_variable_any();
  ~condition_variable_any();
  condition_variable_any(const condition_variable_any&) = delete;
  condition_variable_any& operator=(const condition_variable_any&) = delete;
  void notify_one() noexcept;
  void notify_all() noexcept;
  template <class Lock>
  void wait(Lock& lock);
  template <class Lock, class Predicate>
  void wait(Lock& lock, Predicate pred);
  template <class Lock, class Clock, class Duration>
  cv_status wait_until(Lock& lock, const chrono::time_point<Clock, Duration>& abs_time);
  template <class Lock, class Clock, class Duration, class Predicate>
  bool wait_until(Lock& lock, const chrono::time_point<Clock, Duration>& abs_time,
                  Predicate pred);
  template <class Lock, class Rep, class Period>
  cv_status wait_for(Lock& lock, const chrono::duration<Rep, Period>& rel_time);
  template <class Lock, class Rep, class Period, class Predicate>
  bool wait_for(Lock& lock, const chrono::duration<Rep, Period>& rel_time,
                Predicate pred);
};
}

condition_variable_any 是 condition_variable 的一个通用版,它可以等待任何满足 BasicLockable 要求 Lock 类型的对象.其他与 condition_variable 一样.

实例:

#include   // NOLINT
#include 
#include 
#include 
#include 
using std::cout;
using std::endl;

std::condition_variable_any cv;
std::mutex cv_m;  // This mutex is used for three purposes:
                  // 1) to synchronize accesses to i
                  // 2) to synchronize accesses to std::cout
                  // 3) for the condition variable cv
int g_wait_val = 0;

void WaitVal(int id) {
  std::unique_lock<std::mutex> lk(cv_m);
  cout << "thread " << id << " Waiting... " << endl;
  cv.wait(lk, []{return g_wait_val == 1;});
  cout << "...finished waiting," << "thread " << id << endl;
}

void Signals() {
  std::this_thread::sleep_for(std::chrono::seconds(1));
  {
    std::lock_guard<std::mutex> lk(cv_m);
    cout << "Notifying..." << endl;
  }
  cv.notify_all();
  std::this_thread::sleep_for(std::chrono::seconds(1));
  {
    std::lock_guard<std::mutex> lk(cv_m);
    g_wait_val = 1;
    cout << "Notifying again..." << endl;
  }
  cv.notify_all();
}

int main() {
  std::vector<std::thread> threads;
  for (int i = 0; i < 3; ++i) {
    threads.emplace_back(WaitVal, i);
  }
  threads.emplace_back(Signals);
  for (auto& t : threads) {
        t.join();
  }
  return 0;
}

`condition_variable` VS `condition_variable_any`

引自 N3690 §30.5[thread.condition]:

Class condition_variable provides a condition variable that can only wait on an object of type unique_lock , allowing maximum efficiency on some platforms. Class condition_variable_any provides a general condition variable that can wait on objects of user-supplied lock types.

condition_variable 只与 unique_lock 类型对象关联,在某些平台上,它可以更好的得到特定的优化,如果不需要 condition_variable_any 的灵活性, 选更高效的 condition_variable 对象使用.

Future

概要

如果要异步的获取一个函数的运行结果, 可以创建一个线程,并利用 Condition varialbes 来同步线程间使得另外线程正确获取到这个结果. 但 C++11 的 future 库使得这一过程更方便, 它提供接口使程序在一个线程中获取一个在同一个或其他线程中运行的函数的结果(值或异常), (这些类使用并不限制在 multi-threaded 程序中,同样可以在 single-threaded 使用.

future 的概要主要分为:

运行函数提供共享结果的 Providers 类: promise 和 packaged_task .
获取共享结果的 Futures 类: future 和 shared_future .
Error handling: future_error , future_errc 等.
Providers 提供函数: async .

Error handling

`future_error` 类

future_error 类定义对 future 对象非法操作抛出异常的对象类型. 也就是专门为 future 库中接口出现异常提供特定的异常类.

从上图类图可知,这个类继承自 logic_error , 并添加获取 error_code 的成员函数 code , 获取 exception 信息的 what 成员函数.

namespace std {
class future_error : public logic_error {
 public:
  future_error(error_code ec); // exposition only
  const error_code& code() const noexcept;
  const char* what() const noexcept;
};
}
const error_code& code() const noexcept;

实例:

#include 
#include   // NOLINT

int main() {
  std::future<int> empty;
  try {
    int n = empty.get();
  } catch (const std::future_error& e) {
    std::cout << "Caught a future_error with code \"" << e.code()
              << "\"\nMessage: \"" << e.what() << "\"\n";
  }
}

`future_errc`

enum class future_errc {
    broken_promise             = /* implementation-defined */,
    future_already_retrieved   = /* implementation-defined */,
    promise_already_satisfied  = /* implementation-defined */,
    no_state                   = /* implementation-defined */
};

这个 enum class 定义了 future 抛出异常的error condition. future_errc 的值可以用来创建 error_condition 对象, 并与 future_error 的成员函数 code 返回的值对比, 决定所抛出异常的类型.

所以另外有两个函数提供它们之间的转换:

std::error_code make_error_code( std::future_errc e );
std::error_condition make_error_condition( std::future_errc e );
template<>
struct is_error_condition_enum<std::future_errc> : std::true_type;

实例:

#include   // NOLINT
#include 

int main() {
  std::promise<int> prom;

  try {
    prom.get_future();
    prom.get_future();
    // throws std::future_error with future_already_retrieved
  }
  catch (std::future_error& e) {
    if (e.code() ==
        std::make_error_condition(std::future_errc::future_already_retrieved)) {
      std::cerr << "[future already retrieved]\n";
    } else {
      std::cerr << "[unknown exception]\n";
    }
  }
  return 0;
}

`future_status`

enum class future_status {
    ready,
    timeout,
    deferred
};

future 和 shared_future 类中属于 wait 类型的接口返回的状态.

deferred: 返回这个类型是因为共享状态(shared state)含有的一个 deferred 函数.(见async 函数)

`future_category`

用来识别 future error 种类.

const std::error_category& future_category();

这个函数返回一个 error_category 类型的静态对象,拥有如下特性:

它的 name 成员函数返回指向字符串”future”的指针.

实例:

#include   // NOLINT
#include 

int main() {
  std::promise<int> prom;
  try {
    prom.get_future();
    prom.get_future();
    // throws a std::future_error of the future category
  }
  catch (std::future_error& e) {
    if (e.code().category() == std::future_category()) {
      std::cerr << "future_error of the future category thrown\n";
    }
  }
  return 0;
}

`template promise`

模版类 promise 提供一种方便的方法存储一个值或异常,之后可以异步的被 future 对象获取(同一个或其他线程).

promise 对象在共享状态(shared state)存储值的操作 synchronizes-with 在其他函数中成功获取这个共享状态的返回值(如 future::get ).

class

namespace std {
template <class R>
class promise {
 public:
  promise();
  template <class Allocator>
  promise(allocator_arg_t, const Allocator& a);
  promise(promise&& rhs) noexcept;
  promise(const promise& rhs) = delete;
  ~promise();
  // assignment
  promise& operator=(promise&& rhs) noexcept;
  promise& operator=(const promise& rhs) = delete;
  void swap(promise& other) noexcept;
  // retrieving the result
  future<R> get_future();
  // setting the result
  void set_value(see below );
  void set_exception(exception_ptr p);
  // setting the result with deferred notification
  void set_value_at_thread_exit(const R& r);
  void set_value_at_thread_exit(see below );
  void set_exception_at_thread_exit(exception_ptr p);
};
template <class R>
void swap(promise<R>& x, promise<R>& y) noexcept;
template <class R, class Alloc>
struct uses_allocator<promise<R>, Alloc>;
}

`set_value` and `set_value_at_thread_exit`

set_value 接口存储值到 shared state,并使 state 准备好.这个操作是原子性的. 而 set_value_at_thread_exit 接口如名字,调用后不会马上设置值到 shared state 中,只在当前函数退出时.

使用 get_future 返回与它相关联同一个 shared state 的 future 对象.

实例:

#include   // NOLINT
#include 
#include 
#include 

void Print(std::future<int>& fut) {
  // (synchronizes with getting the future)
  int x = fut.get();
  std::cout << "value: " << x << std::endl;
}

int main() {
  std::promise<int> prom;
  std::future<int> fut = prom.get_future();
  std::thread t1(Print, std::ref(fut));
  prom.set_value(10);  // fulfill promise
  t1.join();
  return 0;
}

`set_exception` and `set_exception_at_thread_exit`

这两个接口与上面 set_value 和 set_value_at_thread_exit 一样, 只是保存的是 exception.

实例:

#include   // NOLINT
#include 
#include 

int main() {
  std::promise<int> result;

  std::thread t([&]{
      try {
        throw std::runtime_error("Example");
      } catch(...) {
        try {
          // store anything thrown in the promise
          result.set_exception(std::current_exception());
        } catch(...) {}  // set_exception() may throw too
      }
    });

  try {
    std::cout << result.get_future().get();
  } catch(const std::exception& e) {
    std::cout << "Exception from the thread: " << e.what() << std::endl;
  }
  t.join();
  return 0;
}

`template packaged_task`

packaged_task 与 promise 类似,都是提供异步获取值的方法,不同是 promise 直接设置值, 而 packaged_task 封装一个可调用的元素,并把这个可调用任务的返回值异步到 shared state 中.

class

namespace std {
template<class> class packaged_task; // undefined
template<class R, class... ArgTypes>
class packaged_taskArgTypes...)> {
 public:
  // construction and destruction
  packaged_task() noexcept;
  template <class F>
  explicit packaged_task(F&& f);
  template <class F, class Allocator>
  explicit packaged_task(allocator_arg_t, const Allocator& a, F&& f);
  ~packaged_task();
  // no copy
  packaged_task(const packaged_task&) = delete;
  packaged_task& operator=(const packaged_task&) = delete;
  // move support
  packaged_task(packaged_task&& rhs) noexcept;
  packaged_task& operator=(packaged_task&& rhs) noexcept;
  void swap(packaged_task& other) noexcept;
  bool valid() const noexcept;
  // result retrieval
  future<R> get_future();
  // execution
  void operator()(ArgTypes... );
  void make_ready_at_thread_exit(ArgTypes...);
  void reset();
};
template <class R, class... ArgTypes>
void swap(packaged_task& x, packaged_task& y) noexcept;
template <class R, class Alloc>
struct uses_allocator<packaged_task<R>, Alloc>;
}

construct and use

packaged_task 的创建与 thread 类似, 它可以:

Lambda 表达式.
Bind 一个函数.
直接传入函数.

运行:

因为它重载了操作符 () , 可以直接运行如: task() .
可以 move 给一个线程运行.

实例:

#include   // NOLINT
#include 
#include 
#include 
#include 

// unique function to avoid disambiguating the std::pow overload set
int FunPow(int x, int y) {
  return std::pow(x, y);
}

void TaskLambda() {
  std::packaged_task<int(int, int)> task([](int a, int b) {
      return std::pow(a, b);
    });
  std::future<int> result = task.get_future();
  task(2, 9);
  std::cout << "task_lambda:\t" << result.get() << '\n';
}

void TaskBind() {
  std::packaged_task<int()> task(std::bind(FunPow, 2, 11));
  std::future<int> result = task.get_future();
  task();
  std::cout << "task_bind:\t" << result.get() << '\n';
}

void TaskThread() {
  std::packaged_task<int(int, int)> task(FunPow);
  std::future<int> result = task.get_future();
  std::thread task_td(std::move(task), 2, 10);
  task_td.join();
  std::cout << "task_thread:\t" << result.get() << '\n';
}

int main() {
  TaskLambda();
  TaskBind();
  TaskThread();
}

reset

packaged_task 的 reset 接口, 重置状态,舍弃之前运行的结果.相当于: *this = packaged_task(std::move(f)) .

实例:

#include   // NOLINT
#include 
#include 
#include 

int main() {
    std::packaged_task<int(int, int)> task([](int a, int b) {
        return std::pow(a, b);
    });
    std::future<int> result = task.get_future();
    task(2, 9);
    std::cout << "2^9 = " << result.get() << '\n';

    task.reset();
    result = task.get_future();
    std::thread task_td(std::move(task), 2, 10);
    task_td.join();
    std::cout << "2^10 = " << result.get() << '\n';
}

`template future` 类

模版类 future 是用来异步获取共享状态里的结果. future 类是独占的,不能与其他 future 共享异步的获取结果. 若要多个 future 共享异步结果, 使用之后的 shared_future 类.

有效的与共享状态相关联的 future 对象,由如下函数构造:

async .
promise::get_future .
package_task::get_future .

它的接口:

share : 转换 shared state 从 *this 到一个 shared_future 对象.
get : 返回 shared state 的值, 若未准备好,调用者阻塞等待它准备好.
wait : 阻塞等待结果直到有效.
wait_for 和 wait_until : 等待一段时间, 并通过 future_status 判断等待后的状态.

namespace std {
template <class R>
class future {
 public:
  future() noexcept;
  future(future &&) noexcept;
  future(const future& rhs) = delete;
  ~future();
  future& operator=(const future& rhs) = delete;
  future& operator=(future&&) noexcept;
  shared_future<R> share();
  // retrieving the value
  see below get();
  // functions to check state
  bool valid() const noexcept;
  void wait() const;
  template <class Rep, class Period>
  future_status wait_for(const chrono::duration<Rep, Period>& rel_time) const;
  template <class Clock, class Duration>
  future_status wait_until(const chrono::time_point<Clock, Duration>& abs_time) const;
};
}

`template shared_future` 类

模版类 shared_future 与 future 基本一样, 不同就是多个 shared_future 对象可以共享异步结果.

namespace std {
template <class R>
class shared_future {
 public:
  shared_future() noexcept;
  shared_future(const shared_future& rhs);
  shared_future(future<R>&&) noexcept;
  shared_future(shared_future&& rhs) noexcept;
  ~shared_future();
  shared_future& operator=(const shared_future& rhs);
  shared_future& operator=(shared_future&& rhs) noexcept;
  // retrieving the value
  see below get() const;
  // functions to check state
  bool valid() const noexcept;
  void wait() const;
  template <class Rep, class Period>
  future_status wait_for(const chrono::duration<Rep, Period>& rel_time) const;
  template <class Clock, class Duration>
  future_status wait_until(const chrono::time_point<Clock, Duration>& abs_time) const;
};
}

实例:

#include   // NOLINT
#include 
#include 

int main() {
  std::promise<void> ready_promise, t1_ready_promise, t2_ready_promise;
  std::shared_future<void> ready_future(ready_promise.get_future());
  std::chrono::time_point<std::chrono::high_resolution_clock> start;

  auto fun1 = [&]() -> std::chrono::duration<double, std::milli> {
    t1_ready_promise.set_value();
    ready_future.wait();  // waits for the signal from main()
    return std::chrono::high_resolution_clock::now() - start;
  };

  auto fun2 = [&]() -> std::chrono::duration<double, std::milli> {
    t2_ready_promise.set_value();
    ready_future.wait();  // waits for the signal from main()
    return std::chrono::high_resolution_clock::now() - start;
  };

  auto result1 = std::async(std::launch::async, fun1);
  auto result2 = std::async(std::launch::async, fun2);

  // wait for the threads to become ready
  t1_ready_promise.get_future().wait();
  t2_ready_promise.get_future().wait();

  // the threads are ready, start the clock
  start = std::chrono::high_resolution_clock::now();

  // signal the threads to go
  ready_promise.set_value();

  std::cout << "Thread 1 received the signal "
            << result1.get().count() << " ms after start\n"
            << "Thread 2 received the signal "
            << result2.get().count() << " ms after start\n";
  return 0;
}

template async 函数

模版函数 asnyc 异步运行函数 f,并返回一个 future 对象来获取这个函数调用的结果.

Launching policy for async

enum class launch : /* unspecified */ {
    async =    /* unspecified */,
    deferred = /* unspecified */,
    /* implementation-defined */
};

函数 async 有不同的策略来运行函数:

launch::async :创建一个新的线程来调用函数ｆ.
launch::deferred :调用函数 f 延迟(deferred)到返回的 future 的 shared state 被访问时(wait 或 get).
launch::async|launch::deferred :函数自动选择策略运行.与系统的库实现有关.

async

template <class F, class... Args>
future<typename result_of<typename decay<F>::type(typename decay::type...)>::type>
async(F&& f, Args&&... args);
template <class F, class... Args>
future<typename result_of<typename decay<F>::type(typename decay::type...)>::type>
async(launch policy, F&& f, Args&&... args);

第一个接口没有 policy 作为传入参数, 相当于 async(std::launch::async | std::launch::deferred, f, args...)

实例:

#include   // NOLINT
#include 
#include 
#include 
#include 

template <typename RAIter>
int ParallelSum(RAIter beg, RAIter end) {
  auto len = std::distance(beg, end);
  if (len < 1000)
    return std::accumulate(beg, end, 0);

  RAIter mid = beg + len/2;
  auto handle = std::async(std::launch::async,
                           ParallelSum<RAIter>, mid, end);
  int sum = ParallelSum(beg, mid);
  return sum + handle.get();
}

int main() {
    std::vector<int> v(10000, 1);
    std::cout << "The sum is " << ParallelSum(v.begin(), v.end()) << '\n';
}

Header synopsis

基本概要如下(§30.3 [thread.threads] of N3690):

// Header  synopsis
namespace std {
class thread;
void swap(thread& x, thread& y) noexcept;
namespace this_thread {
thread::id get_id() noexcept;
void yield() noexcept;
template <class Clock, class Duration>
void sleep_until(const chrono::time_point<Clock, Duration>& abs_time);
template <class Rep, class Period>
void sleep_for(const chrono::duration<Rep, Period>& rel_time);
}
}

// Header  synopsis
namespace std {
class mutex;
class recursive_mutex;
class timed_mutex;
class recursive_timed_mutex;
struct defer_lock_t { };
struct try_to_lock_t { };
struct adopt_lock_t { };
constexpr defer_lock_t defer_lock { };
constexpr try_to_lock_t try_to_lock { };
constexpr adopt_lock_t adopt_lock { };
template <class Mutex> class lock_guard;
template <class Mutex> class unique_lock;
template <class Mutex>
void swap(unique_lock<Mutex>& x, unique_lock<Mutex>& y) noexcept;
template <class L1, class L2, class... L3> int try_lock(L1&, L2&, L3&...);
template <class L1, class L2, class... L3> void lock(L1&, L2&, L3&...);
struct once_flag {
  constexpr once_flag() noexcept;
  once_flag(const once_flag&) = delete;
  once_flag& operator=(const once_flag&) = delete;
};
template<class Callable, class ...Args>
void call_once(once_flag& flag, Callable func, Args&&... args);
}

namespace std {
enum class future_errc {
broken_promise = implementation-defined ,
future_already_retrieved = implementation-defined ,
promise_already_satisfied = implementation-defined ,
no_state = implementation-defined
};
enum class launch : unspecified {
async = unspecified ,
deferred = unspecified ,
implementation-defined
};
enum class future_status {
ready,
timeout,
deferred
};
template <> struct is_error_code_enum<future_errc> : public true_type { };
error_code make_error_code(future_errc e) noexcept;
error_condition make_error_condition(future_errc e) noexcept;
const error_category& future_category() noexcept;
class future_error;
template <class R> class promise;
template <class R> class promise<R&>;
template <> class promise<void>;
template <class R>
void swap(promise<R>& x, promise<R>& y) noexcept;
template <class R, class Alloc>
struct uses_allocator<promise<R>, Alloc>;
template <class R> class future;
template <class R> class future<R&>;
template <> class future<void>;
template <class R> class shared_future;
template <class R> class shared_future<R&>;
template <> class shared_future<void>;
template <class> class packaged_task; // undefined
template <class R, class... ArgTypes>
class packaged_taskArgTypes...)>;
template <class R>
void swap(packaged_task&, packaged_task&) noexcept;
template <class R, class Alloc>
struct uses_allocator<packaged_task<R>, Alloc>;
template <class F, class... Args>
future<typename result_of<typename decay<F>::type(typename decay::type...)>::type>
async(F&& f, Args&&... args);
template <class F, class... Args>
future<typename result_of<typename decay<F>::type(typename decay::type...)>::type>
async(launch policy, F&& f, Args&&... args);
}

其他资料

Books

Scott Meyers 的 Overview of the New C++ (C++11/14)

Online resources

Scott Meyers 的Summary of C++11 Feature Availability in gcc and MSVC
C++11 on cppreference
C++11 on cplusplus
Bjarne Stroustrup 的C++11 FAQ
C++11 Wiki
C++ standards drafts on GitHub
C documentation for Thread support library

浅谈C++ Multithreading Programming

2014-08-07T00:00:00+08:00

Overview

随着多核 CPU 随处可见,多线程(multithreading)可以被用来实现并行,提高 CPU 的利用率和性能显著的提高.掌握多线程编程也成为现代实现软件的基本要求技能之一.Introduction to Parallel Computing详细的介绍了 Parallel Computing; 为什么使用它;Parallel Computing 的分类;Parallel Computing 的 limits 和 costs; Parallel Computing 的程序模型;如何设计 Parallel 程序等.

这里先介绍多线程的概念,多线程中涉及的基本概念,然后用实例介绍 Pthread 库的使用,并介绍 Google Code 中如何把它封装成 C++类,最后介绍可移植并大量使用的 Boost Thread 库.

还有一些其他的 Thread 库:

OpenMP 是一个可移植的接口,在共享内存的多处理器上实现 fork-join 并行. OpenMP 的 tutorial:这里和这里,
OMNI Thread为 C++提供 Thread 操作的.
其他 multithreading libraries.

Thread

定义

A thread is defined as an independent stream of instructions that can be scheduled to run as such by the operating system.所以它是在程序中独立于其他代码可由操作系统调度的一段指令.

那么是操作系统是如何具体实现这一独立性呢?

要理解 thread,必须先明白 process.进程由操作系统创建来运行相应的程序,进程包含程序资源和程序执行状态的信息.以 Linux 的进程为例包含:

Process ID, process group ID, user ID, and group ID
Environment
Working directory
Program instructions
Registers
Stack
Heap
File descriptors
Signal actions
Shared libraries
Inter-process communication tools (such as message queues, pipes, semaphores, or shared memory).

Thread 使用 Process 的资源,并且能成为独立的元件被操作系统调度,是因为它仅重复那些使得它们能成为独立运行代码的必要资源.Thread 维护它自己如下的信息:

Stack pointer
Registers
Scheduling properties (such as policy or priority)
Set of pending and blocked signals
Thread specific data.

与 Process 比较,Thread 可以总结如下:

Thread 相当于一个 lightweight 的 Process,拥有如 ID,properties 等相似信息, 但仅仅包含能使得它独立运行的信息即可.
信息包含的不同,与需要复制大量信息来创建 Process 比,Thread 的创建比较快捷.
独立的 Processes 不共享任何信息.每个 Process 运行在独立的地址空间.Threads 共享所在 Process 的资源,全局变量和文件描述符.
Threads 可以直接与所在同一 Process 的 Threads 通信.而 Processes 必须使用 inter-process communication(IPC)来通信.
因为 Threads 的资源共享性,增加并行操作资源的难度,需要引入 Thread 同步机制来达到资源并行性.

一些术语

Posix Thread 基本模型如下图,一些有关其中 Thread 的术语:

Lightweight Process(LWP) 可以被认为虚拟的 CPU,在系统中通常 LWP 的个数大于实际 CPU 个数.Thread 库通过与 LWP 沟通来调度 thread.LWP 也通常被称为 kernel threads.
Contention Scope 是如何决定哪个线程得到调度.
Bound threads 拥有系统级别的 Contention Scope,也就是,它们与其他进程一同竞争.
Unbound threads 拥有进程级别的 Contention Scope.
Thread-safe 意味共享数据被得到保护,可以同时被多个 thread 调用而安全.
Reentrant code 意味程序可以被多个 thread 并行运行.
asynchronous-safe function 在 signal handler 下被安全调用并没有任何副作用.
Concurrency vs. Parallelism 并不一样.Parallelism 意味同时运行代码.而 Concurrency 意味许多任务可以以任何顺序执行或可以是并行运行.

Amdahl 法则和 Pareto 原则

Threads 能提供益处 对于相适 的应用.所以 thread 的并行性对于应用来说也有它的限制.

Amdahl 法则

Amdahl 法则陈述到潜在的程序加速由能被并行的代码率 P 定义为:

$$ \begin{align} speedup = \dfrac{1}{1-P} \end{align} $$

如果代码中没有能并行的部分,P=0,那么 speedup=1.
如果所有代码都能并行,P=1,那么 speedup 能达到无限(理论上).
如果 50%的代码能并行,那么最大的 speedup=2,也就是最多 2 倍的加速.

引入能并行的处理器个数,那么进一步可以定义为:

$$ \begin{align} speedup = \dfrac{1}{\dfrac{P}{N} + (1-P)} 其中 P 并行率,N 处理器个数 \end{align} $$

Pareto 原则

Pareto 原则陈述到 80%的处理器时间花在 20%的代码中.所以仔细分析代码,不要把时间花在并行/优化那部分不重要的代码.

Thread 设计模式¹

在程序中有不同的方法使用线程,这里讨论 3 种线程设计模式,没有哪一种模式最好,每种模式都有相应适合的应用场合.

Boss/worker(Thread pool)

如上图,一个 Boss 线程创建其他 Worker 线程,并给它们分配任务,必要的话,并等待其他线程运行结束.通常 Boss 线程会在初始建立 Thread Pool 来为之后分配.尽管线程是轻量级的,但是创建它们仍是有开销的.

Peer(Workcrew)

Peer 模式又叫做 workcrew 模式,一个 thread 创建其他 peer threads 当程序开始,但是如上图,与 Boss/worker 模式不同,这个 thread 之后也变成 peer thread 去处理自己的任务.

Pipeline

Pipeline 模式假定:

一串连续长输入.
每个输入经过一连串的子操作(熟知为 stages 或 fliers).
每个处理 stage 能一次处理个不同的输入.

如上图, Pipeline 就像流水线一般,每个 thread 是一个长链中的一部分.每个 thread 处理由之前 thread 过的数据.

线程同步原语

如上线程中的定义,线程们共享进程中的全局变量或资源,它们可以并行同时对这些数据和资源操作,如果没有一定的机制协调它们,那么数据或资源将处于一个不安全状态,引起诸如如下的一些问题:

Race condition发生于不能决定行为的结果因为线程们操作共享数据或资源没有遵循一定的同步规则.
ABA problem发生于一个地方被读取两次,都读到相同的值,’值是相同的’被用来说明’没有东西被改变’.但是,另外一个线程能在这两次读取中间执行操作并修改这个位置的值,然后做一些其他操作,最后把这个值改回去,以致愚弄第一个线程让它认为’没有东西被改变’,即使第二个线程的操作已经破坏了这个假设.

所以我们需要如下的一些线程同步原语满足不同的线程间同步需求.

Mutex

Mutex 又被称为 Lock,所以它就像一把 Lock,一个线程 Lock 住一段资源,那么其他线程就不能去访问那段资源,只有等到第一个线程 Unlock 那么资源,它才能访问.

在 Lock 和 Unlock 之间的代码,一般被称为 critical section.

Mutex 也包含一些复杂的类型,如下:

Recursive: 允许占有锁的那一个线程再次获取同样的锁,对递归算法是必要的.
Queuing: 使得公平的获取锁,通过 FIFO 排序锁的请求.
Reader/Writer(rwlock): 允许多个 reader 同时获取锁,如果有 reader 占用锁,writer 只有等到 reader 释放锁.
Scoped: RAII 类型定义的锁获取和解锁.

但 Mutex 也会引入其他一些问题,如deadlock 和 priority inversion.

在 Blog 中之前浅谈 Mutex (Lock)中可以看到更多有关 Mutex 的性能和开销分析,并如何实现一个轻量级的 Mutex.

Join

线程 join 机制能让一个线程 join 到另外一个线程中.比如一个子线程 join 回主线程,那么主线程就会等待子线程运行结束.从而达到线程间等待的同步机制.

Condition Variable

Condition variable 允许线程同步到某个共享资源的某个值.

比如,程序有一个计数器,当计数器达到某一个值时去激活某个线程运行.把计数器当成一个 Condition variable.这个线程可以等待这个 Condition variable,其他 active 线程操作完这个 Condition variable,可以通过 signal/broadcast 去唤醒那些等待这个 Condition variable 睡眠的线程.

Barrier

Barrier 是一种能让一系列线程在某个点得到同步的方法,通过让参与 barrier 的线程等待直到所有参与线程都调用了这个 barrier 函数.本质上就是,阻塞所有参与 barrier 的线程直到最慢的那个参与线程调用 barrier.

Spinlock

Spinlock 与 mutex 类似,是种锁,但当获取锁失败时,spinlock 不会让线程进入睡眠,而是不断 poll 去获取这个锁直到获取成功.更多Mutex 与 Spinlock 的区别.

Semaphore

当某些资源具有多个时,简单的 Mutex 不能满足,引入 Semphore,Semphore 可以根据资源个数初始化为任意值.当线程们占有所有资源,使得 Semphore 为 0,那么其他线程再获取资源只有等待.当 Semphore 值只能是 1 或 0 时,它相当于简单的 Mutex.

Pthread

Overview

原始的 Pthread API 由 ANSI/IEEE POSIX 1003.1 - 1995 standard 定义.POSIX 标准也随着时间不断改进.

接下来主要把 Pthread API 分成如下主要 5 部分:

Thread Management
Mutex Variables
Condition Variables
Synchronization
Miscellaneous

如果想把 Pthread 封装成类对象或 Scoped Lock,可以参考之后 Google wrap the Pthread,或直接使用之后介绍的Boost thread library.

如果更全面的 API 参考文章最后的Pthread Library Routines Reference.更多有关资料参考文章后的其他资料.

编译 Pthread 程序

include

对于 POSIX 系统,包含头文件 pthread.h. 如果使用 semaphore, 包含 semaphore.h.

#include 
#include

compile

对于 Gcc 编译器,使用选项 -l,如下:

gcc Program.o -o Program -lpthread

Thread Management

Creating and Terminating Threads

APIs

int pthread_create(pthread_t *thread,
              const pthread_attr_t *attr,
              void *(*start_routine)(void*), void *arg);
void pthread_exit(void *value_ptr);
int pthread_cancel(pthread_t thread);
int pthread_attr_init(pthread_attr_t *attr);
int pthread_attr_destroy(pthread_attr_t *attr);

Creating Threads

pthread_create 创建一个新的线程并运行它.它能在代码的任何处被多次调用.

pthread_create 的参数:

thread:返回新 thread 程的唯一标识.
attr:设置 thread 的性质.NULL 为默认性质.
start_routine: 新 thread 运行的函数指针.
arg:传给 start_routine 的参数,必须强制转换成 void *.NULL 为没有参数传入.

Process 能创建的最大 thread 个数由系统配置决定.如下 Ubuntu 打印出的结果:

$ limit
cputime         unlimited
filesize        unlimited
datasize        unlimited
stacksize       8MB
coredumpsize    0kB
memoryuse       unlimited
maxproc         62694
descriptors     1024
memorylocked    64kB
addressspace    unlimited
maxfilelocks    unlimited
sigpending      62694
msgqueue        819200
nice            0
rt_priority     0
rt_time         unlimited

Thread Attributes

pthread_attr_init 和 pthread_attr_destroy 被用来初始化/销毁 thread 性质对象.

性质包括:

Detached or joinable state
Scheduling inheritance
Scheduling policy
Scheduling parameters
Scheduling contention scope
Stack size
Stack address
Stack guard (overflow) size

Thread Binding

Pthread APIs 并没有提供 binding threads 到特定 cpus/cores 的接口.但不同系统可能包含这功能,比如提供非标准的pthread_setaffinity_np 接口.

比如设置两个线程都在 core0 上运行,如下设置:

cpu_set_t cpus;
CPU_ZERO(&cpus);
CPU_SET(0, &cpus);
pthread_setaffinity_np(thread[0], sizeof(cpu_set_t), &cpus);
pthread_setaffinity_np(thread[1], sizeof(cpu_set_t), &cpus);

Terminating Threads

一个线程有很多种方法终止:

线程从它的运行中正常放回.它的工作完成.
线程调用 pthread_exit 无论它的工作完成否.
线程被另外一个线程调用 pthread_cancel 来取消.
整个线程终止因为调用 exec() 或 exit().
main() 函数先完成,没有调用 pthread_exit.

pthread_exit() 允许指定一个可选的终止 status parameter.这个可选参数一般返回给线程”joining”到这个终止线程.

pthread_exit() 不关闭文件,在线程打开的任何文件将继续打开在线程终止后.

在 main() 调用 pthread_exit() :

如果 main() 在它创建的 threads 之前终止,并没有显示的调用 pthread_exit(),这将是个问题.所有创建的线程将终止因为 main()结束,不再存在支持这些线程.
通过 main()在最后调用 pthread_exit(), main()将阻塞并保持存活来支持它创建的线程运行直到它们完成.

Example: Pthread Creation and Termination

如果注释掉 main()中最后的 pthread_exit(NULL); ,那么它创建的线程将会完成不了所有的打印而被强制退出.

#include 
#include 
#include 

void *ThreadProc(void *param) {
  int id;
  id = *(static_cast<int *>(param));
  for (int i = 0; i < 10; ++i) {
    printf("thread %d: run %d \n", id, i);
  }
  pthread_exit(NULL);
}

int main(int argc, char *argv[]) {
  const int kNumThreads = 4;
  pthread_t threads[kNumThreads];
  int thread_ids[kNumThreads];
  for (int i = 0; i < kNumThreads; ++i) {
    thread_ids[i] = i;
    int rt = pthread_create(&threads[i], NULL, ThreadProc,
                            static_cast<void *>(&thread_ids[i]));
    if (rt) {
      printf("ERROR: pthread_create failed, rt=%d\n", rt);
      exit(1);
    }
  }
  pthread_exit(NULL);
}

Joining and Detaching Threads

APIs

int pthread_join(pthread_t thread, void **value_ptr);
int pthread_detach(pthread_t thread);
int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate);
int pthread_attr_getdetachstate(const pthread_attr_t *attr,
              int *detachstate);

Joining

Joining 是同步不同线程的方法之一,原理如下图:

pthread_join() 阻塞调用它的线程直到指定的 threadid 的线程终止.
调用的线程能获取目标线程终止返回的 status 只要目标线程调用 pthread_exit().
当一个线程被创建,它的属性之一是它是否可以 join.只有创建的能被 join 的线程才能被 join.如果线程线程以 detached 创建,它永远都不能被 join.
显示的创建一个线程可 join 或 detached,使用在 pthread_create() 中的 attr 参数.典型的步骤是:
1. 定义 pthread_attr_t 类型的 pthread 属性;
2. 用 pthread_attr_init() 初始化属性变量;
3. 用 pthread_attr_setdetachstate() 设置 detached 属性;
4. 用 pthread_attr_destroy() 释放属性使用的资源.

Detaching

pthread_detach() 能显示的 detach 一个线程即使它是以可 join 创建.
没有相反的操作.

Example: Pthread Joining

#include 
#include 
#include 

void *ThreadProc(void *param) {
  int id;
  id = *(static_cast<int *>(param));
  for (int i = 0; i < 10; ++i) {
    printf("thread %d: run %d \n", id, i);
  }
  pthread_exit(param);
}

int main(int argc, char *argv[]) {
  const int kNumThreads = 4;
  pthread_t threads[kNumThreads];
  int thread_ids[kNumThreads];
  pthread_attr_t attr;

  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

  for (int i = 0; i < kNumThreads; ++i) {
    thread_ids[i] = i;
    int rt = pthread_create(&threads[i], &attr, ThreadProc,
                            static_cast<void *>(&thread_ids[i]));
    if (rt) {
      printf("ERROR: pthread_create failed, rt=%d\n", rt);
      exit(1);
    }
  }
  for (int i = 0; i < kNumThreads; ++i) {
    void *status;
    int rt = pthread_join(threads[i], &status);
    if (rt) {
      printf("ERROR: pthread_join failed, rt=%d\n", rt);
      exit(1);
    }
    printf("completed join with thread %d having a status of %d\n"
           , i, *static_cast<int *>(status));
  }
  pthread_exit(NULL);
}

Stack Management

APIs

int pthread_attr_getstacksize(const pthread_attr_t *restrict attr,
              size_t *restrict stacksize);
int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize);
int pthread_attr_getstackaddr(const pthread_attr_t *restrict attr,
              void **restrict stackaddr);
int pthread_attr_setstackaddr(pthread_attr_t *attr, void *stackaddr);

每个线程都有各自独立的 stack, pthread_attr_getstackaddr 和 pthread_attr_setstackaddr 分别获取和设置线程的 stack 属性.

Example: Stack Management

#include 
#include 
#include 

pthread_attr_t attr;

void *ThreadProc(void *param) {
  int id;
  size_t thread_stack_size;
  id = *(static_cast<int *>(param));
  pthread_attr_getstacksize(&attr, &thread_stack_size);
  printf("thread %d: stack size = %d\n", id, thread_stack_size);
  for (int i = 0; i < 10; ++i) {
    printf("thread %d: run %d \n", id, i);
  }
  pthread_exit(NULL);
}

int main(int argc, char *argv[]) {
  const int kNumThreads = 4;
  const int kThround = 1000;
  pthread_t threads[kNumThreads];
  int thread_ids[kNumThreads];
  size_t stack_size;

  pthread_attr_init(&attr);
  pthread_attr_getstacksize(&attr, &stack_size);
  printf("Default stack size = %d\n", stack_size);
  stack_size = sizeof(double) * kThround * kThround;
  printf("Setting stack size = %d\n", stack_size);
  pthread_attr_setstacksize(&attr, stack_size);
  for (int i = 0; i < kNumThreads; ++i) {
    thread_ids[i] = i;
    int rt = pthread_create(&threads[i], &attr, ThreadProc,
                            static_cast<void *>(&thread_ids[i]));
    if (rt) {
      printf("ERROR: pthread_create failed, rt=%d\n", rt);
      exit(1);
    }
  }
  pthread_exit(NULL);
  pthread_attr_destroy(&attr);
  return 0;
}

Miscellaneous

pthread_t pthread_self(void);
int pthread_equal(pthread_t t1, pthread_t t2);
int pthread_once(pthread_once_t *once_control,
              void (*init_routine)(void));
pthread_once_t once_control = PTHREAD_ONCE_INIT;

pthread_self 返回调用线程的唯一 thread ID.
pthread_equal 比较两个线程 ID 是否相等.
pthread_once 只执行 init_routine 仅仅一次在进程中.

Mutex Variables

Overview

Mutex 以”mutual exclusion”(互斥)简称.

Mutex variable 就像一把”锁”一样保护共享数据资源.mutex 的基本概念就是,只有一个线程能 lock 一个 mutex 变量在任何时候.所以,即使很多线程尝试去锁一个 mute,也仅仅只有一个线程能成功.

典型使用 mutex 的顺序如下:

创建和初始化 mutex 变量;
许多线程尝试锁住 mutex;
只有一个线程成功锁住 mutex,其他线程等待;
拥有 mutex 的线程进行自己的操作;
拥有线程解锁 mutex;
其他线程继续获取 mutex 并持续如上步骤;
最后 mutex 销毁.

Creating and Destroying Mutexes

int pthread_mutex_destroy(pthread_mutex_t *mutex);
int pthread_mutex_init(pthread_mutex_t *restrict mutex,
          const pthread_mutexattr_t *restrict attr);
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
int pthread_mutexattr_destroy(pthread_mutexattr_t *attr);
int pthread_mutexattr_init(pthread_mutexattr_t *attr);

Mutex 变量由 pthread_mutex_t 声明定义,而且必须初始化在使用前.两种方法初始:

静态的,当声明时.如:

pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

动态的,使用 pthread_mutex_init() 函数,并能设置 mutex 的属性 attr.

attr 用来设置 mutex 变量的属性,必须是 pthread_mutexattr_t 类型.Pthread 标准中定义的 3 种可选 mutex 属性:

Protocol: Specifies the protocol used to prevent priority inversions for a mutex.
Prioceiling: Specifies the priority ceiling of a mutex.
Process-shared: Specifies the process sharing of a mutex.(Pthread mutex 能被 process 间使用).

Locking and Unlocking Mutexes

int pthread_mutex_lock(pthread_mutex_t *mutex);
int pthread_mutex_trylock(pthread_mutex_t *mutex);
int pthread_mutex_unlock(pthread_mutex_t *mutex);

pthread_mutex_lock() 函数被用来获取传入的 mutex 变量,如果 mutex 已经被其他线程占用,那么这个调用就阻塞调用线程,使它进入睡眠等待这个 mutex 直到它被释放.

pthread_mutex_trylock() 仅尝试获取锁,若不成功也立即返回’busy’信号.

Example: Using Mutexes

#include 
#include 
#include 

struct ThreadData {
  int tid;
  int data;
};

int shared_x;
pthread_mutex_t lock;

void *ThreadProc(void *param) {
  ThreadData *data = static_cast(param);
  printf("begin from thread id: %d\n", data->tid);
  pthread_mutex_lock(&lock);
  shared_x += data->data;
  printf("thread %d: x = %d\n", data->tid, shared_x);
  pthread_mutex_unlock(&lock);
  pthread_exit(NULL);
}

int main(int argc, char *argv[]) {
  const int kNumThreads = 4;
  pthread_t threads[kNumThreads];
  ThreadData threads_data[kNumThreads];
  pthread_attr_t attr;

  shared_x = 0;
  pthread_mutex_init(&lock, NULL);
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
  for (int i = 0; i < kNumThreads; ++i) {
    threads_data[i].tid = i;
    threads_data[i].data = i * i;
    int rt = pthread_create(&threads[i], &attr, ThreadProc,
                            static_cast<void *>(&threads_data[i]));
    if (rt) {
      printf("ERROR: pthread_create failed, rt=%d\n", rt);
      exit(1);
    }
  }
  for (int i = 0; i < kNumThreads; ++i) {
    void *status;
    pthread_join(threads[i], &status);
  }
  pthread_attr_destroy(&attr);
  pthread_exit(NULL);
  return 0;
}

Condition Variables

Overview

Mutex 变量如锁一般防止多个线程访问共享数据资源,如果某个线程等待某个共享数据达到某个数值才进行相应的操作,那么这个线程需要不断的去 poll,查看是否满足需要的值,这样开销很大,因为线程需要一直处于忙状态.

引入 Condition Variables 来完成这样的同步到某个实际数据值而不要不断 poll.

Condition 变量一般与 mutex 一起使用.锁住查看的共享数据资源.

使用 Condition 的一般步骤如下:

声明和定义需要同步的共享数据;
声明和定义 condition 变量;
声明和定义相对应的 mutex;
创建线程使用 condition 变量同步.

Creating and Destroying Condition Variables

int pthread_cond_destroy(pthread_cond_t *cond);
int pthread_cond_init(pthread_cond_t *restrict cond,
                      const pthread_condattr_t *restrict attr);
int pthread_condattr_destroy(pthread_condattr_t *attr);
int pthread_condattr_init(pthread_condattr_t *attr);

Condition 变量由 pthread_cond_t 声明定义,而且必须初始化在使用前.两种方法初始:

静态的,当声明时.如:

pthread_cond_t convar = PTHREAD_COND_INITIALIZER;

动态的,使用 pthread_cond_init() 函数,并能设置 condition 的属性 attr.

attr 用来设置 condition 变量的属性,必须是 pthread_condattr_t 类型.只有一种属性可选:是否进程共享,也就是允许其他进程中的线程也能看到它.

Waiting and Signaling on Condition Variables

int pthread_cond_wait(pthread_cond_t *cond,
                      pthread_mutex_t *mutex);
int pthread_cond_signal(pthread_cond_t *cond);
int pthread_cond_broadcast(pthread_cond_t *cond);

pthread_cond_wait() 阻塞调用它的线程直到其中 cond 被 signal.这个函数需要在占有 mutex 时被调用,而它将 自动释放 mutex 当它等待时.等到 signal 收到,线程被唤醒, mutex 将 自动被占有 .最后当线程完成 condition 的操作,要负责对 mutex 解锁.

pthread_cond_signal() 用来 signal 其他等待这个 cond 的线程.它需要在占有 mutex 时被调用.然后必须对 mutex 解锁来完成 pthread_cond_wait 的等待.

如果有多余一个线程处于等待 cond 而阻塞, 应该用 pthread_cond_broadcast() 替换 pthread_cond_signal().

Example: Using Condition Variables

#include 
#include 
#include 
#include 

const int kNumThreads = 3;
const int kLoops = 10;
const int kCountLimit = 15;

int g_count;
pthread_mutex_t count_mutex;
pthread_cond_t count_cv;

void *IncreaseCount(void *param) {
  int id;
  id = *(static_cast<int *>(param));
  for (int i = 0; i < kLoops; ++i) {
    pthread_mutex_lock(&count_mutex);
    g_count++;
    if (g_count == kCountLimit) {
      pthread_cond_signal(&count_cv);
      printf("increse thread %d: count = %d, signal cond\n", id, g_count);
    }
    printf("increse thread %d: count = %d, unlock mutex\n", id, g_count);
    pthread_mutex_unlock(&count_mutex);
    sleep(1);
  }
  pthread_exit(NULL);
}

void *WatchCount(void *param) {
  int id;
  id = *(static_cast<int *>(param));
  pthread_mutex_lock(&count_mutex);
  while (g_count < kCountLimit) {
    pthread_cond_wait(&count_cv, &count_mutex);
    printf("watch thread %d: count = %d, receive signal\n", id, g_count);
  }
  pthread_mutex_unlock(&count_mutex);
  pthread_exit(NULL);
}

int main(int argc, char *argv[]) {
  pthread_t threads[kNumThreads];
  int thread_ids[kNumThreads];
  pthread_attr_t attr;

  pthread_mutex_init(&count_mutex, NULL);
  pthread_cond_init(&count_cv, NULL);
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
  for (int i = 0; i < kNumThreads; ++i) {
    thread_ids[i] = i;
  }
  int rt;
  rt = pthread_create(&threads[0], &attr, WatchCount,
                            static_cast<void *>(&thread_ids[0]));
  if (rt) {
    printf("ERROR: pthread_create failed, rt=%d\n", rt);
    exit(1);
  }
  rt = pthread_create(&threads[1], &attr, IncreaseCount,
                            static_cast<void *>(&thread_ids[1]));
  if (rt) {
    printf("ERROR: pthread_create failed, rt=%d\n", rt);
    exit(1);
  }
  rt = pthread_create(&threads[2], &attr, IncreaseCount,
                            static_cast<void *>(&thread_ids[2]));
  if (rt) {
    printf("ERROR: pthread_create failed, rt=%d\n", rt);
    exit(1);
  }
  for (int i = 0; i < kNumThreads; ++i) {
    pthread_join(threads[i], NULL);
  }
  pthread_attr_destroy(&attr);
  pthread_cond_destroy(&count_cv);
  pthread_mutex_destroy(&count_mutex);
  pthread_exit(NULL);
}

Barrier

Overview

Barrier 就是栅栏一样,调用等待 barrier 的线程需要等待直到满足调用 barrier 的线程个数达到要求的 count.

Creating, Destroying and Wait Barrier

int pthread_barrier_init(pthread_barrier_t *barrier,
                const pthread_barrierattr_t *attr, unsigned count);
pthread_barrier_t barrier = PTHREAD_BARRIER_INITIALIZER(count);
int pthread_barrier_destroy(pthread_barrier_t *barrier);
int pthread_barrierattr_init(pthread_barrierattr_t *attr);
int pthread_barrierattr_destroy(pthread_barrierattr_t *attr);
int pthread_barrier_wait(pthread_barrier_t *barrier);

Barrier 变量由 pthread_barrier_t 声明定义,而且必须初始化在使用前.需要传入满足 barrier 等待的个数 count, 两种方法初始:

静态的,当声明时.如:

pthread_barrier_t barrier = PTHREAD_BARRIER_INITIALIZER(count);

动态的,使用 pthread_barrier_init() 函数,并能设置 barrier 的属性 attr.

线程调用 barrier,只需要调用 pthread_barrier_wait 来等待 barrier 达到满足条件.

Google wrap the Pthread

Mutex 类和 CondVar 类

Google api 的 base 包里封装了 Mutex 类和 CondVar 类.

namespace base {
enum LinkerInitialized { LINKER_INITIALIZED };
}

class LOCKABLE PThreadMutex {
 public:
  explicit PThreadMutex(base::LinkerInitialized) {
    pthread_mutex_init(&mutex_, NULL);
  }
  PThreadMutex()   { pthread_mutex_init(&mutex_, NULL); }
  ~PThreadMutex()  { pthread_mutex_destroy(&mutex_); }

  void Lock()     { CHECK_EQ(0, pthread_mutex_lock(&mutex_)); }
  void Unlock()   { CHECK_EQ(0, pthread_mutex_unlock(&mutex_)); }

 private:
  friend class PThreadCondVar;
  pthread_mutex_t mutex_;

  DISALLOW_COPY_AND_ASSIGN(PThreadMutex);
};

class PThreadCondVar {
 public:
  PThreadCondVar()  { pthread_cond_init(&cv_, NULL); }
  ~PThreadCondVar() { CHECK_EQ(0, pthread_cond_destroy(&cv_)); }

  void Signal()        { CHECK_EQ(0, pthread_cond_signal(&cv_)); }
  void SignalAll()     { CHECK_EQ(0, pthread_cond_broadcast(&cv_)); }
  void Wait(PThreadMutex* mu) {
    CHECK_EQ(0, pthread_cond_wait(&cv_, &mu->mutex_));
  }
  bool WaitWithTimeout(PThreadMutex* mu, int64 millis) {
    struct timeval tv;
    struct timespec ts;
    gettimeofday(&tv, NULL);
    ts.tv_sec = tv.tv_sec + millis / 1000;
    ts.tv_nsec = millis % 1000;
    int result = pthread_cond_timedwait(&cv_, &mu->mutex_, &ts);
    if (!result) return true;

    CHECK_EQ(ETIMEDOUT, result);
    return false;
  }

 private:
  pthread_cond_t cv_;
  DISALLOW_COPY_AND_ASSIGN(PThreadCondVar);
};

typedef PThreadCondVar CondVar;
typedef PThreadMutex Mutex;

GoogleOnceInit 类

typedef pthread_once_t GoogleOnceType;
#define GOOGLE_ONCE_INIT PTHREAD_ONCE_INIT

inline void GoogleOnceInit(GoogleOnceType* once, void (*initializer)()) {
  CHECK_EQ(0, pthread_once(once, initializer));
}

Monitoring, Debugging and Performance Analysis Tools for Pthreads

Monitoring

Linux ps command

使用 Linux 自带的 ps 命令查看运行的 thread 情况,ps 的 man 手册.

➜$ ps -Lf
UID        PID  PPID   LWP  C NLWP STIME TTY          TIME CMD
shougang 13103  8814 13103  0    1 23:30 pts/17   00:00:00 /bin/zsh
shougang 13237 13103 13237  0    6 23:30 pts/17   00:00:00 [thread]
shougang 13237 13103 13240  0    6 23:30 pts/17   00:00:00 [thread]
shougang 13237 13103 13241  0    6 23:30 pts/17   00:00:00 [thread]
shougang 13237 13103 13242  0    6 23:30 pts/17   00:00:00 [thread]
shougang 13237 13103 13243  0    6 23:30 pts/17   00:00:00 [thread]
shougang 13237 13103 13244  0    6 23:30 pts/17   00:00:00 [thread]

Linux top command

Linux 的 top 命令加上 -H 参数, process 中的 threads 也能看到. 如下是 top -H 的一个例子:

Debugging

GDB

DDD

Examining Threads

Performance Analysis Tools²

Boost thread library

Overview

直到 C++11 库才比较好的支持 thread,之前 C++程序使用操作系统支持的 thread 库(如 Pthread).但这样做至少有个主要的问题:(1) 这些库基本是 C 的库,需要很小心的 C++中使用,和(2) 每个操作系统提供自己的一套对 thread 支持的库.以致,编写的代码既不标准又不可移植.

Boost Thread可以解决这两个主要问题. Boost Thread 不是通过继承来使用线程,而是 Boost 的 thread 类使用一个 Callable 的对象创建.

编译 Boost Thread 程序

include

根据使用到的 Boost Thread 中的类型包含不同头文件:

#include 
#include 
#include 
#include  
#include

compile

对于 Gcc 编译器,使用选项 -l,如下:

g++ Program.o -o Program -lboost_thread -lboost_system

Thread Management

Thread 类

  #include 
  class thread
  {
  public:
      thread() noexcept;
      thread(const thread&) = delete;
      thread& operator=(const thread&) = delete;

      thread(thread&&) noexcept;
      thread& operator=(thread&&) noexcept;
      ~thread();

      template <class F>
      explicit thread(F f);
      template <class F>
      thread(F &&f);

      template <class F,class A1,class A2,...>
      thread(F f,A1 a1,A2 a2,...);
      template <class F, class ...Args>
      explicit thread(F&& f, Args&&... args);

      template <class F>
      explicit thread(attributes& attrs, F f); // EXTENSION
      template <class F>
      thread(attributes& attrs, F &&f); // EXTENSION
      template <class F, class ...Args>
      explicit thread(attributes& attrs, F&& f, Args&&... args);
      class id;
      id get_id() const noexcept;

      bool joinable() const noexcept;
      void join();
...
};

整个 thread 类包含 thread 的所有特性,如 thread id, join, detach 等.

Create Thread

Callable 对象既可以是一个函数又可以是类中的 operator() 实现,如下:

void hello() {
  cout << "hello world" << endl;
}

struct Hello {
 public:
  void operator() () {
    cout << "hello world" << endl;
  }
};

Hello h;
boost::thread thread_hello(h);
boost::thread thread_hello(hello);

Passing data

传递参数给线程

thread 创建时附加后面

void hello(const string &str) {
  cout << str << endl;
}
string str = "hello";
boost::thread thrd(hello, str);

利用 Boost.bind 库接口

void hello(const string &str) {
  cout << str << endl;
}
string str = "hello";
string str = "hello";
boost::thread thrd(bind(hello, str));

Thread group 类

可以使用 thread group 类管理 thread,通过 add_thread 和 create_thread 添加线程到管理类中, 可以直接 join_all 将所有管理类中的线程 join.

#include 

class thread_group
{
public:
    thread_group(const thread_group&) = delete;
    thread_group& operator=(const thread_group&) = delete;

    thread_group();
    ~thread_group();

    template<typename F>
    thread* create_thread(F threadfunc);
    void add_thread(thread* thrd);
    void remove_thread(thread* thrd);
    bool is_this_thread_in();
    bool is_thread_in(thread* thrd);
    void join_all();
    void interrupt_all();
    int size() const;
};

Miscellaneous

Boost Thread 中还有 strict_scoped_thread 类和 scoped_thread 类,提供线程结束不是调用 terminate ,而是调用传入的参数来执行特定行为.

Example: Thread Creation

#include 
#include 
#include 
#include 

using namespace std;
using namespace boost;

void hello(const string &str) {
  cout << str << endl;
}

int main(){
  string str = "hello";
  boost::thread thrd(bind(hello, str));
  thrd.join();
  return 0;
}

Mutex Variables

#include 

class mutex:
    boost::noncopyable
{
public:
    mutex();
    ~mutex();

    void lock();
    bool try_lock();
    void unlock();

    typedef platform-specific-type native_handle_type;
    native_handle_type native_handle();

    typedef unique_lock<mutex> scoped_lock;
    typedef unspecified-type scoped_try_lock;
};

lock() 来获取锁.
unlock() 释放锁.
typedef unique_lock scoped_lock; 定义了 scoped_lock 的类型,通过 boost::mutex::scoped_lock 来定义一个 RAII-style 锁,离开定义区域自动释放锁.

`lock_guard`

boost::lock_guard 非常简单:

构造时,它获取锁.
析构时,它释放锁.

它提供了一个简单的 RAII-style 锁对象,使得 exception-safe 锁和解锁更容易.

namespace boost
{
  template<typename Lockable>
  class lock_guard
#if ! defined BOOST_THREAD_NO_MAKE_LOCK_GUARD
  template <typename Lockable>
  lock_guard<Lockable> make_lock_guard(Lockable& mtx); // EXTENSION
  template <typename Lockable>
  lock_guard<Lockable> make_lock_guard(Lockable& mtx, adopt_lock_t); // EXTENSION
#endif
}

基本使用,传入可 Lockable 的 mutex 类型:

boost::mutex count_mutex;
boost::lock_guard<mutex> lock(count_mutex) ;

Example: Using Mutexes

#include 
#include 
#include 

using std::cout;
using std::endl;
boost::mutex count_mutex;

struct count {
  count(int id) : id_(id) {
  }
  void operator() () {
    for (int i = 0; i < 10; ++i) {
      boost::mutex::scoped_lock lock(count_mutex);
      cout << id_ << ": " << i << endl;
    }
  }
  int id_;
};

int main(int argc, char *argv[]) {
  boost::thread thread1(count(1));
  boost::thread thread2(count(2));
  thread1.join();
  thread2.join();
  return 0;
}

Condition Variables

与 Pthread, Boost Condition Variable 功能更全面,如不同条件的 wait_until , wait_for 等功能.

namespace boost
{
    class condition_variable
    {
    public:
        condition_variable();
        ~condition_variable();

        void notify_one() noexcept;
        void notify_all() noexcept;

        void wait(boost::unique_lock<boost::mutex>& lock);

        template<typename predicate_type>
        void wait(boost::unique_lock<boost::mutex>& lock,predicate_type predicate);

        template <class Clock, class Duration>
        typename cv_status::type
        wait_until(
            unique_lock& lock,
            const chrono::time_point<Clock, Duration>& t);
...
};

Example: Using Condition Variables

利用 Condition Variables 实现一个简单的 read/writer Buffer.

#include 
#include 
#include 
#include 
using std::cout;
using std::endl;
const int kBufSize = 10;
const int kIters = 100;

boost::mutex io_mutex;

class Buffer {
 public:
  typedef boost::mutex::scoped_lock scoped_lock;
  Buffer() : p(0), c(0), full(0) {
  }

  void Put(int m) {
    scoped_lock lock(mutex);
    if (full == kBufSize) {
      {
        scoped_lock lock(io_mutex);
        cout << "Buffer is full." << endl;
      }
      while (full == kBufSize) {
        cond.wait(lock);
      }
    }
    buf[p] = m;
    p = (p + 1) % kBufSize;
    ++full;
    cond.notify_one();
  }

  int Get() {
    scoped_lock lock(mutex);
    if (full == 0) {
      {
        scoped_lock lock(io_mutex);
        cout << "Buffer is empty." << endl;
      }
      while (full == 0) {
        cond.wait(lock);
      }
    }
    int i = buf[c];
    c = (c + 1) % kBufSize;
    --full;
    cond.notify_one();
    return i;
  }
 private:
  boost::mutex mutex;
  boost::condition cond;
  unsigned int p, c, full;
  int buf[kBufSize];
};

Buffer buf;

void writer() {
  for (int i = 0; i < kIters; ++i) {
    {
      boost::mutex::scoped_lock lock(io_mutex);
      cout << "sending: " << i << endl;
    }
    buf.Put(i);
  }
}

void reader() {
  for (int i = 0; i < kIters; ++i) {
    int n = buf.Get();
    {
      boost::mutex::scoped_lock lock(io_mutex);
      cout << "received: " << n << endl;
    }
  }
}

int main(int argc, char *argv[]) {
  boost::thread thread_reader(&reader);
  boost::thread thread_writer(&writer);
  thread_reader.join();
  thread_writer.join();
  return 0;
}

Miscellaneous

Barrier

基本使用:

定义 barrier,传入参与 thread 个数: barrier b(num_threads).
thread 中等待 barrier: b.wait().

class barrier
{
public:
    barrier(barrier const&) = delete;
    barrier& operator=(barrier const&) = delete;

    barrier(unsigned int count);
    template <typename F>
    barrier(unsigned int count, F&&);

    ~barrier();

    bool wait();
    void count_down_and_wait();
};

Once Routines

#include 

namespace boost
{
  struct once_flag;
  template<typename Function, class ...ArgTypes>
  inline void call_once(once_flag& flag, Function&& f, ArgTypes&&... args);

#if defined BOOST_THREAD_PROVIDES_DEPRECATED_FEATURES_SINCE_V3_0_0
  void call_once(void (*func)(),once_flag& flag);
#endif

}

Example: Call once

#include 
#include 
#include 

int i = 0;
boost::once_flag flag = BOOST_ONCE_INIT;

void init()
{
  ++i;
}

void thread()
{
  boost::call_once(&init, flag);
}

int main(int argc, char* argv[])
{
  boost::thread thrd1(&thread);
  boost::thread thrd2(&thread);
  thrd1.join();
  thrd2.join();
  std::cout << i << std::endl;
  return 0;
}

其他资料

Books:

PThreads Programming and Its notes
Anthony Williams’ blog and his book, C++ Concurrency in Action
Herlihy & Shavit’s The Art of Multiprocessor Programming

Online resources:

Dmitriy V’jukov’s website and various forum discussions
Bartosz Milewski’s blog
Charles Bloom’s Threading Posts on his blog
Doug Lea’s The JSR-133 Cookbook for Compiler Writers
memory-barriers.txt document
Hans Boehm’s collection of links about the C++11 memory model
Scott Meyers’s Information on the C++11 Memory Model
Herb Sutter’s Effective Concurrency series
POSIX Standard: http://www.unix.org/version3/ieee_std.html

有关 Mutex

Pthread Library Routines Reference

pthread_atfork
pthread_attr_destroy
pthread_attr_getdetachstate
pthread_attr_getguardsize
pthread_attr_getinheritsched
pthread_attr_getschedparam
pthread_attr_getschedpolicy
pthread_attr_getscope
pthread_attr_getstack
pthread_attr_getstackaddr
pthread_attr_getstacksize
pthread_attr_init
pthread_attr_setdetachstate
pthread_attr_setguardsize
pthread_attr_setinheritsched
pthread_attr_setschedparam
pthread_attr_setschedpolicy
pthread_attr_setscope
pthread_attr_setstack
pthread_attr_setstackaddr
pthread_attr_setstacksize
pthread_barrier_destroy
pthread_barrier_init
pthread_barrier_wait
pthread_barrierattr_destroy
pthread_barrierattr_getpshared
pthread_barrierattr_init
pthread_barrierattr_setpshared
pthread_cancel
pthread_cleanup_pop
pthread_cleanup_push
pthread_cond_broadcast
pthread_cond_destroy
pthread_cond_init
pthread_cond_signal
pthread_cond_timedwait
pthread_cond_wait
pthread_condattr_destroy
pthread_condattr_getclock
pthread_condattr_getpshared
pthread_condattr_init
pthread_condattr_setclock
pthread_condattr_setpshared
pthread_create
pthread_detach
pthread_equal
pthread_exit
pthread_getconcurrency
pthread_getcpuclockid
pthread_getschedparam
pthread_getspecific
pthread_join
pthread_key_create
pthread_key_delete
pthread_kill
pthread_mutex_destroy
pthread_mutex_getprioceiling
pthread_mutex_init
pthread_mutex_lock
pthread_mutex_setprioceiling
pthread_mutex_timedlock
pthread_mutex_trylock
pthread_mutex_unlock
pthread_mutexattr_destroy
pthread_mutexattr_getprioceiling
pthread_mutexattr_getprotocol
pthread_mutexattr_getpshared
pthread_mutexattr_gettype
pthread_mutexattr_init
pthread_mutexattr_setprioceiling
pthread_mutexattr_setprotocol
pthread_mutexattr_setpshared
pthread_mutexattr_settype
pthread_once
pthread_rwlock_destroy
pthread_rwlock_init
pthread_rwlock_rdlock
pthread_rwlock_timedrdlock
pthread_rwlock_timedwrlock
pthread_rwlock_tryrdlock
pthread_rwlock_trywrlock
pthread_rwlock_unlock
pthread_rwlock_wrlock
pthread_rwlockattr_destroy
pthread_rwlockattr_getpshared
pthread_rwlockattr_init
pthread_rwlockattr_setpshared
pthread_self
pthread_setcancelstate
pthread_setcanceltype
pthread_setconcurrency
pthread_setschedparam
pthread_setschedprio
pthread_setspecific
pthread_sigmask
pthread_spin_destroy
pthread_spin_init
pthread_spin_lock
pthread_spin_trylock
pthread_spin_unlock
pthread_testcancel

Footnotes:

http://maxim.int.ru/bookshelf/PthreadsProgram/htm/r_19.html

https://computing.llnl.gov/?set=code&page=software_tools#perftools

Multithreading相关术语总结

2014-07-05T23:23:52+08:00

在谈到内存模型,Multithreading,尤其 lock-free programmming 等时,总会遇到一些相关术语来描述,如 Memory Barrier,Acquire semantics,Release semantics,happens-before relation 等.在这里稍微整理一下.

Memory Barriers

在之前浅谈 Memory Reordering 中谈到编译器 reordering 和在多核下的处理器的 reordering,在 lock-free programming 中,如果不控制好这两者的 reordering 就会引起上文中所不想的结果.

你可以通过指令强制 CPU 和编译器在内存处理上的顺序,这些指令就被成为 Memory Barrier.

有很多指令作为 memory barriers,所以需要知道很多不同类型的 memory barriers. Doug Lea 指出如下的四大类可以很好的归纳在 CPU 上的特殊指令.尽管不是完全,大多数时候,一个正真的 CPU 指令执行包含上面 barrier 类型的各种组合,或附带其他效果.无论如何, 一旦你理解了这四种类型的 memory barriers,你就很好的理解了大部分真正 CPU 的关于内存约束的指令. Memory Barriers Are Like Source Control Operations 这篇把 Memory Barriers 与 Source Control 作类比,熟悉 Source Control 机制的可以很形象的理解各类 Memory Barriers 机制.

LoadLoad

顺序: Load1; LoadLoad; Load2

保证 Load1 的数据加载在被 load2 和之后的 load 指令读取加载之前.是一个比较好的方法防止看到旧的数据.以这个经典的例子,CPU1 检查一个共享的标识变量 flag 来确认一些数据是否被 CPU1 更新.如果标识变量 flag 是 true 的话,把LoadLoadbarrier 放在读取更新数据之前:

if (is_updated) {
    LOADLOAD_FENCE();  // Prevent reordering of loads
    return value;  // Load updated value
}

只要is_updated被 CPU1 看到为 true, LoadLoadfence 防止 CPU1 读到比标识变量 flag 本身旧的value.

StoreStore

顺序: Store1; StoreStore; Store2

保证 Store1 的数据被其他 CPU 看到在与这数据相关的 Store2 和之后的 store 指令之前.同样,它足够的防止其他 CPU 看到自己的旧数据.同上一样的例子,CPU1 需要更新一些数据到共享的内存中,把StoreStore barrier 放在标识变量 flag 是 true 之前:

value = x;
STORESTORE_FENCE();
is_updated = 1;  // Set shared flag to show the update of data

一旦其他 CPU 看到is_updated为 true,它能自信它看到正确的value值.而且 value不需要原子类型,它可以是一个包含很多元素的大数据结构.

LoadStore

顺序: Load1; LoadStore; Store2

保证 Load1 的数据被加载在与这数据相关的 Store2 和之后的 store 指令之前.

StoreLoad

顺序: Store1; StoreLoad; Load2

保证 Store1 的数据被其他 CPU 看到在数据被 Load2 和之后的 load 指令加载之前.也就是说,它有效的防止所有 barrier 之前的 stores 与所有 barrier 之后的 load 乱序.

StoreLoad是唯一的.它是唯一的 memory barrier 类型来防止r1=r2=0在之前 Memory ordering at processor time 中给出的例子.

StoreLoad有什么区别与StoreStore之后跟LoadLoad?虽然,StoreStore按序把存储改变推送到主内存中,LoadLoad按序把改变加载过来,但是这两种类型的 barrier 是不够的.Store 可以延迟任意的指令,以致在 Load 之后,Load 也可以不是加载最新 Store 之后的内容.这就是为啥 PowerPC 的指令 lwsync,包含这三种 memory barriers,LoadLoad,LoadStore和 StoreStore,但不包含StoreLoad,是不足以防止r1=r2=0在那个实例中.

Data dependency barriers

除了上面 4 大类,还有Loadload的弱化模式的Data dependency barrier.如 LoadLoad类似,在两个 load 顺序执行,load2 依赖于 load1 的结果,Data dependency barrier需要插入保证两者的顺序.

但与LoadLoad不同,Data dependency barrier只是部分顺序约束在内在以来的 load,就是 load1 必须与 load2 是 data dependency 而不是仅仅是 control dependency.

data dependency

r1 与 r2 之间是 data dependency.

r1 = 1;
r2 = r1;

control dependency

r1 与 r2 之间是 control dependency.

r1 = value;
if (r1) {
    r2 = r1;
} else {
    r2 = 1;
}

Acquire and Release semantics

在 lock-free programming 中,共享内存被多个线程通过合作传递信息来处理,在这种处理下,acquire 和 release semantics 是关键技术保证可靠的传递信息在线程之间.

acqure 和 release semantics 并没有好的被定义,这里借用 Jeff Preshing 在这里给予的定义:

Acquire semantics 是一种只能应用于如下操作的性质: 从共享内存读取,无论是 read-modify-write 操作还是普通的加载.这一操作被认为是一个 read acquire. Acquire semantics 防止 read acquire 程序上之后的任何读或写操作与它的内存乱序.

Release semantics 是一种只能应用于如下操作的性质: 写入到共享内存, 无论是 read-modify-write 操作还是普通的存储.这一操作被认为是一个 write release. Release semantics 防止 write release 程序上之前的任何读或写操作与它的乱序.

Acqure 和 release semantics 能通过之前四种 memory barrier 的简单组合来达到.

Acqure 和 release semantics 可以基本划分为如下结构:

使用明确的平台相关 Fence 指令

在 X86/64 使用mefence指令,mfence 是一个满足全部 memory barrier,防止任何类型的内存乱序.

可移植的 C++11 的 Fences

C++11 的 atomic 库定义了一个可移植的函数atomic_thread_fence(),输入一个变量来指定什么类型的 fence.

可移植的 C++11 的 atomic,非明确的 fence

在 C++11 中,可以直接对 atomic 变量直接约束 fence,而不是显示的明确 fence.与上面明确 fence 相比,这实际是更优的方法来表达 acquire and release semantics 在 C++11 中.

Happens-before relation

Happens-before 是一个术语来描述 C++11,Java,LLVM 之类背后的软件内存模型.

在之上每个语言里都能找到* happends-before *的定义,尽管每个都有不同的说法,但内在意思基本一致.粗略地讲,基本定义如下:

A 和 B 表示一个多线程进行的操作.若 A happens-before B,那么,在 B 进行前,A 对 B 的内存影响有效的被 B 看到.

无论使用任何编程语言,它们都有一个共同处:如果操作 A 和 B 被同一个进程进行,A 的语句在 B 的语句之前在程序顺序上,那么 A 优先发生(happens-before)B.这也是在之前 Memory ordering 中谈到中心原则.

这里再次提一下指令重排序问题,有人有如下疑问: 指令重排序会破坏 happens-before 原则吗？happens-before 的程序次序原则说：在一个线程内，按照程序代码顺序，书写在前面的操作会先行发生于书写在后面的操作。如果线程内出现指令重排序，那不是破坏了程序次序原则了吗？

是会破坏程序次序的执行,但是并不破坏 happens-before 原则,并不造成内存对单线程有效性的破坏.这里主要的困惑是时间上顺序的发生之前(happening before)与先行发生(happens-before)两者关系.

时间上顺序的发生在前于(happening before)与先行发生(happens-before)两者是不一样的,基本没太大关系.特别:

A 先行发生(happens-before)B 并不意味着 A 发生在前于(happening before)B.
A 发生在前于(happening before)B 并不意味 A 先行发生(happens-before)B.

谨记 happens-before 是由一系列编程语言特定定义的操作间的关系,它的存在独立于时间的概念.

happens-before 并不意味 happening before

如下例子有 happens-before 关系但并不是顺序执行,没有 happening before.如下代码:(1) 存储到 A,之后(2)存储到 B.根据程序顺序原则,(1) happens-before (2).

int A, B;
void test() {
  A = B + 1;  // (1)
  B = 0;  // (2)
}

用 O2 打开优化编译的如下:

$ gcc -S -O2  -masm=intel test.c

	mov	eax, DWORD PTR B
	mov	DWORD PTR B, 0
	add	eax, 1
	mov	DWORD PTR A, eax

从汇编指令看出,第二句mov DWORD PTR B, 0就已经完成对B的存储,但是对A的存储还没进行.(1)顺序上并没有在(2)之前执行!

但是 happens-before 原则有被违背吗?根据定义,(1)的内存效用必须有效被看到在进行(2)之前.也就是存储 A 必须影响存储 B.

在这里,存储 A 实际并没有影响存储 B.(2)被提前执行与之后执行仍然一样,相当与 (1)的内存有效性是一样的.因此,这并不算违背 happens-before 原则.

happening before 并不意味 happens-before

这是个时间上发生于前但并含有 happens-before 关系的例子.如下的代码,想象一个线程调用UpdateValue,而另一个线程调用ConsumeValue.因为处理共享的数据并行的,为了简单,认为普通的读取和存储int是 atomic 的.因为程序顺序原则,在(1)和(2)之间 happens-before 关系,(3)和(4)之间 happens-before 关系.

int value = 0;
int updated = 0;

void UpdateValue() {
    value = 123;  // (1)
    update = 1;  // (2)
}

void ConsumeValue() {
if (update) {  // (3)
    printf("%d\n", value);  // (4)
}

进一步假设在运行开始的时候,(3)读取update到为 1,这个值是有(2)在另外个线程中存储的.这里,我们可以得出时间顺序上(2)必须发生前于(3).但是这里并没有规则意味着在(2)和(3)之间有 happens-before 关系.(2)和(3)之间没有 happens-before 关系,(1)和(4)之间也没有 happens-before 关系.因此,(1)和(4) 的内存可以重排序,因为编译器重排序或在 CPU 上内存重排序,以致(4)可以打印 “0”,即使(3)读到 1.

浅谈Mutex (Lock)

2014-06-29T20:52:09+08:00

Mutex(又叫 Lock),在多线程中,作为同步的基本类型,用来保证没有两个线程或进程同时在他们的关键区域.因为 Mutex 这种排它性,很多人认为 Mutex 开销很大,尽量避免使用它.就如这篇分析完共享数据问题后,进一步分析说明 Avoiding locks 来解决这个问题.但 Mutex 真的开销如此大,还是被大家误解了?Matthew Dillon 写道,”Most people have the misconception that locks are slow.”, Jeff Preshing 也写了这篇”Locks Aren’t Slow; Lock Contention Is”.

那么接下来做 3 个关于 Mutex 的 Benchmark,具体分析一下 Mutex 的开销如何,最后并利用原子操作和 semaphore 实现一个 lightweight Mutex.

一个 Mutex 仅仅从 Lock 到 Unlock 具体开销是多少,是不是占用很多时间,从 Always Use a Lightweight Mutex 从可以看到在 windows 中有两种 Mutex:Muetx 和 Critical Section, 重量级和轻量级的区别,两者的时间开销相差 25 倍多,所以一直使用轻量级的 Mutex.

这篇文章在高强度下 lock 的性能:每个线程做任何事情都占用 lock(高冲突),lock 占用极短的时间 (高频率).值得一读,但是在实际应用中,基本避免如此使用 locks.这里对 Mutex Contention 和 Mutex Frequency 都做最好和最坏场景的使用测试.

Mutex 被灌以避免使用也因为其他原因.现在有很多大家熟知的 lock-free programming 技术.Lock-free 编程非常具有挑战性,但在实际场景中获得巨大的性能.既然有 lock-free 的技术吸引我们使用它们,那么 locks 就显得索然无味了.

但也不能因此忽略 lock.因为在实际很多场景,它仍然是利器.

Lightweight Mutex Benchmark

Linux 下的 POSIX thread 是轻量级的 Mutex.基于 Linux 特有的 futex 技术,当没有其他线程竞争锁时它被优化过.使用如下简单的例子,测试一个单线程 lock 和 unlock,所有代码在 Github 上.

pthread_mutex_init(&lock, NULL);
const int kN = 1000000;
for (int i = 0; i < kN; ++i) {
    pthread_mutex_lock(&lock);
    pthread_mutex_unlock(&lock);
}
pthread_mutex_destroy(&lock);

插入相应的时间代码,算出 10 万次的单线程 lock/unlock 平均时间.在不同的处理器下,结果如下:

如果假设一个线程每分钟获取 1e5 次 mutex,并且没有其他线程与它竞争.基于如下的图,可预计 0.2%到 0.4%的开销.不算差.在比较低频率下,开销基本忽略不计.之后 Build own lightweight mutex,会利用 semaphore 和一个原子操作,实现一个 lightweight mutex.

POSIX thread 与 Windows Critical Section 不同,它不仅支持线程间的同步, 还支持进程间的同步.实例代码如下:

mutex_between_process.cc

pthread_mutex_t mutex;
pthread_mutexattr_t attrmutex;

/* Initialise attribute to mutex. */
pthread_mutexattr_init(&attrmutex);
pthread_mutexattr_setpshared(&attrmutex, PTHREAD_PROCESS_SHARED);
pthread_mutex_init(&mutex, &attrmutex);

/* Use the mutex. */

/* Clean up. */
pthread_mutex_destroy(pmutex);
pthread_mutexattr_destroy(&attrmutex);

Mutex Contention Benchmark

在测试中,产生一个不断生成随机数的线程,使用自己编制的线程安全的 Mersenne Twister 实现代码.每过一段时间,它获取和释放一个锁,获取和释放锁之间的时间每次是随机的,但是总的平均时间是提前设计好的.这个随机的过程就是个泊松分布过程,计算出产生一个随机数的平均时间 6.25 ns 在 2.93 GHz i7 上,把它作为运行单位.利用 Poisson Process 的算法决定运行多少个运行单位在获取和释放锁之间.并利用 High Resolution TimeAPI 计算时间.这个线程的代码如下,所有代码在 Github 上:

  GetMonotonicTime(&start);
  for (;;) {
    work_units = static_cast<int> (random.PoissonInterval(
        global_state.average_unlock_count) + 0.5f);
    for (int i = 0; i < work_units; ++i) {
      random.Integer();
    }
    thread_stats.workdone += work_units;

    GetMonotonicTime(&end);
    elapsed_time = GetElapsedTime(&start, &end);
    if (elapsed_time >= global_state.time_limit) {
      break;
    }

    // Do some work while holding the lock
    pthread_mutex_lock(&global_state.thread_mutex);
    work_units = static_cast<int> (random.PoissonInterval(
        global_state.average_locked_count) + 0.5f);
    for (int i = 0; i < work_units; ++i) {
      random.Integer();
    }
    thread_stats.workdone += work_units;
    pthread_mutex_unlock(&global_state.thread_mutex);

    thread_stats.iterations++;
    GetMonotonicTime(&end);
    elapsed_time = GetElapsedTime(&start, &end);
    if (elapsed_time >= global_state.time_limit) {
      break;
    }
  }

这里模拟获取和释放 15000 次锁每秒,从 1 个线程运行到 2 个线程,最后到 4 个线程.并且验证占用锁的时间,从 0%到 100%的每次运行时间占用锁.把 1 个线程的完成的工作量作为基准数据,其他的去除以它,计算相对增益.基本测试方案如下:

// Test 15000 locks per second: thread number, lock_interval
    1, 1/15000.0f,
    2, 1/15000.0f,
    3, 1/15000.0f,
    4, 1/15000.0f,

从图中看出,随着锁占用的时间增加,并行性越来越差,直到最后占用 60%以后,单线程运行的更好.可以说,短时间的占用锁的时间,以 10%以内,系统达到很高的并行性.虽然并不是完美的,但是也接近.锁总体很快.

把这个结果放到实际中,Jeff Preshing 在这篇提到,实际的游戏程序中,15000 的锁每秒来自 3 个线程,占用锁的时间相对 2%.在图中很适中的区域.

Mutex Frequency Benchmark

尽管一个 lightweight mutex 有开销,但如上测试在 2.40GHz i5 上,lock/unlock 锁开销约 34.2ns ,因此 15000 锁每秒开销很低以致不是严重影响结果.那么把锁的每秒频率提高呢?

只创建 2 个线程,进行一系列的锁的每秒频率测试在 2.40GHz i5 上,从占用锁时间 10 ns(1e8/s)到 100 us(1e4/s),用单线程的占用锁时间 10 ms 作为基准工作量,其他与它比较,测试方案如下:

  // Reference
  1, 10e-3f,      // 10 ms        100/s

    // Test various lock rates with 2 threads
    2, 10e-9f,      // 10 ns        100000000/s
    2, 31.6e-9f,    // 31.6 ns      31600000/s
    2, 100e-9f,     // 100 ns       10000000/s
    2, 316e-9f,     // 316 ns       3160000/s
    2, 1e-6f,       // 1 us         1000000/s
    2, 3.16e-6f,    // 3.16 us      316000/s
    2, 10e-6f,      // 10 us        100000/s
    2, 31.6e-6f,    // 31.6 us      31600/s
    2, 100e-6f,     // 100 us       10000/s

如预想一样,对于非常高频率的锁,锁的开销开始减少实际工作量.在网络上,可以找到很多同样的测试.图中下边的线条,对于这样高的频率,也就是占用锁的时间很短,就一些 CPU 的指令,这样的情况下,当锁之间的工作如此简单,那么一个 lock-free 的实现更适合.

我们获得了一大块关于锁的性能:从它进行很好的情况,到缓慢应用的情况.在考虑实际锁的使用情况,不能说所有锁都是慢的.必须承认,很容易乱用锁,但不用太担心,任何的瓶颈问题都会在细心的 profiling 中发现.当你考虑锁是如何的稳定, 相对容易的理解它们(与 lock-free 技术相比),锁有时候其实很好用.

Build own lightweight mutex

我们也可以实现自己的简单轻量级的 mutex,但仅仅作为教育手段,理解 mutex 一些内在实现细节,实际现在操作系统都提供轻量级的 mutex,千万不要自己实现一个并实际使用,直接只用操作系统提供的即可.

网络上有很多种方法在用户层写自己的 mutex:

roll-your-own-lightweight-mutex 利用 Windows 提供的 semaphore 和 atomic 操作实现的 mutex.
Review of many Mutex implementations 很长的一篇文章,总结了很多种 mutex 的实现细节.

这里利用 Benaphore 技术,在 Linux 平台上利用 semaphore 和 atomic 操作实现自己的 C++版本的 lightweight mutex.这里并没有用 C++11 的原子库.所有代码在 Github 上.

 #include <semaphore.h>
class Benaphore {
 public:
  Benaphore() : counter_(0) {
    sem_init(&semaphore_, 0, 0);
  }
  ~Benaphore() {
    sem_destroy(&semaphore_);
  }
  void Lock() {
    if (__sync_add_and_fetch(&counter_, 1) > 1) {
      sem_wait(&semaphore_);
    }
  }
  void Unlock() {
    if (__sync_sub_and_fetch(&counter_, 1) > 0) {
      sem_post(&semaphore_);
    }
  }
  bool TryLock() {
    return __sync_bool_compare_and_swap(&counter_, 0, 1);
  }

 private:
  long counter_;
  sem_t semaphore_;
};

__sync_add_and_fetch 是一个由 GCC 内部提供的 atomic read-modify-write (RMW) 操作,它把 1 加到某个数并且返回新的数,在同一时间所有操作由一个线程原子操作完成,其他线程不能干涉,只能在后等待.这里counter_初始化为 0,第一个线程调用Lock将得到 1 从__sync_add_and_fetch,然后跳过sem_wait,一旦这个线程占用这个锁, 之后线程都将递增counter_,获得大于 1 的数,从而调用sem_wait等待.

之后,第一个线程完成自己的操作,调用Unlock,__sync_sub_and_fetch的返回值大于 1 说明有其他线程在等待这个 mutex,调用sem_post唤醒其他线程.

底层分析与性能

上面使用了__sync_add_and_fetch,它编译成lock xadd指令如下.在没有竞争下的 lock/unlock 操作性能与 pthread mutex 相当.但是在 mutex 多线程竞争情况下,这个 mutex 性能没有 pthread mutex 好.

增强 Mutex 支持递归

上面简单的 lightweight mutex 的局限性是它不能递归.也就是同一个线程试图获取同样的锁两次以上,将造成死锁(deadlock).递归锁在函数调用自己时很有用.比如在内存管理代码中,可能会遇到如下代码:

Realloc(void* ptr, size_t size)
{
    LOCK;

    if (ptr == NULL)
    {
        return Alloc(size);
    }
    else if (size == 0)
    {
        Free(size);
        return NULL;
    }
    else
        ...
}

Alloc(size_t size)
{
    LOCK;

    ...
}

Lock是个封装好的 C++宏,用来获取锁和自动结果当退出函数.

可以看到,当传递NULL给Realloc,锁被Realloc函数获取,然后第二次被获取当Alloc被调用.

把它扩展成可递归的锁如下,加入 2 个新成员变量,owner_,存储当前占有线程的 ID(TID),和recursion_,存储递归的层数.基本代码如下:

 #include <semaphore.h>
 #include <pthread.h>
 #define LIGHT_ASSERT(x) { if (!(x)) __builtin_trap(); }

class RecursiveBenaphore {
 public:
  RecursiveBenaphore() : counter_(0), owner_(0), recursion_(0) {
    sem_init(&semaphore_, 0, 0);
  }
  ~RecursiveBenaphore() {
    sem_destroy(&semaphore_);
  }
  void Lock() {
    pthread_t thread_id = pthread_self();
    if (__sync_add_and_fetch(&counter_, 1) > 1) {
      if (!pthread_equal(thread_id, owner_)) {
        sem_wait(&semaphore_);
      }
    }
    owner_ = thread_id;
    recursion_++;
  }
  void Unlock() {
    pthread_t thread_id = pthread_self();
    LIGHT_ASSERT(pthread_equal(thread_id, owner_));
    long recur = --recursion_;
    if (recur == 0) {
      owner_ = 0;
    }
    long result = __sync_sub_and_fetch(&counter_, 1);
    if (result > 0) {
      if (recur == 0) {
        int sem_value;
        sem_getvalue(&semaphore_, &sem_value);
        if (sem_value == 0) {
          sem_post(&semaphore_);
        }
      }
    }
  }
  bool TryLock() {
    pthread_t thread_id = pthread_self();
    if (pthread_equal(thread_id, owner_)) {
      __sync_add_and_fetch(&counter_, 1);
    } else {
      bool result = __sync_bool_compare_and_swap(&counter_, 0, 1);
      if (result == false) {
        return false;
      }
      owner_ = thread_id;
    }
    recursion_++;
    return true;
  }

 private:
  long counter_;
  sem_t semaphore_;
  pthread_t owner_;
  long recursion_;
};

如之前一样,第一个线程调用Lock,设置owner_为自己的 TID,增加 recursion_到 1.如果同一个线程再次调用Lock,它将同时增加 recursion_和counter_.

之后,第一个线程完成自己的操作,调用Unlock,同时减少recursion_和counter_, 仅仅调用sem_post唤醒其他线程当recursion_减少到0.如果 recursion_仍然大于 0,意味着当前的线程仍然占有此锁在外层程序.

最后进行压力测试,建立一些线程,每个随机获取锁,随机的递归层次.代码在 Github 上.

一些细节问题: * 在Unlock中,设置owner_为 0 在调用__sync_sub_and_fetch之前,否则可能发生死锁(deadlock).比如,有两个线程 TID 是 111 和 222. 1. 线程 111 完成操作调用Unlock,先调用__sync_sub_and_fetch把counter_减到 0 2. 在设置owner_为 0 被中断,线程 222 得到运行,它调用Lock,发现counter_为 0,跳过sem_wait,设置owner_=222,完成Lock操作. 3. 线程 222 被中断调出,线程 111 重新得到运行,设置owner_为 0,然后完成Unlock操作. 4. 因为此时owner_为 0,线程 222 不能在递归占用锁,一旦它再次获取锁,形成死锁.

在Unlock中,recursion_被拷贝到本地变量一次,之后只本地变量,比如没有在__sync_sub_and_fetch之后重新读取她.因为在那之后它能被其他线程已经改变.
recursion_和owner_没有原子操作.因为它们在调用Lock的 __sync_add_and_fetch和调用Unlock的__sync_sub_and_fetch之间,线程占有锁,独占recursion_和owner_的读写操作,并拥有所有的 acquire and release semantics.对recursion_和owner_使用原子操作没必要.因为在 X86/84 的平台上,__sync_add_and_fetch生成lock xadd的指令,保证全部的 memory barrier,也就保证 acquire and release semantics.

Mutex VS Spinlock

提到 Mutex,往往会提到 Spinlock,因为在使用 Lock 时,会遇到如何在 Mutex 与 Spinlock 之间选择.那么接下来对比一下两者.

定义

Mutex: 如果一个线程试图获取一个 mutex,但是没有成功,因为 mutex 已经被占用, 它将进入睡眠,让其他进程运行,直到 mutex 被其他进程释放.

Spinlock: 如果一个线程试图获取一个 Spinlock, 但是没有成功,它将持续试着去获取它,直到它最终成功获取,因为它将不允许其他线程运行(然而,操作系统将强制调度其他线程).

各自缺点

Mutex: Mutex 将使得线程睡眠,然后再唤醒它们,两者都是开销比较大的操作,也就是 context switch 的开销.如果锁只是被其他线程占用非常短的时间,那么时间花在使的线程睡眠并唤醒它可能超过它使用 spinlock 持续获取锁的时间.

Spinlock: Spinlock 持续获取锁,浪费很多 CPU 时间,如果锁被其他线程占用很长时间,那么它将浪费很多时间,不如使得线程进入睡眠,让出 CPU.Spinlock 的确能优化 context switches 但会在没有 threads priority inversion 的平台上产生副作用.(但一个高优先级的线程自旋一个锁来等待一个低优先级的线程释放这个锁,就会造成死锁).在没有 Preemption 的 Uniprocessor,使用 spinlock 是没有意义的,当前只有一个线程运行,没有必要保护关键区域,也没有其他线程同时运行,释放锁给它.

所以在 Linux 下,Spinlock 在 kernel 这样实现:

没有打开CONFIG_SMP和CONFIG_PREEMPT,spinlock 实现代码是空的.
没有打开CONFIG_SMP,打开CONFIG_PREEMPT,spinlock 仅仅是简单的关闭 preemption,足够来防止任何的 races.
打开CONFIG_SMP,打开CONFIG_PREEMPT,spinlock 实现如下代码,不断检查 lock 是否被其他线程释放:

  extern inline void spin_lock(spinlock_t *plock)
  {
    __asm__ __volatile__(
        spin_lock_string
        :"=m" (__dummy_lock(plock)));
  }
  // Macro spin_lock_string expand
  extern inline void spin_lock(spinlock_t *plock)
 {
  1:
    lock ; btsl ,plock;
    jc 2f;
    .section .text.lock,"ax"
  2:
    testb ,plock;
    rep;nop;
    jne 2b;
    jmp 1b;
    .previous
 }

总结

Criteria	Muutex	Spinlock
机制	尝试获取锁.若可得到就占有.若不能,就进入睡眠等待.	尝试获取锁.若可得到就占有.若不能,持续尝试直到获取.
什么时候使用	当线程进入睡眠没有伤害.或需要等待一段足够长的时间才能获取锁.	当线程不应该进入睡眠如中断处理等.当只需等待非常短的时间就能获取锁.
缺点	引起 context switch 和 scheduling 开销.	线程不做任何事情在获取到锁前.浪费 CPU 运行.

大多数操作系统(包括 Solaris,Mac OS X 和 FreeBSD)使用混合的机制叫”adaptive mutex”或”hybrid mutex”.一个 hybrid mutex 首先行为和 spinlock 一样,如果不能获取锁,持续尝试获取,但过了一定的时间,它就和 mutex 一样,让线程进入睡眠.¹.

http://stackoverflow.com/questions/5869825/when-should-one-use-a-spinlock-instead-of-mutex↩

浅谈Memory Reordering

2014-06-28T22:55:22+08:00

Memory ordering

在我们编写的 C/C++代码和它被在 CPU 上运行,按照一些规则,代码的内存交互会被乱序.内存乱序同时由编译器(编译时候)和处理器(运行时)造成,都为了使代码运行的更快.

被编译开发者和处理器制造商遵循的中心内存排序准则是:

不能改变单线程程序的行为.

因为这条规则,在写单线程代码时内存乱序被普遍忽略.即使在多线程程序中,它也被时常忽略,因为有 mutexes,semaphores 等来防止它们调用中的内存乱序.仅当 lock-free 技术被使用时,内存在不受任何互斥保护下被多个线程共享,内存乱序的影响能被看到.

下面先比较 Weak 和 Strong 的内存模型,然后分两部分,实际内存乱序如何在编译和运行时发生,并如何防止它们.

Weak VS strong Memory Models

Jeff Preshing 在 Weak vs. Strong Memory Models 中很好的总结了从 Weak 到 Strong 的类型:

非常弱	数据依赖性的弱	强制	顺序一致
DEC Alpha	ARM	X86/64	dual 386
C/C++11 low-level atomics	PowerPC	SPARC TSO	Java volatile/C/C++11 atomics

弱内存模型

在最弱的内存模型中,可能经历所有四种内存乱序 (LoadLoad, StoreStore, LoadStore and StoreLoad).任何 load 或 store 的操作能与任何的其他的 load 或 store 操作乱序,只要它不改变一个独立进程的行为.实际中,这样的乱序由于编译器引起的指令乱序或处理器本身处理指令的乱序.

当处理器是弱硬件内存模式,通常称它为 weakly-ordered 或 weak ordering.或说它有 relaxed memory model. DEC Alpha 是最具代表的弱排序的处理器.

C/C++的底层原子操作也呈现弱内存模型,无论代码的平台是如 x86/64 的强序处理器.下面章节 Memory ordering at compile time 会演示其弱内存模型,并说明如何强制内存顺序来保护编译器乱序.

数据依赖性的弱

ARM 和 PowerPC 系列的处理器内存模型和 Alpha 同样弱,除了它们保持 data dependency ordering.它意味两个相依赖的load(load A, load B<-A)被保证顺序load B<-A总能在 load A之后.(A data dependency barrier is a partial ordering on interdependent loads only; it is not required to have any effect on stores, independent loads or overlapping loads.)

强内存模型

弱和强内存模型区别存在分歧.Preshing 总结的定义是:

一个强硬件内存模型是在这样的硬件上每条机器指令隐性的保证 acquire and release
semantics 的执行.因此,当一个 CPU 核进行了一串写操作,每个其他的 CPU 核看到这些值的改变顺序与其顺序一致.

所以也就是保证了四种内存乱序 (LoadLoad, StoreStore, LoadStore and StoreLoad) 中的 3 种,除了不保证 StoreLoad 的顺序.基于以上的定义,x86/64 系列处理器基本就是强顺序的.之后 Memory ordering at processor time 可以看到 StoreLoad 在 X86/64 的乱序实验.

顺序一致

在顺序一致 (Sequential consistency) 的内存模型中,没有内存乱序存在.

如今,很难找到一个现代多核设备保证在硬件层 Sequential consistency.也就早期的 386 没有强大到能在运行时进行任何内存的乱序.

当用上层语言编程时,Sequential consistency 成为一个重要的软件内存模型.Java5 和之后版本,用volatile声明共享变量.在 C+11 中,可以使用默认的顺序约束memory_order_seq_cst在做原子操作时.当使用这些术语后,编译器会限制编译乱序和插入特定 CPU 的指令来指定合适的 memory barrier 类型.

Memory ordering at compile time

看如下代码:

test.c

int A, B;
void test() {
  A = B + 1;
  B = 0;
}

不打开编译器的优化,把它编译成汇编,我们可以看到,B的赋值在A的后面,和原程序的顺序一样.

$ gcc -S -masm=intel test.c

	mov	eax, DWORD PTR B
	add	eax, 1
	mov	DWORD PTR A, eax
	mov	DWORD PTR B, 0

用O2打开优化:

$ gcc -S -O2  -masm=intel test.c

	mov	eax, DWORD PTR B
	mov	DWORD PTR B, 0
	add	eax, 1
	mov	DWORD PTR A, eax

这次编译器把B的赋值提到A的前面.为什么它可以这么做呢?内存顺序的中心没有破坏.这样的改变并不影响单线程程序,单线程程序不能知道这样的区别.

但是当编写 lock-free 代码时,这样的编译器乱序就会引起问题.看如下例子,一个共享的标识来表明其他共享数据是否更新:

int value;
int updated = 0;
void UpdateValue(int x) {
    value = x;
    update = 1;
}

如果编译器把update的赋值提到value赋值的前面.即使在单核处理器系统中,会有问题:在两个参数赋值的中间这个线程被中断,使得另外的程序通过update判断以为value的值已经得到更新,实际上却没有.

显性的 Compiler Barriers

一种方法是用一个特殊的被称为 Compiler Barrier 的指令来防止编译器优化的乱序.以下 asm volative 是 GCC 中的方法.

test_barrier.c

int A, B;
void test() {
  A = B + 1;
  asm volatile("" ::: "memory");
  B = 0;
}

经过这样的修改,打开优化,B的存储将保持在要求的顺序上.

$ gcc -S -O2  -masm=intel test.c

	mov	eax, DWORD PTR B
	add	eax, 1
	mov	DWORD PTR A, eax
	mov	DWORD PTR B, 0

隐性的 Compiler Barriers

在 C++11 中原子库中,每个不是 relaxed 的原子操作同时是一个 compiler barrier.

int value;
std::atomic<int> updated(0);
void UpdateValue(int x) {
    value = x;
    // reordering is prevented here
    update.store(1, std::memory_order_release);
}

每一个拥有 compiler barrier 的函数本身也是一个 compiler barrier,即使它是 inline 的.

int a;
int b;
void DoSomething() {
    a = 1;
    UpdateValue(1);
    b = a + 1;
}

进一步推知,大多数被调用的函数是一个 compiler barrier.无论它们是否包含 memory barrier.排除 inline 函数,被声明为pure attribution 或当 link-time code generation 使用时.因为编译器在编译时,并不知道UpdateValue的运行是否依赖于a或会改变a的值从而影响b,所以编译器不会乱序它们之间的顺序.

可以看到,有许多隐藏的规则禁止编译指令的乱序,也防止了编译器多进一步的代码优化,所以在某些场景 Why the “volatile” type class should not be used, 来让编译器进一步优化.

无缘由的存储

有隐形的 Compiler Barriers,同样 GCC 编译器也有无缘由的存储.来自这里的实例:

extern int v;

    void
    f(int set_v)
    {
      if (set_v)
        v = 1;
    }

在 i686,GCC 3.3.4–4.3.0 用O1编译得到:

        pushl   %ebp
        movl    %esp, %ebp
        cmpl    $0, 8(%ebp)
        movl    $1, %eax
        cmove   v, %eax        ; load (maybe)
        movl    %eax, v        ; store (always)
        popl    %ebp
        ret

在单线程中,没有问题,但多线程中调用f(0)仅仅只是读取 v 的值,但中断后回去覆盖其他线程修改的值.引起 data rate.在新的 C++11 标准中明确禁止了这样的行为,看最近 C+11 标准进行的 draft§1.10.22 节:

Compiler transformations that introduce assignments to a potentially shared memory location that would not be modified by the abstract machine are generally precluded by this standard.

Memory ordering at processor time

看一个简单的 CPU 乱序的简单例子,即使在强内存模型的 X86/64 也能看到.有两个整数X和Y初始是 0,另外两个变量 r1 和 r2 读取它们的值,两个线程并行运行,执行如下的机器代码:

每个线程存储 1 到一个共享变量,然后把对方变量读取到一个变量或一个寄存器中.无论哪个线程先写 1 到内存,另外个线程读回那个值,意味着最后 r1=1 或 r2=1 或两者都是.但是 X86/64 是强内存模型,它还是允许乱序机器指令.特别,每个线程允许延迟存储到读回之后.以致最后 r1 和 r2 能同时等于 0–违反直觉的一个结果.因为指令可能如下顺序执行:

写一个实例程序,实际看一下 CPU 的确乱序了指令.源码可以 Github 下载.两个读写的线程代码如下:

sem_t begin_sem1;
sem_t begin_sem2;
sem_t end_sem;

int X, Y;
int r1, r2;

void *ThreadFunc1(void *param) {
  MersenneTwister random(1);
  for (;;) {
    sem_wait(&begin_sem1);
    // random delay
    while (random.Integer() % 8 != 0) {
    }
    X = 1;
    asm volatile("" ::: "memory");  // prevent compiler ordering
    r1 = Y;
    sem_post(&end_sem);
  }
  return NULL;
}

void *ThreadFunc2(void *param) {
  MersenneTwister random(2);
  for (;;) {
    sem_wait(&begin_sem2);
    // random delay
    while (random.Integer() % 8 != 0) {
    }
    Y = 1;
    asm volatile("" ::: "memory");  // prevent compiler ordering
    r2 = X;
    sem_post(&end_sem);
  }
  return NULL;
}

随机的延迟被插入在存储的开始处,为了交错线程的开始时间,以来达到重叠两个线程的指令的目的.随机延迟使用线程安全的MersenneTwister类.汇编代码asm volatile("" ::: "memory");如上节所述只是用来防止编译器的乱序, 因为这里是要看 CPU 的乱序,排除编译器的乱序影响.

主线程如下,利用 POSIX 的 semaphore 同步它与两个子线程的同步.先让两个子线程等待,直到主线程初始化X=0和 Y=0.然后主线程等待,直到两个子线程完成操作,然后主线程检查r1和r2的值.所以 semaphore 防止线程见的不同步引起的内存乱序,主线程代码如下:

int main(int argc, char *argv[]) {
  sem_init(&begin_sem1, 0, 0);
  sem_init(&begin_sem2, 0, 0);
  sem_init(&end_sem, 0, 0);

  pthread_t thread[2];
  pthread_create(&thread[0], NULL, ThreadFunc1, NULL);
  pthread_create(&thread[1], NULL, ThreadFunc2, NULL);

  int detected = 0;
  for (int i = 1; ; ++i) {
    X = 0;
    Y = 0;
    sem_post(&begin_sem1);
    sem_post(&begin_sem2);
    sem_wait(&end_sem);
    sem_wait(&end_sem);
    if (r1 == 0 && r2 == 0) {
      detected++;
      printf("%d reorders detected after %d iterations\n", detected, i);
    }
  }
  return 0;
}

在 Intel i5-2435M X64 的 ubuntu 下运行一下程序:

reorders detected after 2181 iterations
reorders detected after 4575 iterations
reorders detected after 7689 iterations
reorders detected after 22215 iterations
reorders detected after 60023 iterations
reorders detected after 60499 iterations
reorders detected after 61639 iterations
reorders detected after 62243 iterations
reorders detected after 67998 iterations
reorders detected after 68098 iterations
reorders detected after 71179 iterations
reorders detected after 71668 iterations
reorders detected after 72417 iterations
reorders detected after 73970 iterations
reorders detected after 78227 iterations
reorders detected after 81897 iterations
reorders detected after 82722 iterations
reorders detected after 85377 iterations
...

差不多每 4000 次的迭代才发现一次 CPU 内存乱序.所以多线程的 bug 是多么难发现.那么如何消除这些乱序.至少有如下两种方法:

让两个子线程在同一个 CPU 核下运行.(没有可移植性方法,如下是 linux 平台的).
使用 CPU 的 memory barrier 防止它的乱序.

Lock to one processor

让两个子线程在同一个 CPU 核下运行,代码如下:

  cpu_set_t cpus;
  CPU_ZERO(&cpus);
  CPU_SET(0, &cpus);
  pthread_setaffinity_np(thread[0], sizeof(cpu_set_t), &cpus);
  pthread_setaffinity_np(thread[1], sizeof(cpu_set_t), &cpus);

Place a memory barrier

防止一个 Store 在 Load 之后的乱序,需要一个 StoreLoad 的 barrier.这里使用 mfence的一个全部 memory barrier,防止任何类型的内存乱序.代码如下:

void *ThreadFunc1(void *param) {
  MersenneTwister random(1);
  for (;;) {
    sem_wait(&begin_sem1);
    // random delay
    while (random.Integer() % 8 != 0) {
    }
    X = 1;
    asm volatile("mfence" ::: "memory");  // prevent CPU ordering
    r1 = Y;
    sem_post(&end_sem);
  }
  return NULL;
  }

Summarization

有两种内存乱序存在:编译器乱序和 CPU 乱序.
如何防止编译器乱序.
如何防止 CPU 乱序.

High Resolution Time

2014-06-24T20:19:36+08:00

在不同的平台有繁多的 Time API，如何选用精准的高精度 Time 函数来做 performance benchmarking 呢？

Wall-clock time VS CPU time

先理解一些时间的概念。明白不同时间 API 测量的是什么时间。

Wall-clock time,顾名思义，墙上的钟，代表一个任务从开始到完成所经历的时间。它包含 3 部分：CPU 的时间，I/O 的时间和通信延迟的时间。但 wall-clock 很少是正确的时钟来使用，因为它随着时区，和 daylightsaving 改变，或与 NTP 同步。而这些特性没有一个是有益的，如果你用它来调度任务或做 performance benchmarking。它仅仅如名字所言，墙上的一个时钟。

CPU time 仅仅统计一个任务从开始到完成在 CPU 上所花的时间。CPU time 主要包括 User time（在 user space 所花时间）和 System time（在 kernel space 所花时间）。

以并行程序为例，CPU time 就是所有 CPU 在这个程序所花的时间总和， Wall-clock time 在这种情况可能时间相对短，它只统计任务开始到结束所花时间。

不同时钟 API 对比¹

对于不同的时钟 API,主要分析如下特性：

API 测试的是什么时间？（real, user, system，CPU or wall-clock)
API 的精度？(s, ms, µs, or faster?)
多久时间这个时钟数字会返转？或有什么策略避免它？
时钟是 monotonic 的，还是它会随着系统时间改变（比如 NTP，time zone， daylight saving time, by the user, etc)?

Linux 和 OS X 的主要时钟 API：

time()返回系统的 wall-clock，精度到秒。
clock()返回 user 和 systime 总共的时间.现在标准要求CLOCKS_PER_SEC是1000000,使精度最多达到 1µs.clock_t类型平台相关(The range and precision of times representable in clock_t and time_t are implementation-defined.) 它 wrap around 一旦达到最大值.(通常是 32 位的类型,那么~2^32 ticks 后,还是比较长的时间.)
clock_gettime(CLOCK_MONOTONIC,..) 提供纳秒级的精确度并且是单调的.它的秒和纳秒是分开存储的,所以,任何的 wrap around 将很多年才发生一次.它是个不错的时钟,但 OS X 平台上没有.
getrusage 返回独立的 user 和 system 时间,并且不会 wrap around.精确达到 1 µs,
gettimeofday 返回一个 wall-clock 时间并达到µs 精度.但是精度不能保证,因为依赖于硬件.
mach_absolute_time() 是 OS X 平台的高精度(ns)计时的一个选择.ns 以 64 位 unsigned integer 存储,实际使用 wrap around 不是大问题,移植性是问题.

Window 的高精度时钟：

QueryPerformanceFrequency() 和 QueryPerformanceCounter(). QueryPerformanceFrequency() 返回计数的频率,QueryPerformanceCounter()返回当前计数值.和 Linux 中 CLOCK_MONOTONIC 一样,它是一个稳定并单调递增计数器,精准达到纳秒级,并且不会 wrap around.

更多参考:

不同平台 High Resolution Time

Linux

使用 clock_gettime(CLOCK_MONOTONIC,..) 作为 High Resolution Time,编译需加上参数-lrt,实例代码如下:

clock_gettime.c

#include 
#include 

void GetMonotonicTime(struct timespec *ts) {
  clock_gettime(CLOCK_MONOTONIC, ts);
}

double GetElapsedTime(struct timespec *before, struct timespec *after) {
  double delta_s = after->tv_sec - before->tv_sec;
  double delta_ns = after->tv_nsec - before->tv_nsec;
  return delta_s * 1e9 + delta_ns;
}

int main(int argc, char *argv[]) {
  struct timespec before, after;
  GetMonotonicTime(&before);
  double sum = 0.0;
  unsigned int i;
  for (i = 1; i < 100; ++i) {
    sum += 1.0 / i;
  }
  GetMonotonicTime(&after);
  printf("the elapsed time=%e ns\n", GetElapsedTime(&before, &after));
  return 0;
}

除了clock_gettime()高精度时钟外,还有相对应的高精度的睡眠函数 clock_nanosleep, 实例代码如下:

clock_nanosleep.c

#include 

int main(int argc, char *argv[])
{
  struct timespec sleep_time;
  sleep_time.tv_sec = 0;
  sleep_time.tv_nsec = 100;
  clock_nanosleep(CLOCK_REALTIME, 0, &sleep_time, NULL);
  return 0;
}

OS X

使用`clock_get_time`

clock_get_time.c

#include 
#include 
#include 
#include 

void GetMonotonicTime(struct timespec *ts) {
  clock_serv_t cclock;
  mach_timespec_t mts;
  host_get_clock_service(mach_host_self(), SYSTEM_CLOCK, &cclock);
  clock_get_time(cclock, &mts);
  mach_port_deallocate(mach_task_self(), cclock);
  ts->tv_sec = mts.tv_sec;
  ts->tv_nsec = mts.tv_nsec;
}

double GetElapsedTime(struct timespec *before, struct timespec *after) {
  double delta_s = after->tv_sec - before->tv_sec;
  double delta_ns = after->tv_nsec - before->tv_nsec;
  return delta_s * 1e9 + delta_ns;
}

int main(int argc, char *argv[]) {
  struct timespec before, after;
  GetMonotonicTime(&before);
  double sum = 0.0;
  unsigned int i;
  for (i = 1; i < 100; ++i) {
    sum += 1.0 / i;
  }
  GetMonotonicTime(&after);
  printf("the elapsed time=%e ns\n", GetElapsedTime(&before, &after));
  return 0;
}

使用`mach_absolute_time`

mach_absolute_time.c

int main(int argc, char *argv[]) {
    uint64_t        start;
    uint64_t        end;
    uint64_t        elapsed;
    Nanoseconds     elapsedNano;
    start = mach_absolute_time();
    double sum = 0.0;
    unsigned int i;
    for (i = 1; i < 100; ++i) {
        sum += 1.0 / i;
    }
    end = mach_absolute_time();
    elapsed = end - start;
    // Convert to nanoseconds
    elapsedNano = AbsoluteToNanoseconds( *(AbsoluteTime *) &elapsed );
}

Windows

query_performance.cc

#include 
#include  
using namespace std;

int main()
{
    LARGE_INTEGER frequency;
    LARGE_INTEGER start, end;
    double elapsedTime;

    // get ticks per second
    QueryPerformanceFrequency(&frequency);

    QueryPerformanceCounter(&start);

    //do someting
    double sum = 0.0;
    unsigned int i;
    for (i = 1; i < 100; ++i) {
        sum += 1.0 / i;
    }

    QueryPerformanceCounter(&end);

    // compute and print the elapsed time in millisec
    elapsedTime = (end.QuadPart - start.QuadPart) * 1000.0 / frequency.QuadPart;
    cout << elapsedTime << " ms.\n";
    return 0;
}

http://stackoverflow.com/questions/12392278/measure-time-in-linux-getrusage-vs-clock-gettime-vs-clock-vs-gettimeofday↩

Double-Checked Locking Works in C++11

2014-06-22T14:07:01+08:00

在浅谈设计模式六: 单例模式(Singleton) 中提到 double-checked locking pattern(DCLP)来实现 Singleton 设计模式，但是在 C++11 之前，没有安全方法在可移植的 C++中去实现它．具体原因可见单例模式(Singleton) 或 Scott Meyers 和 Andrei Alexandrescu 发布的原文 “C++ and the Perils of Double-Checked Locking” ．

C++11 引入了新的内存模型和线程库，使得能在 C++中实现可移植的 DCLP．本文说明如何实现它．

什么是 Double-Checked Locking

在单例模式(Singleton) 很好的介绍什么是 DCLP,这里稍作回顾.

线程安全的方式实现 Signleton 模式如下:

singleton.cc

Singleton* Singleton::instance() {
  Lock lock;    // acquire lock (params omitted for simplicity)
  if(pInstance == NULL) {
    pInstance = new Singleton();
  }
  return pInstance;
  }  // release lock (via Lock destructor)

每次获取 Singleton 都要获取一个锁，但是实际上，我们只有当初始化 pInstance 时才需要一个锁。也就是只发生在第一次调用 instance 时。如果在一个程序运行时， instance 被调用了 n 次，我们只需要锁在第一次调用时。当我们知道那 n-1 次锁是没必要的.

DCLP 的关键点是发现，大多数 instance 的调用将看到 pInstance 是非空的，因此根本没必要去尝试初始化它。因此，DCLP 判断 pInstance 是否为空在尝试获取锁前。只有当判断成功（ pInstance 还没有被初始化）才去获取锁，然后之后这个判断在此进行一次确保 pInstance 是仍然空的。（所以名字叫双重检查锁）。第二个检查是有必要的，因为从上可以看到，另外的线程可能碰巧初始化了 pInstance 在 pInstance 被第一次判断和获取锁之间。

singleton-dclp.cc

Singleton* Singleton::instance() {
  Singleton *tmp = pInstance;
  ...  // need memory barrier
  if(tmp == 0) { // 1st test
  Lock lock;
  tmp = pInstance;
  if(tmp == 0) { // 2nd test
    tmp  = new Singleton;
  ...  // need memory barrier
    pInstance = tmp;
  }
  }
return pInstance;
}

单例模式(Singleton) 说明了各种不安全实现的缺陷,主要原因是 1) 编译器的乱序编译和 2) CPU 的乱序执行指令.所以安全的实现依靠 memory barrier,防止它们的乱序,使得在多线程中得到同步,C++11 之前没有可移植的 C/C++函数,但现在,C++11 有了.

使用 C++11 的 Acqure 和 Release Fence

使用 Acqure 和 Release Fence 来实现它,并且保证对实例pInstance进行原子操作,把它定义为atomic类型,并用memory_order_relaxed操作.(Relaxed ordering: there are no synchronization or ordering constraints, only atomicity is required of this operation.)如下实现代码.

std::atomic<Singleton *> Singleton::m_pInstance;
std::mutex Singleton::m_mutex;

Singleton* Singleton::instance() {
  Singleton *tmp = m_pInstance.load(std::memory_order_relaxed);
  std::atomic_thread_fence(std::memory_order_acquire);
  if(tmp == nullptr) {
    std::lock_guard<std::mutex> lock(m_mutex);
    tmp = m_pInstance.load(std::memory_order_relaxed);
    if(tmp == nullptr) {
      tmp  = new Singleton;
      std::atomic_thread_fence(std::memory_order_release);
      m_pInstance.store(tmp, std::memory_order_relaxed);
    }
  }
  return m_pInstance;
}

在多核系统中,这整个代码也是稳健的,因为 memory fences 在多个线程间建立了同步的关系.Singleton::m_pInstance作为 guard variable,singleton 变量自身成为 payload.

如果没有这层同步关系的话,就不能保证第一个线程的所有写操作(这里就是 singleton 实力的创建)被第二个线程读取到,即使m_pInstance已经被第二个线程能看到.

使用 C++11 的底层的内存顺序约束在 C++11 中也可以在单元操作时附加底层的内存顺序约束来达到同样的目的.一个

write-release 能同步于一个 read-release.

memory_order_acquire: A load operation with this memory order performs the acquire operation on the affected memory location: prior writes made to other memory locations by the thread that did the release become visible in this thread.
memory_order_release: A store operation with this memory order performs the release operation: prior writes to other memory locations become visible to the threads that do a consume or an acquire on the same location.

std::atomic<Singleton *> Singleton::m_pInstance;
std::mutex Singleton::m_mutex;

Singleton* Singleton::instance() {
  Singleton *tmp = m_pInstance.load(std::memory_order_acquire);
  if(tmp == nullptr) {
    std::lock_guard<std::mutex> lock(m_mutex);
    tmp = m_pInstance.load(std::memory_order_relaxed);
    if(tmp == nullptr) {
      tmp  = new Singleton;
      m_pInstance.store(tmp, std::memory_order_release);
    }
  }
  return m_pInstance;
}

从深层分析来看,这种形式的免锁机制的同步比上面单独 memory fences 来的约束更小.这种形式的操作只意味在这个操作周围防止内存乱序,而 memory fences 意味着在一块区域内防止内存乱序.更多细节参考 preshing 的 Acquire and Release Fences Don’t Work the Way You’d Expect 的分析. ## 使用 C++11 的 Sequentially-consistent ordering C++11 还提供了其他的方法来写 lock-free 的代码.当在 atomic 操作函数中忽略 std::memory_order参数项,那么默认值是std::memory_order_seq_cst,使得所有原子参数成为 sequentically consistent(SC) 原子.通过 SC 原子性,整个算法保证 sequentically consistent 只要没有 data races.

std::atomic<Singleton *> Singleton::m_pInstance;
std::mutex Singleton::m_mutex;

Singleton* Singleton::instance() {
  Singleton *tmp = m_pInstance.load();
  std::atomic_thread_fence(std::memory_order_acquire);
  if(tmp == nullptr) {
    std::lock_guard<std::mutex> lock(m_mutex);
    tmp = m_pInstance.load(std::memory_order_relaxed);
    if(tmp == nullptr) {
      tmp  = new Singleton;
      std::atomic_thread_fence(std::memory_order_release);
      m_pInstance.store(tmp);
    }
  }
  return m_pInstance;
}

SC 的原子性可能更容易理解.权衡点就是它产生的机器代码没有之前做法的高效.比如如下是 Gcc 4.8.2 intle X64 对上面代码产生的机器代码,通过g++ -O2 -std=c++11 -S.

因为使用了 SC 原子性,对m_pInstance的存储实现使用了mfence指令,起到一个在 X64 上的 full memory fence.这是个更严格的指令想对于 DCLP 在 X64 上的实际需求.一个普通的mov足以胜任.但也无关紧要,因为mfence指令也仅仅执行一次而已,就在创建 singleton 的实例的代码路径上.

使用 Preshing 的小型可移植的 lock-free 库,在没有 C++11 的支持下,使用它的 Mintomic Fences 实现 DCLP.

更多关于 C++11 的 multithreading 库的详解见之后的文章.

Algorithm Design Manual Chapter 6

2014-06-15T00:00:00+08:00

Book Notes

6.1 Minimum Spanning Trees

6.1.1 Prim’s Algorithm

A greedy algorithm suffices for correctness: we always add the lowest-weight edge linking a vertex in the tree to a vertex on the outside. （选取相邻最近的不在树内的点。）

prim(graph *g, int start)
{
  int i; /* counter */
  edgenode *p; /* temporary pointer */
  bool intree[MAXV+1]; /* is the vertex in the tree yet? */
  int distance[MAXV+1]; /* cost of adding to tree */
  int v; /* current vertex to process */
  int w; /* candidate next vertex */
  int weight; /* edge weight */
  int dist; /* best current distance from start */
  for (i=1; i<=g->nvertices; i++) {
    intree[i] = FALSE;
    distance[i] = MAXINT;
    parent[i] = -1;
  }
  distance[start] = 0;
  v = start;
  while (intree[v] == FALSE) {
    intree[v] = TRUE;
    p = g->edges[v];
    while (p != NULL) {
      w = p->y;
      weight = p->weight;
      if ((distance[w] > weight) && (intree[w] == FALSE)) {
        distance[w] = weight;
        parent[w] = v;
      }
      p = p->next;
    }
    v = 1;
    dist = MAXINT;
    for (i=1; i<=g->nvertices; i++)
      if ((intree[i] == FALSE) && (dist > distance[i])) {
        dist = distance[i];
        v=i;
      }
  }
}

6.1.2 Kruskal’s Algorithm

The algorithm repeatedly considers the lightest remaining edge and tests whether its two endpoints lie within the same connected component. (最短边）

a clever data structure calledunion-find,can support such queries in O(lgn) time. With this data structure, Kruskal’s algorithm runs in O(mlgm) time.

Implementation

kruskal(graph *g)
{
  int i; /* counter */
  set_union s; /* set union data structure */
  edge_pair e[MAXV+1]; /* array of edges data structure */
  bool weight_compare();
  set_union_init(&s, g->nvertices);
  to_edge_array(g, e); /* sort edges by increasing cost */
  qsort(&e,g->nedges,sizeof(edge_pair),weight_compare);
  for (i=0; i<(g->nedges); i++) {
    if (!same_component(s,e[i].x,e[i].y)) {
      printf("edge (%d,%d) in MST\n",e[i].x,e[i].y);
      union_sets(&s,e[i].x,e[i].y);
    }
  }
}

6.1.3 The Union-Find Data Structure

Find(i)– Find the root of tree containing elementi, by walking up the parent pointers until there is nowhere to go. Return the label of the root.
Union(i,j)– Link the root of one of the trees (say containingi)to the root of the tree containing the other (say j) so find(i) now equals find(j).

We must double the number of nodes in the tree to get an extra unit of height. How many doublings can we do before we use up allnnodes? At most, lg2ndoublings can be performed. Thus, we can do both unions and finds in O(logn), good enough for Kruskal’s algorithm. In fact, union-find can be done even faster, as discussed in Section 12.5.

Implementation

typedef struct {
  int p[SET_SIZE+1]; /* parent element */
  int size[SET_SIZE+1]; /* number of elements in subtree i */
  int n; /* number of elements in set */
} set_union;

set_union_init(set_union *s, int n)
{
  int i; /* counter */
  for (i=1; i<=n; i++) {
    s->p[i] = i;
    s->size[i] = 1;
  }
  s->n = n;
}
int find(set_union *s, int x)
{
  if (s->p[x] == x)
    return(x);
  else
    return( find(s,s->p[x]) );
}
int union_sets(set_union *s, int s1, int s2)
{
  int r1, r2; /* roots of sets */
  r1 = find(s,s1);
  r2 = find(s,s2);
  if (r1 == r2) return; /* already in same set */
  if (s->size[r1] >= s->size[r2]) {
    s->size[r1] = s->size[r1] + s->size[r2];
    s->p[ r2 ] = r1;
  }
  else {
    s->size[r2] = s->size[r1] + s->size[r2];
    s->p[ r1 ] = r2;
  }
}
bool same_component(set_union *s, int s1, int s2)
{
  return ( find(s,s1) == find(s,s2) );
}

More MST

http://www.seas.gwu.edu/~simhaweb/champalg/mst/mst.html

6.3 Shortest Paths

6.3.1 Dijkstra’s Algorithm

Given a particular start vertexs, it finds the shortest path from s to every other vertex in the graph, including your desired destination t.

Implementation

dijkstra(graph *g, int start) /* WAS prim(g,start) */
{
  int i; /* counter */
  edgenode *p; /* temporary pointer */
  bool intree[MAXV+1]; /* is the vertex in the tree yet? */
  int distance[MAXV+1]; /* distance vertex is from start */
  int v; /* current vertex to process */
  int w; /* candidate next vertex */
  int weight; /* edge weight */
  int dist; /* best current distance from start */
  for (i=1; i<=g->nvertices; i++) {
    intree[i] = FALSE;
    distance[i] = MAXINT;
    parent[i] = -1;
  }
  distance[start] = 0;
  v = start;
  while (intree[v] == FALSE) {
    intree[v] = TRUE;
    p = g->edges[v];
    while (p != NULL) {
      w = p->y;
      weight = p->weight;
      /* CHANGED */ if (distance[w] > (distance[v]+weight)) {
        /* CHANGED */ distance[w] = distance[v]+weight;
        /* CHANGED */ parent[w] = v;
      }
      p = p->next;
    }
    v=1;
    dist = MAXINT;
    for (i=1; i<=g->nvertices; i++)
      if ((intree[i] == FALSE) && (dist > distance[i])) {
        dist = distance[i];
        v=i;
      }
  }
}

As implemented here, the complexity is O(n²).

Dijkstra works correctly only on graphs without negative-cost edges. The reason is that midway through the execution we may encounter an edge with weight so negative that it changes the cheapest way to get froms to some other vertex already in the tree.

6.3.2 All-Pairs Shortest Path

typedef struct {
  int weight[MAXV+1][MAXV+1]; /* adjacency/weight info */
  int nvertices; /* number of vertices in graph */
} adjacency_matrix;

The critical issue in an adjacency matrix implementation is how we denote the edges absent from the graph. A common convention for unweighted graphs denotes graph edges by 1 and non-edges by 0. This gives exactly the wrong interpretation if the numbers denote edge weights, for the non-edges get interpreted as a free ride between vertices. Instead, we should initialize each non-edge to MAXINT.

floyd(adjacency_matrix *g)
{
  int i,j; /* dimension counters */
  int k; /* intermediate vertex counter */
  int through_k; /* distance through vertex k */
  for (k=1; k<=g->nvertices; k++)
    for (i=1; i<=g->nvertices; i++)
      for (j=1; j<=g->nvertices; j++) {
        through_k = g->weight[i][k]+g->weight[k][j];
        if (through_k < g->weight[i][j])
          g->weight[i][j] = through_k;
      }
}

The Floyd-Warshall all-pairs shortest path runs in O(n³) time, which is asymptotically no better thanncalls to Dijkstra’s algorithm. However, the loops are so tight and the program so short that it runs better in practice.

6.4 War Story: Dialing for Documents

“We can get good word-use frequencies and grammatical information from a big text database called the Brown Corpus. It contains thousands of typical English sentences, each parsed according to parts of speech. But how do we factor it all in?” Harald asked.

Each possible sentence interpretation can be thought of as a path in a graph. The vertices of this graph are the complete set of possible word choices. There will be an edge from each possible choice for the ith word to each possible choice for the (i + 1)st word. The cheapest path across this graph defines the best interpretation of the sentence.

Perhaps we can count how often that pair of words occurred together in previous texts. Or we can weigh them by the part of speech of each word. Maybe nouns don’t like to be next to nouns as much as they like being next to verbs.

We can pay a cost for walking through a particular vertex that depends upon the frequency of the word. Our best sentence will be given by the shortest path across the graph.

The constraints for many pattern recognition problems can be naturally formulated as shortest path problems in graphs. In fact, there is a particularly convenient dynamic programming solution for these problems (the Viterbi algorithm). Despite the fancy name, the Viterbi algorithm is basically solving a shortest path problem on a DAG.

6.5 Network Flows and Bipartite Matching

The network flow problem asks for the maximum amount of flow which can be sent from vertices s to t in a given weighted graph G while respecting the maximum capacities of each pipe.

6.5.1 Bipartite Matching

The largest bipartite matching can be readily found using network flow. Create a source nodes that is connected to every vertex in L by an edge of weight 1. Create a sink node t and connect it to every vertex in R by an edge of weight 1. Finally, assign each edge in the bipartite graph G a weight of 1. Now, the maximum possible flow fromstotdefines the largest matching in G.

6.5.2 Computing Network Flows

The key structure is the residual flow graph, denoted as R(G, f), where Gis the input graph andfis the current flow through G.

The maximum flow fromstotalways equals the weight of the minimums-t cut. Thus, flow algorithms can be used to solve general edge and vertex connectivity problems in graphs.

Implementation

        typedef struct {
          int v; /* neighboring vertex */
          int capacity; /* capacity of edge */
          int flow; /* flow through edge */
          int residual; /* residual capacity of edge */
          struct edgenode *next; /* next edge in list */
        } edgenode;

      netflow(flow_graph *g, int source, int sink)
      {
        int volume; /* weight of the augmenting path */
        add_residual_edges(g);
        initialize_search(g);
        bfs(g,source);
        volume = path_volume(g, source, sink, parent);
        while (volume > 0) {
          augment_path(g,source,sink,parent,volume);
          initialize_search(g);
          bfs(g,source);
          volume = path_volume(g, source, sink, parent);
        }
      }

    bool valid_edge(edgenode *e)
    {
      if (e->residual > 0) return (TRUE);
      else return(FALSE);
    }

  int path_volume(flow_graph *g, int start, int end, int parents[])
  {
    edgenode *e; /* edge in question */
    edgenode *find_edge();
    if (parents[end] == -1) return(0);
    e = find_edge(g,parents[end],end);
    if (start == parents[end])
      return(e->residual);
    else
      return( min(path_volume(g,start,parents[end],parents),
                  e->residual) );
  }
  edgenode *find_edge(flow_graph *g, int x, int y)
  {
    edgenode *p; /* temporary pointer */
    p = g->edges[x];
    while (p != NULL) {
      if (p->v == y) return(p);
      p = p->next;
    }
    return(NULL);
  }

augment_path(flow_graph*g,intstart,intend,intparents[],intvolume)
{
  edgenode *e; /* edge in question */
  edgenode *find_edge();
  if (start == end) return;
  e = find_edge(g,parents[end],end);
  e->flow += volume;
  e->residual -= volume;
  e = find_edge(g,end,parents[end]);
  e->residual += volume;
  augment_path(g,start,parents[end],parents,volume);
}

Edmonds and Karp [EK72] proved that always selecting ashortest unweighted augmenting path guarantees that O(n³) augmentations suffice for optimization.

6.6 Design Graphs, Not Algorithms

The secret is learning to design graphs, not algorithms. We have already seen a few instances of this idea:

The maximum spanning tree can be found by negating the edge weights of the input graph G and using aminimumspanning tree algorithm on the result. The most negative weight spanning tree will define the maximum weight tree in G.
To solve bipartite matching, we constructed a special network flow graph such that the maximum flow corresponds to a maximum cardinality matching.

Bucketing Rectangles

Problem: “In my graphics work I need to solve the following problem. Given an arbitrary set of rectangles in the plane, how can I distribute them into a minimum number of buckets such that no subset of rectangles in any given bucket intersects another? In other words, there can not be any overlapping area between two rectangles in the same bucket.”

Solution: We formulate a graph where each vertex is a rectangle, and there is an edge if two rectangles intersect. Each bucket corresponds to anindependent set of rectangles, so there is no overlap between any two. Avertex coloringof a graph is a partition of the vertices into independent sets, so minimizing the number of colors is exactly what you want.

Names in Collision

Problem:“In porting code from UNIX to DOS, I have to shorten several hundred file names down to at most 8 characters each. I can’t just use the first eight characters from each name, because “filename1” and “filename2” would be assigned the exact same name. How can I meaningfully shorten the names while ensuring that they do not collide?”

Solution: Construct a bipartite graph with vertices corresponding to each original file namefi for 1≤i≤n, as well as a collection of acceptable shortenings for each name f_i1,…,f_ik. Add an edge between each original and shortened name. We now seek a set of n edges that have no vertices in common, so each file name is mapped to a distinct acceptable substitute. Bipartite matching, discussed in Section 15.6 (page 498), is exactly this problem of finding an independent set of edges in a graph.

Separate the Text

Problem: “We need a way to separate the lines of text in the optical characterrecognition system that we are building. Although there is some white space between the lines, problems like noise and the tilt of the page makes it hard to find. How can we do line segmentation?

Solution: Consider the following graph formulation. Treat each pixel in the image as a vertex in the graph, with an edge between two neighboring pixels. The weight of this edge should be proportional to how dark the pixels are. A segmentation between two lines is a path in this graph from the left to right side of the page. We seek a relatively straight path that avoids as much blackness as possible. This suggests that theshortest pathin the pixel graph will likely find a good line segmentation.

Exercises

2-3

Is the path between two vertices in a minimum spanning tree necessarily a shortest path between the two vertices in the full graph? Give a proof or a counterexample.

Assume that all edges in the graph have distinct edge weights (i.e. , no pair of edges have the same weight). Is the path between a pair of vertices in a minimum spanning tree necessarily a shortest path between the two vertices in the full graph? Give a proof or a counterexample.

不必要. 如下图,若 a 是 6 的话,minimum spanning tree 不会选择 a,但 A 和 C 间的最短路径会选择 a.

4

Can Prim’s and Kruskal’s algorithm yield different minimum spanning trees? Explain why or why not.

能.当有相同 weight 的边.

当所有边的 weight 不同时,图存在唯一的 minimum spanning trees,两者生成同样的树.

5

Does either Prim’s and Kruskal’s algorithm work if there are negative edge weights? Explain why or why not.

可以.Prim 每次选相邻最近的不在树内的点,有负 weight 的边并不影响它. 而 Kruskal 每次选最短的边,同样不受影响.

6

Suppose we are given the minimum spanning tree T of a given graph G (with n vertices and m edges) and a new edge e = (u,v) of weight w that we will add to G. Give an efficient algorithm to find the minimum spanning tree of the graph G + e. Your algorithm should run in O(n) time to receive full credit.

新添加的 e 在顶点 u 和 v 中间,原本的 MST 中 u 和 v 通过 u->a1->ai->v,把此路径的边与 e 比较,用 Prim 算法选最临近点.

7

(a) Let T be a minimum spanning tree of a weighted graph G. Construct a new graph G′ by adding a weight of k to every edge of G. Do the edges of T form a minimum spanning tree of G′? Prove the statement or give a counterexample.

(b) Let P = {s, … , t} describe a shortest weighted path between vertices s and t of a weighted graph G. Construct a new graph G′ by adding a weight of k to every edge of G. Does P describe a shortest path from s to t in G′? Prove the statement or give a counterexample.

(a)和(b)都对,并没有改变边之间的比较关系.

8

Devise and analyze an algorithm that takes a weighted graph G and finds the smallest change in the cost to a non-MST edge that would cause a change in the minimum spanning tree of G. Your algorithm must be correct and run in polynomial time.

遍历图的边,掠过 MST 中的边,当遇到 non-MST 的边 E(i,j).
利用 MST 中的 parent 遍历出顶点 i 到 j 的所有 MST 边,并得到其中最大 weight 的边 mst_e_max.
计算 E(i,j)与 mst_e_{max 的差值}.
遍历所有的 non-MST 的边,得到最小差值就是改变的最小值.

总共边数 m,算法复杂度 O(m²).

9

Consider the problem of finding a minimum weight connected subset T of edges from a weighted connected graph G. The weight of T is the sum of all the edge weights in T.

Why is this problem not just the minimum spanning tree problem? Hint: think negative weight edges.
Give an efficient algorithm to compute the minimum weight connected subset T.

MST 不能有环路,minimum weight connected subset T 可以有环路,所以如果一条负数 weight 的边,不在 MST 中,但却包含在 T 中,因为它能使 T 的总权值减小.

把所有负数 weight 的边加入 T 中,若剩下 1 个连通图,结束.
若剩下 C(>1)个连通图,对 C 个连通图使用 Kruskal,直到剩下一个连通图 T.

sort(edges);
c := n;
for edge in edges:
    if edge.weight < 0:
        if find(edge.firstEnd) != find(edge.secondEnd):
            --c;
        unite(edge.firstEnd, edge.secondEnd);
    else:
        if c == 1: break;
        if find(edge.firstEnd) != find(edge.secondEnd):
            unite(edge.firstEnd, edge.secondEnd);
            --c;

10

Let G=(V,E) be an undirected graph. A set F⊆E of edges is called a feedback-edge set if every cycle of G has at least one edge in F.

Suppose that Gis unweighted. Design an efficient algorithm to find a minimum-size feedback-edge set.
Suppose that Gis a weighted undirected graph with positive edge weights. Design an efficient algorithm to find a minimum-weight feedback-edge set.
Minimum size feedback edge set: 使用 DFS,从任意点开始,遇到回归边, 把它加入结果 set 中,当 DFS 完成,结果 set 就是答案.
minimum-weight feedback-edge set: 对所有 weight 值取反,用 Kruskal 算法,当遇到边 E 的顶点在同一个集合中,把 E 加入到结果 set 中,当 Kruskal 遍历完所有边后,结果 set 就是答案.

11

Modify Prim’s algorithm so that it runs in time O(nlogk) on a graph that has only k different edges costs.

k 个不同的边值,使用一个 k 个元素的 min-heap,heap 的节点是相同距离的顶点链表.
Prim 每次选择和更新顶点的距离在 min-heap 完成,做到 O(nlogk).

12

Devise an efficient data structure to handle the following operations on a weighted directed graph:

Merge two given components.
Locate which component contains a given vertex v.
Retrieve a minimum edge from a given component.

使用 Union-Find 并添加 minimum edge.

typedef struct {
  int p[SET_SIZE+1]; /* parent element */
  int size[SET_SIZE+1]; /* number of elements in subtree i */
  int minedge[SET_SIZE+1];
  int n; /* number of elements in set */
} set_union;

14

The single-destination shortest path problem for a directed graph seeks the shortest path from every vertex to a specified vertex v. Give an efficient algorithm to solve the single-destination shortest paths problem.

用 Floyd-Warshall 对于顶点 v 反向更新距离值.得到最终 shortest paths.

19

Let G be a weighted directed graph with n vertices and m edges, where all edges have positive weight. A directed cycle is a directed path that starts and ends at the same vertex and contains at least one edge. Give an O(n³) algorithm to find a directed cycle in G of minimum total weight. Partial credit will be given for an O(n²m) algorithm.

run Floyd Warshall on the graph
min <- MAX_INT
vertex <- None
for each pair of vertices u,v
    if (dist(u,v) + dist(v,u) < min):
           min <- dist(u,v) + dist(v,u)
           pair <- (u,v)
return path(u,v) + path(v,u)

20

Can we modify Dijkstra’s algorithm to solve the single-source longest path problem by changing minimum to maximum? If so, then prove your algorithm correct. If not, then provide a counterexample.

没有负 weight 的边,可以.

21

LetG=(V,E) be a weighted acyclic directed graph with possibly negative edge weights. Design a linear-time algorithm to solve the single-source shortest-path problem from a given source v.

for each vertex y in a topological ordering of G
    choose edge (x,y) minimizing d(s,x)+length(x,y)
    path(s,y) = path(s,x) + edge (x,y)
    d(s,y) = d(s,x) + length(x,y)

22

Let G=(V,E) be a directed weighted graph such that all the weights are positive. Let v and w be two vertices in G and k≤|V| be an integer. Design an algorithm to find the shortest path from v to w that contains exactly k edges. Note that the path need not be simple.

create the table D[V,k];
D[v,1] = 0;
for i in other vertex except v:
         D[i,1] = MAX_INT;
for m=2 to k:
    for every edge(i,j):
        D[j,m] = D[i,m-1] + D[i,j]
        P[i,m] = i
Path = emtpy list
i = w
for m=k down to 1:
    Path.append(m);
    i = P[m,k]
 Path.append(V);
 Path.reverse();

23

Arbitrage is the use of discrepancies in currency-exchange rates to make a profit. For example, there may be a small window of time during which 1 U.S. dollar buys 0.75 British pounds, 1 British pound buys 2 Australian dollars, and 1 Australian dollar buys 0.70 U.S. dollars. At such a time, a smart trader can trade one U.S. dollar and end up with 0.75 × 2 × 0.7 = 1.05 U.S. dollars—a profit of 5%. Suppose that there are n currencies c1 , …, cn and an n × n table R of exchange rates, such that one unit of currency ci buys R[i,j] units of currency cj. Devise and analyze an algorithm to determine the maximum value of R[c1, ci1] · R[ci1, ci2] · · · R[cik−1, cik] · R[cik, c1]

log(a*b*c) = loga + lgob + log.所以求最长路径.

用 Floyd-Warshall 算法算出 i,j 的最长路径;
计算所有 C_(1i)*C_(i1)的值,得出最大值.

Algorithm Design Manual Chapter 5

2014-06-12T00:00:00+08:00

Book Notes

5.1 Flavors of Graphs

Undirected vs. Directed
Weighted vs. Unweighted
Simple vs. Non-simple
Sparse vs. Dense
Cyclic vs. Acyclic
Embedded vs. Topological
Implicit vs. Explicit
Labeled vs. Unlabeled

5.2 Data Structures for Graphs

Adjacency Matrix: We can represent G using an n×n matrix M, where element M[i, j] = 1 if(i, j) is an edge of G, and 0 if it isn’t.
Adjacency Lists: We can more efficiently represent sparse graphs by using linked lists to store the neighbors adjacent to each vertex.

Adjacency lists are the right data structure for most applications of graphs.

Adjacency Lists

#define MAXV 1000  // maximum number of vertices

typedef struct {
  int y;  // adjacency info
  int weight;  // edge weight, if any
  struct edgenode *next;  // next edge in list
} edgenode;

typedef struct {
  edgenode *edges[MAXV + 1];   // adjacency info
  int degree[MAXV + 1];  // outdegree of each vertex
  int nvertices;  // number of vertices in graph
  int nedges;  // number of edges in graph
  bool directed;  // is the graph directed
} graph;

void initialize_graph(graph *g, bool directed) {
  int i;
  g->nvertices = 0;
  g->nedges = 0;
  g->directed = directed;
  for (i = 1; i <= NMAX; ++i) {
    g->degree[i] = 0;
    g->edges[i] = NULL;
  }
}

void insert_edge(graph *g, int x, int y, bool directed) {
  edgenode *p;
  p = new edgenode;
  p-> weight = 0;
  p->y = y;
  p->next = g->edges[x];
  g->edges[x] = p;
  g->degree[x]++;
  if (directed == false) {
    insert_edge(g, y, x, true);
  } else {
    g->nedges++;
  }
}

void read_graph(graph *g, bool directed) {
  int i;
  int m;
  int x, y;
  initialize_graph(g, directed);
  scanf("%d %d", &(g->nvertices), &m);
  for (i = 1; i <= m; ++i) {
    scanf("$d %d", &x, &y);
    insert_edge(g, x, y, directed);
  }
}

print_graph(graph *g) {
  int i;
  edgenode *p;
  for (i = 1; i <= g->nvertices; ++i) {
    printf("%d: ", i);
    p = g->edges[i];
    while (p != NULL) {
      printf("%d ", p->y);
      p = p->next;
    }
    printf("\n");
  }
}

5.5 Traversing a Graph

The key idea behind graph traversal is to mark each vertex when we first visit it and keep track of what we have not yet completely explored. Although bread crumbs or unraveled threads have been used to mark visited places in fairy-tale mazes, we will rely on Boolean flags or enumerated types.

Each vertex will exist in one of three states:

undiscovered– the vertex is in its initial, virgin state.
discovered– the vertex has been found, but we have not yet checked out all its incident edges.
processed– the vertex after we have visited all its incident edges.

5.6 Breadth-First Search

先遍历完一个点的所有相邻点。

    bool processed[MAXV+1]; /* which vertices have been processed */
    bool discovered[MAXV+1]; /* which vertices have been found */
    int parent[MAXV+1]; /* discovery relation */

    initialize_search(graph *g)
    {
      int i; /* counter */
      for (i=1; i<=g->nvertices; i++) {
        processed[i] = discovered[i] = FALSE;
        parent[i] = -1;
      }
    }

  bfs(graph *g, int start)
  {
    queue q; /* queue of vertices to visit */
    int v; /* current vertex */
    int y; /* successor vertex */
    edgenode *p; /* temporary pointer */
    init_queue(&q);
    enqueue(&q,start);
    discovered[start] = TRUE;
    while (empty_queue(&q) == FALSE) {
      v = dequeue(&q);
      process_vertex_early(v);
      processed[v] = TRUE;
      p = g->edges[v];
      while (p != NULL) {
        y = p->y;
        if ((processed[y] == FALSE) || g->directed)
          process_edge(v,y);
        if (discovered[y] == FALSE) {
          enqueue(&q,y);
          discovered[y] = TRUE;
          parent[y] = v;
        }
        p = p->next;
      }
      process_vertex_late(v);
    }
  }

find_path(int start, int end, int parents[])
{
  if ((start == end) || (end == -1))
    printf("\n%d",start);
  else {
    find_path(start,parents[end],parents);
    printf(" %d",end);
  }
}

Because vertices are discovered in order of increasing distance from the root, this tree has a very important property. The unique tree path from the root to each node x∈V uses the smallest number of edges (or equivalently, intermediate nodes) possible on any root-to-xpath in the graph.

There are two points to remember when using breadth-first search to find a shortest path fromxtoy: First, the shortest path tree is only useful if BFS was performed with x as the root of the search. Second, BFS gives the shortest path only if the graph is unweighted.

5.7 Applications of Breadth-First Search

Properly implemented using adjacency lists, any such algorithm is destined to be linear, since BFS runs in O(n+m) time on both directed and undirected graphs. This is optimal, since it is as fast as one can hope to read any n-vertex, m-edge graph.

5.8 Depth-First Search

The difference between BFS and DFS results is in the order in which they explore vertices. This order depends completely upon the container data structure used to store the discovered but not processed vertices.

Queue– By storing the vertices in a first-in, first-out (FIFO) queue, we explore the oldest unexplored vertices first. Thus our explorations radiate out slowly from the starting vertex, defining a breadth-first search.
Stack– By storing the vertices in a last-in, first-out (LIFO) stack, we explore the vertices by lurching along a path, visiting a new neighbor if one is available, and backing up only when we are surrounded by previously discovered vertices. Thus, our explorations quickly wanderaway from our starting point, defining a depth-first search.

DFS organizes vertices by entry/exit times, and edges into tree and back edges. This organization is what gives DFS its real power.

Implementation

The beauty of implementingdfsrecursively is that recursion eliminates the need to keep an explicit stack:

dfs(graph *g, int v)
{
  edgenode *p; /* temporary pointer */
  int y; /* successor vertex */
  if (finished) return; /* allow for search termination */
  discovered[v] = TRUE;
  time = time + 1;
  entry_time[v] = time;
  process_vertex_early(v);
  p = g->edges[v];
  while (p != NULL) {
    y = p->y;
    if (discovered[y] == FALSE) {
      parent[y] = v;
      process_edge(v,y);
      dfs(g,y);
    }
    else if ((!processed[y]) || (g->directed))
      process_edge(v,y);
    if (finished) return;
    p = p->next;
  }
  process_vertex_late(v);
  time = time + 1;
  exit_time[v] = time;
  processed[v] = TRUE;
}

5.9 Applications of Depth-First Search

Finding Cycles

But any back edge going from x to an ancestorycreates a cycle with the tree path fromytox. Such a cycle is easy to find using dfs:

process_edge(int x, int y)
{
  if (parent[x] != y) { /* found back edge! */
    printf("Cycle from %d to %d:",y,x);
    find_path(y,x,parent);
    printf("\n\n");
    finished = TRUE;
  }
}

Articulation Vertices

Observe that there is a single point of failure—a single vertex whose deletion disconnects a connected component of the graph. Such a vertex is called an articulation vertex or cut-node.

More robust graphs without such a vertex are said to be biconnected.

Temporarily delete each vertex v, and then do a BFS or DFS traversal of the remaining graph to establish whether it is still connected. The total time fornsuch traversals is O(n(m+n)). There is a clever linear-time algorithm, however, that tests all the vertices of a connected graph using a single depth-first search.

Let reachable_ancestor[v] denote the earliest reachable ancestor of vertex v, meaning the oldest ancestor ofvthat we can reach by a combination of tree edges and back edges. Initially, reachable_ancestor[v] = v:

int reachable_ancestor[MAXV+1]; /*earliestreachableancestorofv*/
int tree_out_degree[MAXV+1];  /* DFStree outdegree ofv*/
process_vertex_early(int v)
{
  reachable_ancestor[v] = v;
}

We update reachable_ancestor[v] whenever we encounter a back edge that takes us to an earlier ancestor than we have previously seen. The relative age/rank of our ancestors can be determined from their entry_time’s:

process_edge(int x, int y)
{
  int class; /* edge class */
  class = edge_classification(x,y);
  if (class == TREE)
    tree_out_degree[x] = tree_out_degree[x] + 1;
  if ((class == BACK) && (parent[x] != y)) {
    if (entry_time[y] < entry_time[ reachable_ancestor[x] ] )
      reachable_ancestor[x] = y;
  }
}

The key issue is determining how the reachability relation impacts whether vertexv is an articulation vertex. There are three cases:

Root cut-nodes– If the root of the DFS tree has two or more children, it must be an articulation vertex. No edges from the subtree of the second child can possibly connect to the subtree of the first child.
Bridge cut-nodes– If the earliest reachable vertex fromvis v, then deleting the single edge (parent[v],v) disconnects the graph. Clearlyparent[v] must be an articulation vertex, since it cuts v from the graph. Vertex vis also an articulation vertex unless it is a leaf of the DFS tree. For any leaf, nothing falls off when you cut it.
Parent cut-nodes– If the earliest reachable vertex fromvis the parent of v, then deleting the parent must severvfrom the tree unless the parent is the root.

The routine below systematically evaluates each of the three conditions as we back up from the vertex after traversing all outgoing edges. We use entry_time[v] to represent the age of vertex v. The reachability time time_v calculated below denotes the oldest vertex that can be reached using back edges.

process_vertex_late(int v)
{
  bool root; /* is the vertex the root of the DFS tree? */
  int time_v; /* earliest reachable time for v */
  int time_parent; /* earliest reachable time for parent[v] */
  if (parent[v] < 1) { /* test if v is the root */
    if (tree_out_degree[v] > 1)
      printf("root articulation vertex: %d \n",v);
    return;
  }
  root = (parent[parent[v]] < 1); /* is parent[v] the root? */
  if ((reachable_ancestor[v] == parent[v]) && (!root))
    printf("parent articulation vertex: %d \n",parent[v]);
  if (reachable_ancestor[v] == v) {
    printf("bridge articulation vertex: %d \n",parent[v]);
    if (tree_out_degree[v] > 0) /* test if v is not a leaf */
      printf("bridge articulation vertex: %d \n",v);
  }
  time_v = entry_time[reachable_ancestor[v]];
  time_parent = entry_time[ reachable_ancestor[parent[v]] ];
  if (time_v < time_parent)
    reachable_ancestor[parent[v]] = reachable_ancestor[v];
}

We can alternately talk about reliability in terms of edge failures instead of vertex failures.

In fact all bridges can be identified in the same O(n+m) time. Edge (x, y) is a bridge if (1) it is a tree edge, and (2) no back edge connects from yor below toxor above. This can be computed with a minor modification of the reachable_ancestor function.

5.10 Depth-First Search on Directed Graphs

For directed graphs, depth-first search labelings can take on a wider range of possibilities. Indeed, all four of the edge cases in Figure below can occur in traversing directed graphs.

The correct labeling of each edge can be readily determined from the state, discovery time, and parent of each vertex, as encoded in the following function:

int edge_classification(int x, int y)
{
  if (parent[y] == x) return(TREE);
  if (discovered[y] && !processed[y]) return(BACK);
  if (processed[y] && (entry_time[y]>entry_time[x])) return(FORWARD);
  if (processed[y] && (entry_time[y]return(CROSS);
  printf("Warning: unclassified edge (%d,%d)\n",x,y);
}

Strongly Connected Components

A directed graph isstrongly connectedif there is a directed path between any two vertices.

The algorithm is based on the observation that it is easy to find a directed cycle using a depth-first search, since any back edge plus the down path in the DFS tree gives such a cycle. All vertices in this cycle must be in the same strongly connected component. Thus, we can shrink (contract) the vertices on this cycle down to a single vertex representing the component, and then repeat. This process terminates when no directed cycle remains, and each vertex represents a different strongly connected component.

We update our notion of the oldest reachable vertex in response to (1) nontree edges and (2) backing up from a vertex.

strong_components(graph *g)
{
  int i; /* counter */
  for (i=1; i<=(g->nvertices); i++) {
    low[i] = i;
    scc[i] = -1;
  }
  components_found = 0;
  init_stack(&active);
  initialize_search(&g);
  for (i=1; i<=(g->nvertices); i++)
    if (discovered[i] == FALSE) {
      dfs(g,i);
    }
}

Define low[v]to be the oldest vertex known to be in the same strongly connected component asv. This vertex is not necessarily an ancestor, but may also be a distant cousin of v because of cross edges. Cross edges that point vertices from previous strongly connected components of the graph cannot help us, because there can be no way back from them tov, but otherwise cross edges are fair game. Forward edges have no impact on reachability over the depth-first tree edges, and hence can be disregarded:

int low[MAXV+1]; /* oldest vertex surely in component of v */
int scc[MAXV+1]; /* strong component number for each vertex */
process_edge(int x, int y)
{
  int class; /* edge class */
  class = edge_classification(x,y);
  if (class == BACK) {
    if (entry_time[y] < entry_time[ low[x] ] )
      low[x] = y;
  }
  if (class == CROSS) {
    if (scc[y] == -1) /* component not yet assigned */
      if (entry_time[y] < entry_time[ low[x] ] )
        low[x] = y;
  }
}

A new strongly connected component is found whenever the lowest reachable vertex fromvis v. If so, we can clear the stack of this component. Otherwise, we give our parent the benefit of the oldest ancestor we can reach and backtrack:

  process_vertex_early(int v)
  {
    push(&active,v);
  }

process_vertex_late(int v)
{
  if (low[v] == v) { /* edge (parent[v],v) cuts off scc */
    pop_component(v);
  }
  if (entry_time[low[v]] < entry_time[low[parent[v]]])
    low[parent[v]] = low[v];
}

pop_component(int v)
{
  int t; /* vertex placeholder */
  components_found = components_found + 1;
  scc[ v ] = components_found;
  while ((t = pop(&active)) != v) {
    scc[ t ] = components_found;
  }
}

Exercises

5

Give a linear algorithm to compute the chromatic number of graphs where each vertex has degree at most 2. Must such graphs be bipartite?

这样的图不必要是 bipartite 的.反例是:3 个顶点,两两相连.

因为每个顶点最多 2 度,使用 DFS 遍历,对子顶点着色与父顶点相反的颜色.当遇到一个回归的边,那么对当前定点着色与父顶点不同,并且与回归边上的祖先定点不同.

只有一次遍历,复杂度 O(m+n) (m edges, n vertices).

7

Given pre-order and in-order traversals of a binary tree, is it possible to reconstruct the tree? If so, sketch an algorithm to do it. If not, give a counterexample. Repeat the problem if you are given the pre-order and post-order traversals.

没有相同元素,给予 pre-order and in-order traversals 能重构 binary search tree.代码如下.若有相同元素,给予:

preorder = {1,1}
inorder = {1,1}

可以重构:

  1                     1
 /           or          \
1                         1

每次 preorder 的数都要去搜索在 inoder 所在位置,若书是平衡的,那么 n 个元素每次搜索后总的算法复杂度 O(nlogn),但不是平衡的,一下就变成 O(n²).

所以利用 hash table,先把 inorder 的元素和位置 hash 起来,那么总的算法时间:O(n).

以下假设元素都小于 255,简单的利用数组映射来模拟 hash table.

struct Node {
  int val;
  struct Node* left;
  struct Node* right;
  Node(int val_in) {
    val = val_in;
    left = NULL;
    right = NULL;
  }
};

const int kMax = 256;
int map_index[kMax];

void MapToIndex(int inorder[], int n) {
  for (int i = 0; i < n; ++i) {
    map_index[inorder[i]] = i;
  }
}

Node *BuildInorderPreorder(in in[], in pre[], int n, int offset) {
  if (n == 0) {
    return NULL:
  }
  int root_val = pre[0];
  int i = map_index[root_val] - offset;
  Node *root = new Node(root_val);
  root->left = BuildInorderPreorder(in, pre+1, i, offset);
  root->right = BuildInorderPreorder(in+i+1, pre+i+1, offset+i+1);
  return root;
}

给予 pre-order and post-order traversals, 不能重构 binary search tree.

12

The square of a directed graph G = (V,E) is the graph G² = (V,E²) such that (u,w)∈E² iff there exists v∈V, such that (u,v)∈E and (u,w)∈E; i.e., there is a path of exactly two edges from u to w. square of a graph Give efficient algorithms for both adjacency lists and matrices.

adjacency matrices 算法复杂度：O(n³).

MakeSquareGraph(G, n)
for i=1 to n
    for j=1 to n
        G2[i][j] = 0
for i=1 to n
    for j=1 to n
        if (G[i][j] == 1)
           for k=1 to n
               if (G[j][k] == 1)
                  G2[i][k] = 1
return G2

18

Consider a set of movies $M_1, M_2, \ldots, M_k$. There is a set of customers, each one of which indicates the two movies they would like to see this weekend. Movies are shown on Saturday evening and Sunday evening. Multiple movies may be screened at the same time. You must decide which movies should be televised on Saturday and which on Sunday, so that every customer gets to see the two movies they desire. Is there a schedule where each movie is shown at most once? Design an efficient algorithm to find such a schedule if one exists.

把问题转换成图问题解决。建立无向图，顶点是每部电影，边 E(1,2)表示有个客户想看 M1 和 M2.如下图实例，有电影 M1-M4,3 个客户，1 个客户想看 M1 和 M3,一个客户想看 M1 和 M4，一个客户想看 M2 和 M4.那么把图分成（M1，M2）和（M3,M4），周六日各放一组,满足所有客户要求。

若多一个客户想看 M3 和 M4,如下图，无论怎么分图，都有 2 部电影相连，所以不能满足所有客户的要求。可以得出：若原本的图是 bipartite graph，那么能找到满足客户的放映安排。若不是，就不能满足客户要求。

23

Your job is to arrange n ill-behaved children in a straight line, facing front. You are given a list of m statements of the form i hates j. If i hates j, then you do not want put i somewhere behind j, because then i is capable of throwing something at j.

Give an algorithm that orders the line, (or says that it is not possible) in O(m + n) time.
Suppose instead you want to arrange the children in rows such that if i hates j, then i must be in a lower numbered row than j. Give an efficient algorithm to find the minimum number of rows needed, if it is possible.
创建一幅有向图,顶点代表孩子,有向边 E(i,j)代表孩子 i hates 孩子 j;
topological sort 得到队列，或 BFS 时发现环，证明不可能。只 BFS 遍历一次， O(m + n)。
如题 1 创建有向图;
作 DFS 遍历，用遍历的 level 作为行号。

31

Which data structures are used in depth-first and breath-first search?

BFS:使用 queue
DFS:使用 stack,通常使用递归代替 stack.

32

Write a function to traverse binary search tree and return the ith node in sorted order.

struct Node {
  int val;
  struct Node* left;
  struct Node* right;
  Node(int val_in) {
    val = val_in;
    left = NULL;
    right = NULL;
  }
};

bool FindIthElementCore(struct Node *root, int ith, int *index, int *value) {
  if (root == NULL) {
    return false;
  }
  if (FindIthElementCore(root->left, ith, index, value)) {
    return true;
  }
  cout << ith << ": " << *index << ": " << root->val << endl;
  if (ith == *index) {
    *value = root->val;
    return true;
  }
  (*index)++;
  if (FindIthElementCore(root->right, ith, index, value)) {
    return true;
  } else {
    return false;
  }
}

bool FindIthElement(struct Node *root, int ith, int *value) {
  int start = 0;
  return FindIthElementCore(root, ith, &start, value);
}

Algorithm Design Manual Chapter 4

2014-06-09T00:00:00+08:00

Book Notes

4.3 Heapsort: Fast Sorting via Data Structures

Where in the Heap?

Problem: Given an array-based heap on n elements and a real number x, efficiently determine whether the kth smallest element in the heap is greater than or equal to x. Your algorithm should be O(k) in the worst-case, independent of the size of the heap. Hint: you do not have to find the kth smallest element; you need only determine its relationship to x.

Solution: There are at least two different ideas that lead to correct but inefficient algorithms for this problem:

Call extract-minktimes, and test whether all of these are less thanx. This explicitly sorts the firstkelements and so gives us more information than the desired answer, but it takes O(klogn) time to do so.
The kth smallest element cannot be deeper than the kth level of the heap, since the path from it to the root must go through elements of decreasing value. Thus we can look at all the elements on the first k levels of the heap, and count how many of them are less thanx, stopping when we either find k of them or run out of elements. This is correct, but takes O(min(n,2^k-1)) time, since the top k elements have 2^k-1 elements.

An O(k) solution can look at only k elements smaller than x, plus at most O(k) elements greater than x. Consider the following recursive procedure, called at the root with i= 1 with count=k:

int heap_compare(priority_queue *q, int i, int count, int x)
{
  if ((count <= 0) || (i > q->n)) return(count);
  if (q->q[i] < x) {
    count = heap_compare(q, pq_young_child(i), count-1, x);
    count = heap_compare(q, pq_young_child(i)+1, count, x);
  }
  return(count);
}

If the root of the min-heap is ≥ x, then no elements in the heap can be less than x, as by definition the root must be the smallest element. This procedure searches the children of all nodes of weight smaller than x until either (a) we have found k of them, when it returns 0, or (b) they are exhausted, when it returns a value greater than zero. Thus it will find enough small elements if they exist.

But how long does it take? The only nodes whose children we look at are those < x, and at most k of these in total. Each have at most visited two children, so we visit at most 3k nodes, for a total time of O(k).

4.5 Mergesort: Sorting by Divide-and-Conquer

Mergesort is a great algorithm for sorting linked lists, because it does not rely on random access to elements as does heapsort or quicksort. Its primary disadvantage is the need for an auxilliary buffer when sorting arrays. It is easy to merge two sorted linked lists without using any extra space, by just rearranging the pointers. However, to merge two sorted arrays (or portions of an array), we need use a third array to store the result of the merge to avoid stepping on the component arrays

4.9 Binary Search and Related Algorithms

int binary_search(item_type s[], item_type key, int low, int high)
{
  int middle; /* index of middle element */
  if (low > high) return (-1); /* key not found */
  middle = (low+high)/2;
  if (s[middle] == key) return(middle);
  if (s[middle] > key)
    return (binary_search(s,key,low,middle-1));
  else
    return (binary_search(s,key,middle+1,high));
}

Counting Occurrences

This algorithm runs inO(lgn+s), wheresis the number of occurrences of the key. This can be as bad as linear if the entire array consists of identical keys. A faster algorithm results by modifying binary search to search for the boundary of the block containing k, instead of kitself. Suppose we delete the equality test

if (s[middle] == key) return(middle);

from the implementation above and return the index low instead of −1 on each unsuccessful search. All searches will now be unsuccessful, since there is no equality test. The search will proceed to the right half whenever the key is compared to an identical array element, eventually terminating at the right boundary. Repeating the search after reversing the direction of the binary comparison will lead us to the left boundary. Each search takes O(lgn) time, so we can count the occurrences in logarithmic time regardless of the size of the block.

One-Sided Binary Search

Now suppose we have an array A consisting of a run of 0’s, followed by an unbounded run of 1’s, and would like to identify the exact point of transition between them. Binary search on the array would provide the transition point in lgn tests, if we had a bound non the number of elements in the array. In the absence of such a bound, we can test repeatedly at larger intervals (A[1], A[2], A[4], A[8], A[16],...) until we find a first nonzero value. Now we have a window containing the target and can proceed with binary search. This one-sided binary search finds the transition pointpusing at most 2lgp comparisons, regardless of how large the array actually is.

Square and Other Roots

First, observe that the square root ofn≥1 must be at least 1 and at most n. Let l = 1 and r = n. Consider the midpoint of this interval, m=(l+r)/2. How does m² compare to n? If n≥m² , then the square root must be greater than m, so the algorithm repeats with l=m. If n2 , then the square root must be less than m, so the algorithm repeats with r=m.

Suppose that we start with values l and r such that f(l)>0 and f(r)<0. If f is a continuous function, there must exist a root between l and r. Depending upon the sign of f(m), where m=(l+r)/2, we can cut this window containing the root in half with each test and stop soon as our estimate becomes sufficiently accurate.

4.10 Divide-and-Conquer

divide-and-conquer recurrences of the form T(n)=aT(n/b)+f(n)

1. If $f(n) = O(n^{log_{b}^{a-\epsilon}})$ for some constant $\epsilon > 0$, then $T(n) = \Theta(n^{log_{b}^a})$.
2. If $f(n) = O(n^{log_{b}^{a}})$, then $T(n) = \Theta(n^{log_{b}^a}lgn)$.
3. If $f(n) = O(n^{log_{b}^{a+\epsilon}})$ for some constant $\epsilon > 0$ and if $af(n/b) \leq cf(n)$ for some $c<1$, then $T(n) = \Theta(f(n))$.

Exercises

1

The Grinch is given the job of partitioning 2n players into two teams of n players each. Each player has a numerical rating that measures how good he/she is at the game. He seeks to divide the players as unfairly as possible, so as to create the biggest possible talent imbalance between team A and team B. Show how the Grinch can do the job in O(nlogn) time.

用个 O(nlogn)的排序算法对 2n 个队根据实力排序，前 n 个作为一队，后 n 个作为一队。

2

For each of the following problems, give an algorithm that finds the desired numbers within the given amount of time. To keep your answers brief, feel free to use algorithms from the book as subroutines. For the example,S={6,13,19,3,8}, 19−3 maximizes the difference, while 8−6 minimizes the difference.

(a) Let S be an unsorted array of n integers. Give an algorithm that finds the pair x, y∈S that maximizes|x−y|. Your algorithm must run in O(n) worst-case time.

(b) Let S be a sorted array of n integers. Give an algorithm that finds the pair x, y∈S that maximizes |x−y|. Your algorithm must run in O(1) worst-case time.

(c) Let S be an unsorted array of n integers. Give an algorithm that finds the pair x, y∈S that minimizes |x−y|, for x ≠ y. Your algorithm must run in O(nlogn) worst-case time.

(d) Let S be a sorted array of n integers. Give an algorithm that finds the pair x, y∈S that minimizes |x−y|, for x ≠ y. Your algorithm must run in O(n) worst-case time.

(a) 扫描 S 一次获得最小和最大值.
(b) 取首尾数。
(c) O(nlogn)的算法排序，扫描排序好的 S，获得最小差的相邻元素对。
(d) 扫描排序好的 S，获得最小差的相邻元素对。

3

Take a sequence of 2n real numbers as input. Design an O(nlogn) algorithm that partitions the numbers intonpairs, with the property that the partition minimizes the maximum sum of a pair. For example, say we are given the numbers (1,3,5,9). The possible partitions are ((1,3),(5,9)), ((1,5),(3,9)), and ((1,9),(3,5)). The pair sums for these partitions are (4,14), (6,12), and (10,8). Thus the third partition has 10 as its maximum sum, which is the minimum over the three partitions.

O(nlogn)的算法排序

start = 0;
end = 2n - 1;
while (start < end) {
  pair(S[star], S[end]);
  start++;
  end--;

4

Assume that we are given n pairs of items as input, where the first item is a and the second item is one of three colors (red, blue, or yellow). Further assume that the items are sorted by number. Give an O(n) algorithm to sort the items by color (all reds before all blues before all yellows) such that the numbers for identical colors stay sorted. For example: (1,blue), (3,red), (4,blue), (6,yellow), (9,red) should become (3,red), (9,red), (1,blue), (4,blue), (6,yellow).

创建 3 个分别存储 red，blue，yellow 的数组;
扫描 input，依次按颜色装入不同的数组;
分别从 red，blue，yellow 的数组中输出结果。

5

The mode of a set of numbers is the number that occurs most frequently in the set. The set (4,6,2,4,3,1) has a mode of 4. Give an efficient and correct algorithm to compute the mode of a set of n numbers.

O(nlogn): O(nlogn)排序，扫描 Set 一遍得到频率最大的数。
O(n): 使用 hash map 扫描一遍存储数字频率，扫描 hash map 得到频率最大数。

6

Given two sets S1 and S2 (each of size n), and a number x, describe an O(nlogn) algorithm for finding whether there exists a pair of elements, one from S1 and one from S2, that add up to x. (For partial credit, give a Θ(n²) algorithm for this problem.)

从 S1 中减去 n，O(nlogn)排序 S1 和 S2,然后能否找出相同的元素（binary search 或扫描比较）。

Sort and Scan

sort S1 in O(nlogn)
sort S2 in O(nlogn)
begin = 0;
end = n - 1;
while (begin < n && end >=0) {
          if ((S1[begin] + S2[end]) < X) {
                 begin++;
          }
          else if ((S1[begin] + S2[end]) > X) {
                 end--;
          } else {
              return true;
          }
}
return false;

Binary Search
- O(nlogn)排序 S1
- X-S2[i]去 binary search 排序好的 S1，是否找到元素。

7

Outline a reasonable method of solving each of the following problems. Give the order of the worst-case complexity of your methods.

You are given a pile of thousands of telephone bills and thousands of checks sent in to pay the bills. Find out who did not pay.
You are given a list containing the title, author, call number and publisher of all the books in a school library and another list of 30 publishers. Find out how many of the books in the library were published by each company.
You are given all the book checkout cards used in the campus library during the past year, each of which contains the name of the person who took out the book. Determine how many distinct people checked out at least one book.

都使用 Hash Table，O(n)

8

Given a set of S containing n real numbers, and a real number x. We seek an algorithm to determine whether two elements of S exist whose sum is exactly x.

Assume that S is unsorted. Give an O(nlogn) algorithm for the problem.
Assume that S is sorted. Give an O(n) algorithm for the problem.

(1): Binary search

sort S in O(nlogn);
for (int i = 0; i < n; ++i) {
        binarysearch S[i] in S[i+1,n]
}

Scan

sort S in O(nlogn);
i = 0;
j = n - 1;
while (i < j) {
  if (s[i] + s[j] < X) {
    i++;
  } else if (s[i] + s[j] > X) {
    j--;
  } else {
    break;
  }
}

(2)

i = 0;
j = n - 1;
while (i < j) {
  if (s[i] + s[j] < X) {
    i++;
  } else if (s[i] + s[j] > X) {
    j--;
  } else {
    break;
  }
}

9

Give an efficient algorithm to compute the union of sets A and B, where n = max( | A | , | B | ). The output should be an array of distinct elements that form the union of the sets, such that they appear more than once in the union.

Assume that A and B are unsorted. Give an O(nlogn) algorithm for the problem.
Assume that A and B are sorted. Give an O(n) algorithm for the problem.
O(nlogn)对Ａ和Ｂ排序，然后用 2 的 O(n)的方法。

若Ａ和Ｂ以升序排序

set U to empty;
int i = 0;
int j = 0;
while (i < na && j < na) {
  if (A[i] < B[j]) {
    add A[i] into U;
    i++;
  } else (A[i] > B[j]) {
      add B[j] into U;
      j++;
    }
  else {
     add A[i] into U;
    i++;
    j++;
  }
}
if (i < na) {
  while (i < na) {
    add A[i] into U;
    i++;
  }
if (j < nb) {
  while (j < nb) {
   add B[j] into U;
      j++;
    }
}

10

Given a set S of n integers and an integer T, give an O(n^{k − 1}logn) algorithm to test whether k of the integers in S add up to T.

O(nlogn）对数组排序
(k-1)个数的组合有 n^k-1，并计算 k-1 个数的和 sum
用 binary search 在数组中搜索 T-sum

11

Design an O(n) algorithm that, given a list of n elements, finds all the elements that appear more than n / 2 times in the list. Then, design an O(n) algorithm that, given a list of n elements, finds all the elements that appear more than n / 4 times.

Hash Table 可以解决。或

Find the elements that appear more than n / 2 times

数组中最多有一个数超过重复 n/2 次，并且排序后的第 ceiling(n/2)个数必定是这个数。

method1
- 利用BFPRT 以 O(n)的复杂度找到第 ceiling(n/2)个小数;
- 扫描数组，计数这个数的重复数是否大于 n/2.

method2

#include 
using std::stack;

bool FindMoreThanHalf(int *array, int n, int *res) {
  stack<int> stk;
  int i;
  for (i = 0; i < n; ++i) {
    if (stk.empty()) {
      stk.push(array[i]);
    } else {
      if (stk.top() == array[i]) {
        stk.push(array[i]);
      } else {
        stk.pop();
      }
    }
  }
  if (stk.empty()) {
    return false;
  }
  int candidate = stk.top();
  int times = 0;
  for (i = 0; i < n; ++i) {
    if (array[i] == candidate) {
      times++;
    }
  }
  if (times > n / 2) {
    *res = candidate;
    return true;
  }
  return false;
}

Find the elements that appear more than n / 4 times

method1
1. 利用BFPRT 以 O(n)的复杂度找到中间数，验证中中间数是否重复 n/4(O(n));
2. 以中间元素划分数组为两部分(O(n));
3. 在上下半部分 n/2 中重复 n/4 次数的元素，同第一个问题一样找(O(n));
method2
1. 初始 3 个空的槽，想对应的槽的 3 个计数为 0;
2. 对于数组中每个元素：
  - 若等于其中任何一个槽中数，增加计数;
  - 若有槽空，放入这个槽，并计数为 1;
  - 否则，对所有槽内数的计数减 1
3. 对槽内剩下的数，扫描一遍数组，计算它们重复次数是否符合要求。

12

Devise an algorithm for finding the k smallest elements of an unsorted set of n integers in O(n + klogn).

O(n)的复杂度建立一个最小堆;
连续 k 次取出最小值，最后得到第 k 个最小值。

13

You wish to store a set of n numbers in either a max-heap or a sorted array. For each application below, state which data structure is better, or if it does not matter. Explain your answers.

Want to find the maximum element quickly.
Want to be able to delete an element quickly.
Want to be able to form the structure quickly.
Want to find the minimum element quickly.
都开销 O(1)。
若知道删除的地方，max-heap 花费 O(logn)，sorted array 花费 O(n)。若不知道删除的地方，max-heap 花费 O(n)查找，删除花费 O(logn); sorted array binary search 花费 O(logn)，删除花费 O(n)。
max-heap 花费 O(n);sorted array 花费 O(logn)。
max-heap 花费 O(n);sorted array 花费 O(1)。

14

Give an O(nlogk)-time algorithm that merges k sorted lists with a total of n elements into one sorted list. (Hint: use a heap to speed up the elementary O(kn)-time algorithm).

扫描 k 组 sorted lists 组成一个大小 k 的 min-heap;
从 min-heap 中取出最小值放入结果 list。

15

(a) Give an efficient algorithm to find the second-largest key among n keys. You can do better than 2n − 3 comparisons. (b) Then, give an efficient algorithm to find the third-largest key among n keys. How many key comparisons does your algorithm do in the worst case? Must your algorithm determine which key is largest and second-largest in the process?

找第二大元素：大小为 2 个的数组初始化为第一二个元素，之后每个元素与这数组对比，剔除最小的，最后数组内 2 个元组对比得到最大和第二大元素，一共比较 2(n-2)+1=2n-3，找出第二大元素。
找第三大元素：同样已大小为 3 的数组，最后比较数 3(n-3)+2=3n-7。

Random Selection可以找出任意的第几大值，平均时间复杂度：O(n)，比较次数将是 n 的倍数，最坏时间复杂度可以达到：O(nlogn)。

Tournament Algorithm找第二大元素比较次数 O(n+logn);找第 k 个最大元素，比较次数为 O(n+klogn)。

16

Use the partitioning idea of quicksort to give an algorithm that finds the median element of an array of n integers in expected O(n) time. (Hint: must you look at both sides of the partition?)

unsigned int seed = time(NULL);
int randint(int m, int n) {
  return m + rand_r(&seed) / (RAND_MAX / (n + 1 - m) + 1);
}

void RandomSelectionK(int *array, int l, int u, int k) {
  if (l >= u) {
    return;
  }
  swap(array[l], array[randint(l, u)]);
  int pivot = array[l];
  int i = l;
  int j = u + 1;
  while (true) {
    do {
      ++i;
    } while (i <= u && array[i] < pivot);
    do {
      --j;
    } while (array[j] > pivot);
    if (i > j) {
      break;
    }
    swap(array[i], array[j]);
  }
  swap(array[l], array[j]);
  if (j < k) {
    RandomSelectionK(array, j + 1, u, k);
  } else if (j > k) {
    RandomSelectionK(array, l, j - 1, k);
  }
}

17

The median of a set of n values is the $\lceil n/2 \rceil$ th smallest value.
1. Suppose quicksort always pivoted on the median of the current sub-array. How many comparisons would Quicksort make then in the worst case?
2. Suppose quicksort were always to pivot on the $\lceil n/3 \rceil$ th smallest value of the current sub-array. How many comparisons would be made then in the worst case?

f(n) = 2*f(n/2) + n ==> f(n) = 2^k * f(n/2^k) + kn = (n+2)logn f(n) = f(n/3) + f(2n/3) + n ==> f(n) = O(nlogn)

18

Suppose an array A consists of n elements, each of which is red, white, or blue. We seek to sort the elements so that all the reds come before all the whites, which come before all the blues The only operation permitted on the keys are

Examine(A,i) – report the color of the ith element of A.
Swap(A,i,j) – swap the ith element of A with the jth element.

Find a correct and efficient algorithm for red-white-blue sorting. There is a linear-time solution.

2 次扫描。

第一次：把 red 和 white 当成一样，用 quick 的 partition 分开与 blue。
第二次：只区分 red 和 white 的子区间。

21

Stable sorting algorithms leave equal-key items in the same relative order as in the original permutation. Explain what must be done to ensure that mergesort is a stable sorting algorithm.

在合并时元素相等时选 index 小的元素在前。

22-23

Show that n positive integers in the range 1 to k can be sorted in O(nlogk) time. The interesting case is when k < < n.

We seek to sort a sequence S of n integers with many duplications, such that the number of distinct integers in S is O(logn). Give an O(nloglogn) worst-case time algorithm to sort such sequences.

balanced binary search tree.

24

Let A[1..n] be an array such that the first $n-\sqrt n$ elements are already sorted (though we know nothing about the remaining elements). Give an algorithm that sorts A in substantially better than nlogn steps.

+ $O(\sqrt{n}log(\sqrt{n})$ 排序后面的 $\sqrt{n}$ 个元素。
+ O(n)去 mergesort 前半部分和后半部分。

25

Assume that the array A[1..n] only has numbers from $\{1,\ldots, n^2\}$ but that at most loglogn of these numbers ever appear. Devise an algorithm that sorts A in substantially less than O(nlogn).

和 23 一样，用 balanced binary search tree，树的高度不超过 loglogn,最后的复杂度 O(n*logloglogn)。

27

Let P be a simple, but not necessarily convex, polygon and q an arbitrary point not necessarily in P. Design an efficient algorithm to find a line segment originating from q that intersects the maximum number of edges of P. In other words, if standing at point q, in what direction should you aim a gun so the bullet will go through the largest number of walls. A bullet through a vertex of P gets credit for only one wall. An O(nlogn) algorithm is possible.

以ｑ为中心点，顺时针旋转，Ｐ中所有边随着顺时针旋转都有一个起始点 (head)和结束点（end），计算它们的极角（polar angle）; O(n)
对所有 head 和 end 按照 angle 大小排序，若相等，head 在前; O(nlogn)
扫描这个排序好的队列，遇到 head 加 1,遇到 end 减 1,最后算出这个区间的最大值。O(n)

30

A company database consists of 10,000 sorted names, 40% of whom are known as good customers and who together account for 60% of the accesses to the database. There are two data structure options to consider for representing the database:

Put all the names in a single array and use binary search.
Put the good customers in one array and the rest of them in a second array.

Only if we do not find the query name on a binary search of the first array do we do a binary search of the second array. Demonstrate which option gives better expected performance. Does this change if linear search on an unsorted array is used instead of binary search for both options?

single array: log10000=4
two array: 0.6*log4000+0.4*(log4000+log6000) = 5.11

single array is better.

single array: 10000
two array: 0.6*4000+0.4*6000 = 6400

two array is better.

31

Suppose you are given an array A of n sorted numbers that has been circularly shifted k positions to the right. For example, {35,42,5,15,27,29} is a sorted array that has been circularly shifted k = 2 positions, while {27,29,35,42,5,15} has been shifted k = 4 positions.

Suppose you know what k is. Give an O(1) algorithm to find the largest number in A.
Suppose you do not know what k is. Give an O(lgn) algorithm to find the largest number in A. For partial credit, you may give an O(n) algorithm.

if (k == 0) {
  return A[n-1];
} else {
  return A[k-1];
}

int FindLargestNumber(int *array, int l, int h) {
  if (array[l] < array[h]) {
    return array[h];
  }
  if (l == h) {
    return array[h];
  }
  int mid;
  mid = (l + h) / 2;
  if ((mid + 1 <= h) && array[mid] > array[mid + 1]) {
    return array[mid];
  }
  if ((mid - 1 >= l) && array[mid - 1] > array[mid]) {
    return array[mid - 1];
  }
  if (array[mid] < array[h]) {
    return FindLargestNumber(array, l, mid - 1);
  } else {
    return FindLargestNumber(array, mid + 1, h);
  }
}

32

Consider the numerical 20 Questions game. In this game, Player 1 thinks of a number in the range 1 to n. Player 2 has to figure out this number by asking the fewest number of true/false questions. Assume that nobody cheats.

What is an optimal strategy if n is known?
What is a good strategy if n is not known?

binary search.
2ⁱ随机选一个 i，若数小，增加到 2ⁱ⁺¹,若大就二分搜索。

33

Suppose that you are given a sorted sequence of distinct integers . Give an O(lgn) algorithm to determine whether there exists an i index such as ai = i. For example, in { − 10, − 3,3,5,7}, a3 = 3. In {2,3,4,5,6,7}, there is no such i.

bool CheckEqualIndex(int *array, int l, int h) {
  while (l <= h) {
    int mid = (l + h) / 2;
    if (array[mid] > (mid + 1)) {
      h = mid - 1;
    } else if (array[mid] < (mid + 1)) {
      l = mid + 1;
    } else {
      return true;
    }
  }
  return false;
}

34

Suppose that you are given a sorted sequence of distinct integers , drawn from 1 to m where n < m. Give an O(lgn) algorithm to find an integer that is not present in a. For full credit, find the smallest such integer.

int FindMissingElement(int *array, int l, int h) {
  while (l <= h) {
    int mid = (l + h) / 2;
    if (array[mid] > (mid + 1)) {
      h = mid - 1;
    } else if (array[mid] <= (mid + 1)) {
      l = mid + 1;
    }
  }
  return l + 1;
}

35

Let M be an n*m integer matrix in which the entries of each row are sorted in increasing order (from left to right) and the entries in each column are in increasing order (from top to bottom). Give an efficient algorithm to find the position of an integer x in M, or to determine that x is not there. How many comparisons of x with matrix entries does your algorithm use in worst case?

O(m+n)

bool FindElement(int **array, int x, int n, int m, int *pos_x, int *pos_y) {
  int row = 0, col = m - 1;
  while (row < n && col >= 0) {
    if (array[row][col] == x) {
      *pos_x = row;
      *pos_y = col;
      return true;
    } else if (array[row][col] > x) {
      col--;
    } else {
      row++;
    }
  }
  return true;
}

36

Consider an n*n array A containing integer elements (positive, negative, and zero). Assume that the elements in each row of A are in strictly increasing order, and the elements of each column of A are in strictly decreasing order. (Hence there cannot be two zeroes in the same row or the same column.) Describe an efficient algorithm that counts the number of occurrences of the element 0 in A. Analyze its running time.

int CountZero(int **array, int n) {
  int row = n - 1, col = n - 1;
  int count = 0;
  while (row >=0 && col >= 0) {
    if (array[row][col] == 0) {
      count++;
      row--;
    } else if(array[row][col] > 0) {
      col--;
    } else {
      row--;
    }
  }
  return count;
}

40

If you are given a million integers to sort, what algorithm would you use to sort them? How much time and memory would that consume?

一个整数４字节，10⁹*4=4G,需要 4G 的内存，可以用快排等 O(nlogn)的排序算法．
用 bitmap,需要 10⁹/8=128M 的内存.
若内存有限,就 external merge sort,利用外部存储多进行几次.

41

Describe advantages and disadvantages of the most popular sorting algorithms.

Merge sort:

优点:适合链表,适合外排.
缺点:需要多余的内存来保存合并的数据.

Insertion/Selection sort:

优点:简单实现.
缺点:太慢,当数据很大时,运行不实际.

Heap sort:

优点:不需要递归,适合大数据.
缺点:时常慢于 merge sort 和 quick sort.

Quick sort:

优点:很快.
缺点:递归,最坏情况比较慢.

42

Implement an algorithm that takes an input array and returns only the unique elements in it.

排序,然后扫描输出.O(nlogn).

43

You have a computer with only 2Mb of main memory. How do you use it to sort a large file of 500 Mb that is on disk?

利用external merge sort.

44

Design a stack that supports push, pop, and retrieving the minimum element in constant time. Can you do this?

只有一个 stack 办不到.如果两个 stack,可以利用另外一个 stack 存储最小值.

45

Given a search string of three words, find the smallest snippet of the document that contains all three of the search words—i.e., the snippet with smallest number of words in it. You are given the index positions where these words occur in the document, such as word1: (1, 4, 5), word2: (3, 9, 10), and word3: (2, 6, 15). Each of the lists are in sorted order, as above.

选取每个字母 index 的首个元素作为起始方案。
如何改进它的长度：a.增加最小的位置，b.减小最大的位置，这里只能增加最小的位置。
用 heap 来保存位置，每次取出最小的位置为 O(logk).

复杂度：O(nlogk)，n 是所有字母的位置个数，k 是字母个数。这里 k=3,所以 O(n)

#include 
using std::priority_queue;
#include 
using std::make_pair;
using std::pair;
#include 
using std::vector;
#include 
using std::max;
using std::min;
#include 
using std::numeric_limits;


int FindSmallestSnippet(vector<vector<int> > &index_positions) {
  // max-priority, select smallest position, use -index_positions[i][j], (i,j)
  priority_queue<pair<int, pair<int ,int> > > queue;
  int max_pos = 0; // the max pos of  the snippet
  int i;
  for (i = 0; i < index_positions.size(); ++i) {
    int pos = index_positions[i][0];
    max_pos = max(max_pos, pos);
    queue.push(make_pair(-pos, make_pair(i, 0)));
  }
  int smallest_len = numeric_limits<int>::max();
  while (queue.size() == index_positions.size()) {
    int min_pos = -queue.top().first;
    smallest_len = min(smallest_len, max_pos - min_pos + 1);
    int word_pos = queue.top().second.first;
    int index = queue.top().second.second;
    queue.pop();
    ++index;
    if (index < index_positions[word_pos].size()) {
      int next_pos = index_positions[word_pos][index];
      max_pos = max(max_pos, next_pos);
      queue.push(make_pair(-next_pos, make_pair(word_pos, index)));
    }
  }
  return smallest_len;
}

46

You are given 12 coins. One of them is heavier or lighter than the rest. Identify this coin in just three weighings.

分 3 组,每组 4 个,其中两组称重,若相等,重的在第三组.若不等,重的在重的那一组.
重的那组分 4 组,每组 1 个,第一组和第二组称重,谁重就重的那个.
若 step2 相等,剩下第三组和第四组称重,谁重就重的那个.

Algorithm Design Manual Chapter 3

2014-06-04T00:00:00+08:00

Book Notes

Contiguous vs. Linked Data Structures

Contiguously-allocated structuresare composed of single slabs of memory, and include arrays, matrices, heaps, and hash tables.
Linked data structuresare composed of distinct chunks of memory bound together bypointers, and include lists, trees, and graph adjacency lists.

Arrays

Advantages of contiguously-allocated arrays include:

Constant-time access given the index– Because the index of each element maps directly to a particular memory address, we can access arbitrary data items instantly provided we know the index.
Space efficiency– Arrays consist purely of data, so no space is wasted with links or other formatting information. Further, end-of-record information is not needed because arrays are built from fixed-size records.
Memory locality– A common programming idiom involves iterating through all the elements of a data structure. Arrays are good for this because they exhibit excellent memory locality. Physical continuity between successive data accesses helps exploit the high-speedcache memory on modern computer architectures.

The downside of arrays is that we cannot adjust their size in the middle of a program’s execution.

Actually, we can efficiently enlarge arrays as we need them, through the miracle of dynamic arrays. The apparent waste in this procedure involves the recopying of the old contents on each expansion. Thus, each of thenelements move only two times on average, and the total work of managing the dynamic array is the sameO(n) as it would have been if a single array of sufficient size had been allocated in advance! The primary thing lost using dynamic arrays is the guarantee that each array access takes constant time in the worst case.

Pointers and Linked Structures

The relative advantages of linked lists over static arrays include:

Overflow on linked structures can never occur unless the memory is actually full.
Insertions and deletions aresimplerthan for contiguous (array) lists.
With large records, moving pointers is easier and faster than moving the items themselves.

while the relative advantages of arrays include:

Linked structures require extra space for storing pointer fields.
Linked lists do not allow efficient random access to items.
Arrays allow better memory locality and cache performance than random pointer jumping.

Exercises

1

A common problem for compilers and text editors is determining whether the parentheses in a string are balanced and properly nested. For example, the string ((())())() contains properly nested pairs of parentheses, which the strings )()( and ()) do not. Give an algorithm that returns true if a string contains properly nested and balanced parentheses, and false if otherwise. For full credit, identify the position of the first offending parenthesis if the string is not properly nested and balanced.

#include 
using std::string;
#include 
using std::stack;

bool BalancedParentheses(string parentheses, int *pos) {
  stack<int> stk;
  const int kLeftPar = 1;
  int i;
  for (i = 0; i < parentheses.size(); ++i) {
    if (parentheses[i] == '(') {
      stk.push(kLeftPar);
    } else {
      if (stk.empty()) {
        *pos = i;
        return false;
      }
      stk.pop();
    }
  }
  if (!stk.empty()) {
    *pos = --i;
    return false;
  }
  return true;
}

2

Write a program to reverse the direction of a given singly-linked list. In other words, after the reversal all pointers should now point backwards. Your algorithm should take linear time.

struct Node {
  int value;
  struct Node *next;
  Node(int in_value, struct Node* in_next) : value(in_value), next(in_next) {
  }
};

void ReverseLinkedList(Node **head) {
  if (!head || *head == NULL) {
    return;
  }
  Node *prev, *p, *next;
  prev = *head;
  p = prev->next;
  prev->next = NULL;
  while (p != NULL) {
    next = p->next;
    p->next = prev;
    prev = p;
    p = next;
  }
  *head = prev;
}

3

We have seen how dynamic arrays enable arrays to grow while still achieving constant-time amortized performance. This problem concerns extending dynamic arrays to let them both grow and shrink on demand.

(a) Consider an underflow strategy that cuts the array size in half whenever the array falls below half full. Give an example sequence of insertions and deletions where this strategy gives a bad amortized cost.

(b) Then, give a better underflow strategy than that suggested above, one that achieves constant amortized cost per deletion.

容量是 6 的数组，当有 3 个元素是，insertion，然后 delete。它不断收缩和扩展容量。
当元素个数是总个数的 1/4 时，把容量收缩成 1/2。

4

Design a dictionary data structure in which search, insertion, and deletion can all be processed inO(1) time in the worst case. You may assume the set elements are integers drawn from a finite set 1,2, .., n, and initialization can take O(n)time.

因为元素个数是有限集合中的数，用 bit array 表示每个数。

5

Find the overhead fraction (the ratio of data space over total space) for each of the following binary tree implementations on n nodes:

(a) All nodes store data, two child pointers, and a parent pointer. The data field requires four bytes and each pointer requires four bytes.

(b) Only leaf nodes store data; internal nodes store two child pointers. The data field requires four bytes and each pointer requires two bytes.

所有点都一样： 4/(4+4*3) = 1/4
满树中，若页节点个数是 n，那么内部节点个数是 n-1, 4*n/(4*n + 4*(n-1)) = n/(2n-1)

6

Describe how to modify any balanced tree data structure such that search, insert, delete, minimum, and maximum still take O(logn) time each, but successor and predecessor now take O(1) time each. Which operations have to be modified to support this?

在树节点中添加指向 successor 和 predecessor 的指针。不影响操作 search, minimum, 和 maximum。只需在 insert 和 delete 操作相应更新指向 successor 和 predecessor 的指针。

7

Suppose you have access to a balanced dictionary data structure, which supports each of the operations search, insert, delete, minimum, maximum, successor, and predecessor in O(logn) time. Explain how to modify the insert and delete operations so they still take O(logn) but now minimum and maximum take O(1) time. (Hint: think in terms of using the abstract dictionary operations, instead of mucking about with pointers and the like.)

存储 max 和 min 这两个数。

insert 时，新元素与这个两数对比并相应更新。
delete 时，若是 min 元素被 delete，用它的 successor 更新；若是 max 元素被 delete，用它的 predecessor 更新。

8

Design a data structure to support the following operations:

insert(x,T) – Insert item x into the set T.
delete(k,T) – Delete the kth smallest element from T.
member(x,T) – Return true iff x∈T.

All operations must take O(logn) time on an n-element set.

Balanced binary tree.

9

A concatenate operation takes two sets S1 and S2, where every key in S1 is smaller than any key in S2, and merges them together. Give an algorithm to concatenate two binary search trees into one binary search tree. The worst-case running time should be O(h), where h is the maximal height of the two trees.

S1 中的所有元素小于 S2,用 O（logn）的时间找出 S2 的最小元素，然后 S1 成为它的左子树，S2 成为它的右子树，组成新的搜索树。

10

In the bin-packing problem, we are given n metal objects, each weighing between zero and one kilogram. Our goal is to find the smallest number of bins that will hold the n objects, with each bin holding one kilogram at most.

The best-fit heuristicfor bin packing is as follows. Consider the objects in the order in which they are given. For each object, place it into the partially filled bin with the smallest amount of extra room after the object is inserted.. If no such bin exists, start a new bin. Design an algorithm that implements the best-fit heuristic (taking as input the n weights w1,w2, …, wn and outputting the number of bins used) in O(nlogn)time.
Repeat the above using the worst-fit heuristic, where we put the next object in the partially filled bin with the largest amount of extra room after the object is inserted.

使用 BST。主要找到能容纳这个元素的最小 bin，若所有 bin 都小于这个元素大小，就插入一个新的。

min_node = NULL;
while node != NULL:
    if (node->weight >= w && node->left < w) {
      min_node = node;
      break;
    } else if (node->left >= w) {
      node = node->left;
    } else {
      node = node->right;
    }
if (min_node == NULL) {
  bst->insert(new node(w));
} else {
  bst->delete(min_node);
  min_node->weight -= w;
  bst->insert(min_node);
}

最大堆使用。每次选最大容量的 bin。若最大 bin 小于这个元素大小，就插入一个新的。

11

Suppose that we are given a sequence of n values x1,x2, …, xn and seek to quickly answer repeated queries of the form: given i and j, find the smallest value in xi,…,xj.

(a) Design a data structure that uses O(n²) space and answers queries in O(1) time.

(b) Design a data structure that uses O(n) space and answers queries in O(logn) time. For partial credit, your data structure can use O(nlogn) space and have O(logn) query time.

n*n 的矩阵，i,j 中存的就是 i-j 的最小元素。
使用Cartesian tree或Treap。

12

Suppose you are given an input set S of n numbers, and a black box that if given any sequence of real numbers and an integer k instantly and correctly answers whether there is a subset of input sequence whose sum is exactly k. Show how to use the black box O(n) times to find a subset of S that adds up to k.

R = S
for i = 1 to n:
  if bb(R/{si}) is True:
      R = R / {si}

13

Let A[1..n] be an array of real numbers. Design an algorithm to perform any sequence of the following operations:

• Add(i,y)– Add the value y to the ith number.

• Partial-sum(i)– Return the sum of the first i numbers

There are no insertions or deletions; the only change is to the values of the numbers. Each operation should take O(logn) steps. You may use one additional array of size n as a work space.

建立叶节点数ｎ的 balanced binary tree，ｎ个叶节点依次存储 A[1..n]，书的内节点存储子树的和。

Add(i,y)，比较ｉ与 n/2，决定左子树还是右子树，依次遍历到叶节点，并增加相应子树和，最后找到第ｉ个元素相加。
Partial-sum(i)，比较ｉ与 n/2，决定左子树还是右子树，每当遍历右子树，加上左子树的和，最后到叶节点，得到总的和。

14

Extend the data structure of the previous problem to support insertions and deletions. Each element now has both a key and a value. An element is accessed by its key. The addition operation is applied to the values, but the elements are accessed by its key. The Partial sum operation is different.

Add(k,y)– Add the value y to the item with key k.
Insert(k,y)– Insert a new item with key k and value y.
Delete(k)– Delete the item with key k.
Partial-sum(k)– Return the sum of all the elements currently in the set whose key is less than y,

The worst case running time should still be O(nlogn) for any sequence of O(n) operations.

建立以 key 排序的平衡搜索二叉树，并每个节点中添加一个左子树和的值。

Add(k,y)：随着搜索 key k，依次加左子树和，最后 key k 加上 y。
Insert(k,y)：随着搜索 key k 插入位置，依次加左子树和，最后插入 key k 的元素。
Delete(k)：：随着搜索 key k，依次减少左子树和，最后删除 key k 元素。
Partial-sum(k)：随着搜索 key k，依次加上左子树的和（因为左边的元素是小于的元素）。

15

Design a data structure that allows one to search, insert, and delete an integer X in O(1) time (i.e. , constant time, independent of the total number of integers stored). Assume that 1≤X≤n and that there are m+n units of space available, where m is the maximum number of integers that can be in the table at any one time. (Hint: use two arrays A[1..n] and B[1..m].) You are not allowed to initialize either A or B, as that would take O(m) or O(n) operations. This means the arrays are full of random garbage to begin with, so you must be very careful.

与Programming Pearls的 Column 课后题一样。

建立两个数组 A[1..n]，B[1..m]和一个表示元素个数的变量 k。

insert X: k = k + 1，A[X] = k, B[k] = X;
search X: return (A[X] <= k) && B[A[X]] == X;
delete X: 把 A[X]与末端 A[B[k]]交换，A[B[k]] = A[X], B[A[X]] = B[k]; k = k - 1;

18

What method would you use to look up a word in a dictionary?

Hash Table.

19

Imagine you have a closet full of shirts. What can you do to organize your shirts for easy retrieval?

以颜色排序，并二分搜索查找。

20

Write a function to find the middle node of a singly-linked list.

struct Node {
  int value;
  Node *next;
};

Node* FindMidNode(Node *head) {
  Node *p, *q;
  p = head;
  q = head;
  i = 0;
  while (p != NULL) {
    i++;
    p = p->next;
    if (i == 2) {
      q = q->next;
      i = 0;
    }
  }
  return q;
}

21

Write a function to compare whether two binary trees are identical. Identical trees have the same key value at each position and the same structure.

struct Node {
  int value;
  Node *left;
  Node *right;
};

bool CompareBinaryTree(Node *head_m, Node *head_n) {
  if (head_m == NULL && head_n == NULL) {
    return true;
  }
  if (head_m == NULL || head_n == NULL) {
    return false;
  }
  return (head_m->value == head_n->value) &&
      CompareBinaryTree(head_m->left, head_n->left) &&
      CompareBinaryTree(head_m->right, head_n->right);
}

22

Write a program to convert a binary search tree into a linked list

struct Node {
  int value;
  Node *next;
};

struct TNode {
  int value;
  TNode *left;
  TNode *right;
  TNode(int value_in) {
    value = value_in;
    left = NULL;
    right = NULL;
  }
};

void InsertToList(Node **head, int value) {
  Node *new_node = new Node;
  new_node->value = value;
  new_node->next = *head;
  *head = new_node;
}

void ConvertTreeToList(const TNode *root, Node **head) {
  if (root == NULL) {
    return;
  }
  ConvertTreeToList(root->right, head);
  InsertToList(head, root->value);
  ConvertTreeToList(root->left, head);
}

23

Implement an algorithm to reverse a linked list. Now do it without recursion.

void ReverseLinkedList(Node **head) {
  if (!head || *head == NULL) {
    return;
  }
  Node *prev, *p, *next;
  prev = *head;
  p = prev->next;
  prev->next = NULL;
  while (p != NULL) {
    next = p->next;
    p->next = prev;
    prev = p;
    p = next;
  }
  *head = prev;
}

24

What is the best data structure for maintaining URLs that have been visited by a Web crawler? Give an algorithm to test whether a given URL has already been visited, optimizing both space and time.

Hash Table.

26

Reverse the words in a sentence—i.e., “My name is Chris” becomes “Chris is name My.” Optimize for time and space.

reverse 每个单词;
reverse 整句。

void Reverse(char *begin, char *end) {
  char temp;
  while (begin < end) {
    temp = *begin;
    *begin = *end;
    *end = temp;
    begin++;
    end--;
  }
}

void ReverseWords(char *str) {
  char *word_begin;
  word_begin = NULL;
  char *p;
  p = str;
  while (*p != '\0') {
    if (word_begin == NULL && *p != ' ') {
      word_begin = p;
    }
    if (word_begin != NULL && (*(p+1) == ' ' || *(p+1) == '\0')) {
      Reverse(word_begin, p);
      word_begin = NULL;
    }
    ++p;
  }
  Reverse(str, p - 1);
}

27

Determine whether a linked list contains a loop as quickly as possible without using any extra storage. Also, identify the location of the loop.

利用两个指针，一个快指针和一个慢指针，快的每次都比慢的多前进一个节点，如果存在 loop,快的总会与慢的相重叠。

loop 的起始点：

当快的与慢的指针相重叠时，验证有 loop，之后慢的指针不动，通过快的指针计算出 loop 的长度
重新从链表头开始，快指针比慢指针先前进 loop 的长度距离，提增慢和快指针，直到第一次相遇，相遇点就是 loop 的起始点。

28

You have an unordered array X of n integers. Find the array M containing n elements where Mi is the product of all integers in X except for Xi. You may not use division. You can use extra memory. (Hint: There are solutions faster than O(n²).)

对数组 X 扫描 2 次计算出如下 2 组数组：

$$ \begin{align} P_{0} = 1; P_{k}=X_{k}P_{k-1}=\prod_{i=1}^{k}X_{i} \newline Q_{n+1} = 1; Q_{k}=X_{k}Q_{k+1}=\prod_{i=k}^{n}X_{i} \end{align} $$

所以得到 M：

$$ \begin{align} M_{i} = P_{i-1} Q_{i+1}, i\in[1,n] \end{align} $$

29

Give an algorithm for finding an ordered word pair (e.g., “New York”) occurring with the greatest frequency in a given webpage. Which data structures would you use? Optimize both time and space.

Hash Table.

The Last Thing D Needs总结

2014-06-01T16:24:47+08:00

Initialization
Type Deduction
Inheritance
Computational Complexity
APIs
Specifications
Essential and Accidental Complexity

Effective C++ 系列的作者 Scott Meyers 在 Dconf 中 The Last Thing D Needs 聊了些 C++的特性，稍微总结一下。

Initialization

int x1;         // unknown, initial(pay for it)
int x2;         // (at global scope) 0, no run time cost
static int x3;  // 0, static initialization
int *px = new int;  // heap memory, unknown, has run time cost
{
    int x4;    // unknown, has run time cost 
}
int a1[100];   // unknown
int a2[100];   // (at global scope) 0
static int a3[100];  // 0
std::vector<int> v(100);  // 0, use run time cost

Type Deduction

const int cx = 0;
auto my_cx1 = cx;          // int, new independent value
decltype(cx) my_cx2 = cx;  // const int, standard said

template<typname T>
void f1(T param);
f1(cx);                    // T's type, int, same rules with auto

template<typename T>
void f2(T& param);
f2(cx);                   //T's type, const int, reference a chunk of memory, preserve the const

template<typename T>
void f3(T&& param);
f3(cx);                  //T's type, const int&, perfect argument forwarding, a special rule

const int cx = 0;
auto lam= [cx] {cx = 10;};       //error!
class UpToTheCompiler {
private:
  ??? cx;                      //const int
  ...
};

const int cx = 0;
auto lam= [cx = cx] {cx = 10;};     //error!
class UpToTheCompiler {
private:
  ??? cx;                          //int (but acts like const int)
  ...
public:
  void operator() const
  {cx = 0;}
};

const int cx = 0;
auto lam1= [cx = cx] mutable {cx = 10;};     //error!
auto lam2= [cx = cx]() mutable {cx = 10;};     //correct
class UpToTheCompiler {
private:
  ??? cx;                          //int (but acts like int)
  ...
};

For

const int cx = 0;

type deduction for cx yields:

Context	Type
auto	int
decltype	const int
template(T parameter)	int
template(T& parameter)	const int
template(T&& parameter)	const int&
lambda (by-value capture)	const int
lambda (int capture)	int

Type Deduction

//all do the same thing
int x1 = 0;
int x2(0);
int x3 = {0};
int x4 {0};

auto x1 = 0;  // int
auto x2(0);   // int
auto x3 = {0};// initializer_list
auto x4 {0};  // initializer_list

template <typname T>
void f(T param);
f({0});       // error! "{0}" has no type

Inheritance

inheritance

class Base {
public:
  void doBaseWork();
};
class Derived : public Base {
public:
  void doDerivedWord() {
    doBaseWord();               //ok
  }
};

template <typename T>
class Base {
public:
  void doBaseWork();
};
template <typename T>
class Derived : public Base<T> {
public:
  void doDerivedWord() {
    doBaseWord();               //no compile, later specialized version
  }
};

template <>
class Base<int> ();  // no doBasework

Derived<int> d;
d.doDerivedWord();  // fail

In essence, the One Definition Rule states that the same entity should have the exact same definition throughout an application, otherwise the effects are undefined.

The fundamental problem is that the code that doesn’t see the specialized version of your class template member function might still compile, is likely to link, and sometimes might even run. This is because in the absence of (a forward declaration of) the explicit specialization, the non-specialized version kicks in, likely implementing a generic functionality that works for your specialized type as well.

Computational Complexity

computational Complexity

std::vector<int> v;
...
std::sort(v.begin(), v.end());   // O(nlogn)

std::list<int> li;
...
std::sort(li.begin(), li.end());  // not compile, list doesnot have random access iterator

auto it1 =
std::binary_search(v.begin(), v.end(), 10);  // O(logn)

auto it2 =
std::binary_search(li.begin(), li.end(), 10);  // O(n), officially(number of compares): O(logn)

APIs

example

std::set<int> si;
...
si.erase(14);    // eliminate all 14s from si

set –> erase
multiset –> erase
map –> erase
multimap –> erase
unordered_set –> erase
unordered_multiset –> erase
unordered_map –> erase
unordered_multimap –> erase
list –> remove
forward_list –> remove

Sorts can be stable or unstable. Which are guaranteed to be stable? * sort –> not guaranteed * stable_sort –> guaranteed * list::sort –> guaranteed

Specifications

Five sequence containers:

array –> No
deque –> Yes
forward_list –> No(fulfill of 1 of 16)
list –> Yes
vector – > Yes

Essential and Accidental Complexity

Essential Complexity: due to inherent design tensions.

Simplicity and regularity vs expressiveness.
Abstraction and portability vs efficiency.
New approaches vs compatibility with legacy systems.
Expressiveness vs ability to issue good diagnostics.

Essential Complexity

Point

struct Point {
  int x, y;
};

What is the type of Point::x?

Point

Point p;
const Point &cp = p;

What is the type of cp.x?

C++ soluction:

decltype(cp.x) = int
decltype((cp.x)) = const int&

inheritance

template <typename T>
class Base {
public:
  void doBaseWork();
};
template <typename T>
class Derived : public Base<T> {
public:
  void doDerivedWord() {
    doBaseWrd();               //okay?
  }
};

Assume typo and diagnose now?

Wrong if later specialization offers doBaseWrk.

Assume later specialization and defer lookup until instantiation?

If typo, imposes diagnostics for library errors on clients.

C++ solution:

Template author has control
- doBaseWrk() -> lookup name when parsing template.
- this->doBaseWrk() -> lookup name when instantiating template.

Accidental Complexity

ints are sometimes initialized to 0.
By-value lambda capture somtimes retains the constness of what’s captured.
mutable lambdas must declare a parameter list, but non-mutable lambdas don’t
Braced initializers (e.g.”{0}”) sometimes have a type.
Computation complexity guarantees usually meaningful.
Elimination all container elements with a given value usually means calling erase.
sort is sometimes stable.
Container “requirements” are sometimes required.

Algorithm Design Manual Chapter 2

2014-05-29T00:00:00+08:00

Book Notes

Our two most important tools are (1) the RAM model of computation and (2) the asymptotic analysis of worst-case complexity.

Exercises

1

What value is returned by the following function? Express your answer as a function of n. Give the worst-case running time using the Big Oh notation.

function mystery(n)
      r:=0
      for i:=1 to n-1 do
          for j:=i+1 to n do
              for k:=1 to j do
                  r:=r+1
       return(r)

$$ \begin{align} \sum_{i=1}^{n-1}\sum_{j=i+1}^{n}\sum_{k=1}^{j}1 = \sum_{i=1}^{n-1}\sum_{j=i+1}^{n}j = \sum_{i=1}^{n-1}(\sum_{j=1}^{n}j - \sum_{j=1}^{i}j) = \newline \sum_{i=1}^{n-1}(\frac{n(n+1)}{2} - \frac{i(i+1)}{2} = \frac{1}{2}\sum_{i=1}^{n-1}(n^2+n-i^2-i) = \newline \frac{1}{2}((n-1)n^2+(n-1)n-(\frac{n(n+1)(2n+1)}{6}-n^2)-(\frac{n(n+1)}{2}-n)) \end{align} $$

Time: O(n³)

2

What value is returned by the following function? Express your answer as a function of n. Give the worst-case running time using Big Oh notation.

function pesky(n)
      r:=0
      for i:=1 to n do
          for j:=1 to i do
              for k:=j to i+j do
                  r:=r+1
      return(r)

\begin{align} f(n) = \frac{n(n+1)(n+2)}{3} \end{align}

Time: O(n³).

3

What value is returned by the following function? Express your answer as a function of n. Give the worst-case running time using Big Oh notation.

function prestiferous(n)
       r:=0
       for i:=1 to n do
           for j:=1 to i do
               for k:=j to i+j do
                   for l:=1 to i+j-k do
                       r:=r+1
       return(r)

\begin{align} f(n) = \frac{n(n+1)(n+2)}{3} \end{align}

Time: O(n⁴).

19

\begin{align} (1/3)^n < 6 < loglogn < logn = lnn < (logn)^2 < n^\frac{1}{3}+logn < \sqrt{n} \newline < \frac{n}{logn} < n < nlogn < n^2 = n^2+logn < n^3 < n-n^3+7n^5 < (3/2)^2 \newline = 2^n < n! \end{align}

34

Assume that Christmas hasndays. Exactly how many presents did my “true love” send me? (Do some research if you do not understand this question.)

假设一共有ｎ天，每 i 天收到的礼物数是：

\begin{align} p_i = \sum_{k=1}^{i}k \end{align}

总的礼物数：

\begin{align} \sum_{i=1}^{n} p_i = \sum_{i=1}^{n}\sum_{k=1}^{i}k=\frac{n^3+3n^2+2n}{6} \end{align}

43

You are given a set S of n numbers. You must pick a subset S’ of k numbers from S such that the probability of each element of S occurring in S’ is equal (i.e., each is selected with probability k / n). You may make only one pass over the numbers. What if n is unknown?

Reservoir sampling issue.

44

We have 1,000 data items to store on 1,000 nodes. Each node can store copies of exactly three different items. Propose a replication scheme to minimize data loss as nodes fail. What is the expected number of data entries that get lost when three random nodes fail?

不考虑 RAID 的 XOR 做法这里。

1000 个数据做 3 份拷贝，如何做 3 份拷贝呢？

随机的 3 个点失败只损失一个数据

3 份拷贝以相邻一格的方式存储，如下

nodes:    1       2        3   ...   1000
copy1:   data1    data2    data3 ..  data1000
copy2:   data1000 data1    data2 ..  data999
copy3:   data999  data1000 data1 ..  data998

随机选取 3 个点的组合个数： $\binom{1000}{3}$ 其中丢失数据点的组合个数： 1000, 每种组合丢失一个数据所以丢失数据期望 $1000 / \binom{1000}{3} = 6.01804e-06$

随机的 3 个点失败损失 3 个数据

每 3 个点共享 3 个拷贝点，如下

nodes:    1       2        3   ...   1000
copy1:   data1    data2    data3 ..  data1000
copy2:   data3    data1    data2 ..  data999
copy3:   data2    data3    data1 ..  data998

随机选取 3 个点的组合个数： $\binom{1000}{3}$ 其中丢失数据点的组合个数： 1000/3, 每种组合丢失 3 个数据，所以丢失数据期望 $1000/3×3 / \binom{1000}{3} = 6.01804e-06$

45

Consider the following algorithm to find the minimum element in an array of numbers . One extra variable tmp is allocated to hold the current minimum value. Start from A[0]; “tmp” is compared against A[1], A[2], , A[N] in order. When A[i] < tmp, tmp = A[i]. What is the expected number of times that the assignment operation tmp = A[i] is performed?

期望的次数是第ｎ个元素是最小值的概率的总和。ｎ个元素平均分布，任意元素是最小值的概率是 1/ｎ。

E(n) = E(n-1) +1/n， E[1] = 1

\begin{align} E(n) = \sum_{i=1}^{n}\frac{1}{i} \approx lnn \end{align}

46

You have a 100-story building and a couple of marbles. You must identify the lowest floor for which a marble will break if you drop it from this floor. How fast can you find this floor if you are given an infinite supply of marbles? What if you have only two marbles?

无限个玻璃球，采用二分搜索法，celing(log100) = 7. 最快 7 次。
如果只有 2 个玻璃球

n 个球时在总楼层 r 中某个楼层 x 抛，两种情况： 1.破碎，剩下的总楼层 x-1 用剩下的 n-1 个球; 2.没破碎，剩下的总楼层 r-x 用 n 个球

如此把问题分解成小问题。如下代码求得最快的次数为 14。其中一条最坏情况： 9–>22–>34–>45–>55–>64–>72–>79–>85–>90–>94–>97–>99

/* Drop Marbles
   n: num of marbles
   r: num of floors
   drop_qeq: the drop sequence
   marble_drop: minimum number of trails needed to find the critical floor in
   worst case
   marble_drop[n][r] = 1 + min{max(marble_drop[n-1][x-1], marble[n][r-x]) :
   x in {1,2,...,r}}
*/
int DropMarbles(int n, int r, int **drop_seq) {
  int marble_drop[n+1][r+1];
  int i, j;
  for (j = 0; j <= r; ++j) {
    marble_drop[1][j] = j;
  }
  for (i = 0; i <= n; ++i) {
    marble_drop[i][1] = 1;
    marble_drop[i][0] = 0;
  }
  int min_sofar;
  for (i = 2; i <= n; ++i) {
    for (j = 2; j <= r; ++j) {
      marble_drop[i][j] = numeric_limits<int>::max();
      for (int x = 1; x <= j; ++x) {
        min_sofar = 1 + max(marble_drop[i-1][x-1], marble_drop[i][j-x]);
        if (min_sofar < marble_drop[i][j]) {
          marble_drop[i][j] = min_sofar;
          drop_seq[i][j] = x;
        }
      }
    }
  }
  return marble_drop[n][r];
}

47

You are given 10 bags of gold coins. Nine bags contain coins that each weigh 10 grams. One bag contains all false coins that weigh one gram less. You must identify this bag in just one weighing. You have a digital balance that reports the weight of what is placed on it.

一共 10 袋 bag1-10, 分别从 bag1 中取 1 个金币，bag2 中取 2 个金币……bag10 中取 10 个金币，称重总的重量 W。如果每个金币都是 10grams 的话，所以金币总重量是 550。N=550-W。得到缺失的重量，也是 bag 的号数，所以 bagN 中含有错误金币。

48

You have eight balls all of the same size. Seven of them weigh the same, and one of them weighs slightly more. How can you find the ball that is heavier by using a balance and only two weighings?

8==> 3,3,2

称重 3 和 3 两组
若不同，选出重的一组，3==> 1,1,1 称重 1 和 1,不同，那么重的就是，相同，另外个是。
若相同，2==>1,1，称重 1 和 1,重的就是

49

Suppose we start with n companies that eventually merge into one big company. How many different ways are there for them to merge?

1. 2 个公司(a,b)时，合并只有一种方法 [ab] 2. 当有 n 个公司时，如何把它用 n-1 个公司表示，f(n)=f(n-1)g(n) 3. n 个公司第一步从中选择两个公司合并，连带合并后的新公司一共 n-1 个公司，化简到 n-1 个公司表示。 4. n 个选 2 个的组合个数是： $\binom{1000}{2}=n(n-1)/2$

所以

f(n) = ∑_i=2ⁿ\frac{i(i-1)}{2} = \frac{n!(n-1)!}{2^n-2}

50

A Ramanujam number can be written two different ways as the sum of two cubes—i.e., there exist distinct a, b, c, and d such that a3 + b3 = c3 + d3. Generate all Ramanujam numbers where a,b,c,d < n.

#include 
using std::vector;

bool FindEqual(const vector<int> &num_cube, int low, int high, const int &sum,
               vector<int> *res) {
  if (low >= high) {
    return false;
  }
  int i, j;
  i = low;
  j = high;
  int add;
  while (i < j) {
    add = num_cube[i] + num_cube[j];
    if (add == sum) {
      res->push_back(i);
      res->push_back(j);
      return true;
    }
    if (add > sum) {
      --j;
    } else {
      ++i;
    }
  }
  return false;
}

void RamanujamNum(int n, vector<vector<int> > *res) {
  vector<int> num_cube(n);
  int i, j;
  for (i = 0; i < n; ++i) {
    num_cube[i] = i*i*i;
  }
  vector<int> ram_num;
  bool find;
  for (i = 0; i < n - 1; ++i) {
    for (j = i + 3; j < n; ++j) {
      find = FindEqual(num_cube, i+1, j-1, num_cube[i] + num_cube[j], &ram_num);
      if (find) {
        ram_num.push_back(i);
        ram_num.push_back(j);
        res->push_back(ram_num);
        ram_num.clear();
      }
    }
  }
}

51

Six pirates must divide $300 dollars among themselves. The division is to proceed as follows. The senior pirate proposes a way to divide the money. Then the pirates vote. If the senior pirate gets at least half the votes he wins, and that division remains. If he doesn’t, he is killed and then the next senior-most pirate gets a chance to do the division. Now you have to tell what will happen and why (i.e., how many pirates survive and how the division is done)? All the pirates are intelligent and the first priority is to stay alive and the next priority is to get as much money as possible.

从后往前推

2 个海盗，(1, 2) —> (0, 300)
3 个海盗，(1, 2, 3) —> (1, 0, 299)
4 个海盗，(1, 2, 3, 4) —> (0, 1, 0, 299)
5 个海盗，(1, 2, 3, 4, 5) —> (1, 0, 1, 0, 298)
6 个海盗，(1, 2, 3, 4, 5, 6) —> (0, 1, 0, 1, 298)

52

Reconsider the pirate problem above, where only one indivisible dollar is to be divided. Who gets the dollar and how many are killed?

2: (1, 2) —> (0, 1)
3: (1, 2, 3) —> （1, 0, 0)
4: (1, 2, 3, 4) —> (0, 0, 1, 0)
5: dead
6: (1, 2, 3, 4, 5, 6) —> (0, 0, 0, 1, 0, 0)

要至少一半的同意，间隔要有一半的人会死去才会同意之前那个人，所以之后每 2+2^K (K>=1)的海盗才能活。

Algorithm Design Manual Chapter 1

2014-05-26T00:00:00+08:00

Book Notes

Combinatorial Objects

Permutations - arrangements, or orderings, of items (“arrangement” “tour” “ordering” or “sequence” )
Subsets - selections from a set of items (“cluster” “collection” “committee” “group” “packaging” or “selection”)
Trees - hierarchical relationships between items (“hierarchy” “dominance relationship” “ancestor/descendant relationship” or “taxonomy”)
Graphs - relationships between arbitrary pairs of objects (“network” “circuit” “web” or “relationship”)
Points - locations in some geometric space (“sites” “positions” or “locations.)
Polygons - regions in some geometric spaces (“shapes” “regions” or “boundaries”)
Strings - sequences of characters or patterns. (“text” “characters” “patterns” or “labels”)

Recursive Objects

Big things that are made from smaller things of exactly the same type as the big thing. A decomposition rule describes how to get smaller things from big things.

As all combinatorial objects above are recursive objects here are a few possible decompositon rules for them:

Permutations - Deleting the first/last element of a permutation
Subsets - Deleting an element n if present
Trees - Deleting the root (results in a set of subtrees), deleting a leaf (a smaller tree)
Graphs - Deleting a vertex, dividing vertices to groups
Point - divide them to groups
Polygons - Inserting any internal chord between two nonadjacent vertices
Strings - Deleting a character (first or last)

Exercises

1-28.

Write a function to perform integer division without using either the / or * operators. Find a fast way to do it.

void DivideCore(int m, int n, int *quot, int *rem) {
  rem = m;
  quot = 0;
  while (rem >= n) {
    rem -= n;
    quot++;
  }
}

void Divide(int m, int n, int *quot, int *rem) {
  int mult_n = 0;
  int last_n;
  while (m % n == 0) {
    last_n = n;
    n = n + n;
    mult_n++;
  }
  DivideCore(m, n, quot, rem);
  for (int i = 0; i < mult_n; ++i) {
    quot = quot + quot;
  }
}

1-29.

There are 25 horses. At most, 5 horses can race together at a time. You must determine the fastest, second fastest, and third fastest horses. Find the minimum number of races in which this can be done.

7 次。

5 次：25 匹分成 5 组，比赛 5 次，得到前 5 名。
6 次：前 5 名比赛一次。因为只要得到前 3 名，这里剔除 5 名中的 2 名，剩下的 3 匹按比赛排名所在组为 G1，G2，G3。
7 次：G3 组只能去角逐第 3 名，派第一名 G31,G2 组只能去角逐第 2,3 名，派第一，二名，G21 和 G22。G1 组 G11 已经是第一名，去角逐第 2,3 名，派 G12,G13。最后 G12,G13，G21，G22 和 G31，得到第二，三名。

1-30.

How many piano tuners are there in the entire world?

需要把问题分解：1.世界有多少架钢琴;2.每位钢琴调音师能调多少台钢琴。

估算世界有多少架钢琴，需要知道：

世界的人口。
拥有钢琴的人口比例。
拥有钢琴的学校，教堂等场所的数量。

估算每位钢琴调音师能调多少台钢琴，需要知道：

每架钢琴平均多久需要调音一次。
对钢琴调音需要多长时间。
调音师的工作时间。
世界人口 70 亿，70×10⁹ 。
人口中弹奏乐器的人约占 10%（肯定大于 1%,小于 100%），其中最多 10%的人弹奏钢琴，而其中拥有钢琴的比例为 2%-3%，约人口总数 2×10^-3 。每 5000-10000 个人有一座教堂，每座教堂有一架钢琴，每 500-1000 个学生有一所学校，每所学校有一架钢琴，每人大约拥有 3×10^-3 架钢琴，所以钢琴数 70×10⁹ × 3×10^-3 = 2.1 * 10⁸ 。
钢琴调音的频率低于每月一次但多于 10 年一次，估计为一年一次。
调音所需时间多余 30 分钟，少于 1 天，估计为 2 小时。或钢琴有 88 个键，如果每个键花 1 分钟，需要 1.5 小时，若需 2 分钟，则需要 3 小时。
每天工作 8 小时，每周 5 天，每年工作 50 周，得出 8×5×50=2000 小时。2000 小时能调音大学 1000 架钢琴。
2.1 * 10⁸ / 1000 = 2.1 × 10⁵ 个调音师。

1-31.

How many gas stations are there in the United States?

分解问题成：

每天大约有多少辆汽车去加油。
每天一个加油站能给多少辆汽车加油。
美国人口总数约 300×10⁶ , 一家平均有 2 辆车左右，所以一共有车辆 150×10⁶ ，每辆汽车每 5 天加油一次，一天有 30×10⁶ 辆车去加油。
一个加油站平均每小时最少为 1 辆，最多 100 多辆汽车加油，取平均 20-30 辆每小时，一个加油站工作时间大概 14 小时（7am-9am），每个加油站每天平均大约为 280 辆车加油。
30×10⁶ / 280 = 1.07 × 10⁵ 个加油站。

1-32.

How much does the ice in a hockey rink weigh?

分解成：

冰的体积。
冰的密度。

做如下估算： 1.冰场的长度：70m; 2.冰场的宽度：30m; 3.冰的厚度：10cm=0.1; 4.冰的密度与水相当，估算 1000kg/m³ .

V = 70 * 30 * 0.1 = 210 m³ W = 210 *1000 = 210，000kg

1-33.

How many miles of road are there in the United States?

美国近似是一个矩形，高 1000mile 和长 3000mile。美国大部分地区是乡村，道路比较稀疏，平均下来可以把美国想成一个网状的道路结构，每隔 1mile 一条道路，最后如下网格，1000 条 3000mile 和 3000 条 1000mile 的路，总的 6,000,000mile 的路。

1-34.

On average, how many times would you have to flip open the Manhattan phone book at random in order to find a specific name?

假设电话本有 1000 页，也就是 500 个翻面。

简单答案：翻到正确页的概率是 1/500。

复杂点答案：上面没有考虑不断翻页，会翻到相同的页面。翻到错误页面的概率是 499/500，N 次后的错误概率是（499/500）^N ,所以 N 次后的正确页面概率是 P=1- （499/500）^N 。

那么： N=1 P = 0.002 N=2 P = 0.004 … N=1150 P = 0.89999

达到 90%的概率，所以需要 1150 翻页。

Programming Pearls Overview

2014-05-24T10:34:37+08:00

全书分为 3 部分：

预备知识：总的概括代码涉及的知识点，比如算法和数据结构的重要性和合理性，如何写出正确的代码并证明，如何测试代码，性能评估代码，debug 等。
性能：先是介绍一些估算的技巧，比如 72 法则，利特尔法则(Little’s law) 等，之后展开代码算法的性能，如何调试代码使得性能更好或更省空间。
产品：讲解具体的算法，如排序，搜索等。

书本和习题大部分代码实现。

Programming Pearls: Column14-15

2014-05-24T00:00:00+08:00

Book notes

Hash 存储 word 次数

typedef struct Node* Nodeptr;
struct Node {
  Node(string inword, int incount, Nodeptr innext) {
    word = inword;
    count = incount;
    next = innext;
  }
  string word;
  int count;
  Nodeptr next;
};

#define NHASH 29989
#define MULT 31
Nodeptr bin[NHASH];

unsigned int Hash(const string &str) {
  unsigned int h = 0;
  for (string::const_iterator it = str.begin(); it != str.end(); ++it) {
    h = MULT * h + *it;
  }
  return h % NHASH;
}

void InWord(const string &str) {
  Nodeptr p;
  int h;
  h = Hash(str);
  for (p = bin[h]; p != NULL; p = p->next) {
    if (str.compare(p->word) == 0) {
      (p->count)++;
      return;
    }
  }
  p = new Node(str, 1, bin[h]);
  bin[h] = p;
}

int main(int argc, char *argv[]) {
  string str;
  int i;
  for (i = 0; i < NHASH; ++i) {
    bin[i] = NULL;
  }
  while (cin >> str) {
    InWord(str);
  }
  for (i = 0; i < NHASH; ++i) {
    for (Nodeptr p = bin[i]; p != NULL; p = p->next) {
      cout << p->word << " " << p->count << endl;
    }
  }
  return 0;
}

Markov 产生随机词汇

利用指针指向不同单词的开头，并按照 K 个单词对比方式排序，利用二分搜索定位相同 K 长度的文本，并利用Reservoir sampling在不知道长度的情况下，均等的随机选取一个。

#define MAXINPUT 4000000
#define MAXWORDS 800000
#define K 2
char input_letters[MAXINPUT];
char *word[MAXWORDS];

int WordNcmp(const char *p, const char *q, int n) {
  while (*p == *q) {
    if (*p == 0 && --n == 0) {
      return 0;
    }
    ++p;
    ++q;
  }
  return *p - *q;
}


int SortCmp(const void *a, const void *b) {
  const char **p = (const char**)(a);
  const char **q = (const char**)(b);
  return WordNcmp(*p, *q, K);
}

char* SkipNword(char *p, int n) {
  for (; n > 0; p++) {
    if (*p == 0) {
      --n;
    }
  }
  return p;
}

int FindPhrase(char **word, int n, char *phrase) {
  int l = -1;
  int u = n;
  int m;
  while (l + 1 != u) {
    m = (l + u) / 2;
    if (WordNcmp(word[m], phrase, K) < 0) {
      l = m;
    } else {
      u = m;
    }
  }
  return u;
}

int main(int argc, char *argv[]) {
  int nword = 0;
  word[0] = input_letters;
  while (scanf("%s", word[nword]) != EOF) {
    word[nword + 1] = word[nword] + strlen(word[nword]) + 1;
    nword++;
    if (nword == MAXWORDS) {
      break;
    }
  }
  int i;
  for (i = 0; i < K; ++i) {
    word[nword][i] = 0;
  }
  for (i = 0; i < K; ++i) {
    printf("%s ", word[i]);
  }
  qsort(word, nword, sizeof(word[0]), SortCmp);
  char *phrase = input_letters;
  int printlen = 100;
  int find_index;
  char *p;
  for (; printlen > 0; --printlen) {
    int find_index = FindPhrase(word, nword, phrase);
    for (i = 0; WordNcmp(phrase, word[find_index + i], K) == 0; ++i) {
      if ((rand() % (i + 1)) == 0) {
        p = word[find_index + i];
      }
    }
    phrase = SkipNword(p, 1);
    if (strlen(SkipNword(phrase, K - 1)) == 0) {
      break;
    }
    printf("%s ", SkipNword(phrase, K - 1));
  }
  printf("\n");
  return 0;
}

Markov 利用 Hash 产生随机词汇

利用 Hash 表加快搜索相同 K 长度的文本。

#define MAXINPUT 4000000
#define MAXWORDS 800000
#define K 2
char input_letters[MAXINPUT];
char *word[MAXWORDS];

int WordNcmp(const char *p, const char *q, int n) {
  while (*p == *q) {
    if (*p == 0 && --n == 0) {
      return 0;
    }
    ++p;
    ++q;
  }
  return *p - *q;
}

char* SkipNword(char *p, int n) {
  for (; n > 0; p++) {
    if (*p == 0) {
      --n;
    }
  }
  return p;
}

#define NHASH 499979
#define MULT 31
int bin[NHASH];
int next[MAXWORDS];

unsigned int Hash(char *str) {
  unsigned int h = 0;
  char *p = str;
  for (int n = K; n > 0; p++) {
    h = MULT * h + (unsigned char)(*p);
    if (*p == 0) {
      --n;
    }
  }
  return h % NHASH;
}

void InitHash(char **word, int nword) {
  int i;
  for (i = 0; i < NHASH; ++i) {
    bin[i] = - 1;
  }
  for (i = 0; i < nword; ++i) {
    unsigned int h = Hash(word[i]);
    next[i] = bin[h];
    bin[h] = i;
  }
}

int main(int argc, char *argv[]) {
  int nword = 0;
  word[0] = input_letters;
  while (scanf("%s", word[nword]) != EOF) {
    word[nword + 1] = word[nword] + strlen(word[nword]) + 1;
    nword++;
    if (nword == MAXWORDS) {
      break;
    }
  }
  int i;
  for (i = 0; i < K; ++i) {
    word[nword][i] = 0;
  }
  InitHash(word, nword);
  for (i = 0; i < K; ++i) {
    printf("%s ", word[i]);
  }
  char *phrase = input_letters;
  int printlen = 100;
  char *p;
  for (; printlen > 0; --printlen) {
    i = 0;
    for (int j = bin[Hash(phrase)]; j >= 0; j = next[j]) {
      if (WordNcmp(word[j], phrase, K) == 0 && (rand() % (++i) == 0)) {
        p = word[j];
      }
    }
    phrase = SkipNword(p, 1);
    if (strlen(SkipNword(phrase, K - 1)) == 0) {
      break;
    }
    printf("%s ", SkipNword(phrase, K - 1));
  }
  printf("\n");
  return 0;
}

Problems

Column14-2

void SiftDown(int *x, int l, int u) {
  int i = l;
  int child;
  for (;;) {
    child = i * 2;
    if (child > u) {
      break;
    }
    if (child + 1 <= u) {
      if (x[child + 1] < x[child]) {
        child++;
      }
    }
    if (x[i] <= x[child]) {
      break;
    }
    swap(x[i], x[child]);
    i = child;
  }
}

void HeapSort(int *x, int n) {
  int i;
  for (i = n / 2; i >= 1; --i) {
    SiftDown(x, i, n);
  }
  for (i = n; i >= 2; --i) {
    swap(x[1], x[i]);
    SiftDown(x, 1, i - 1);
  }
}

Column15-8

找出最长重复超过 M 次的字符串。

经过排序后，越是相邻的越是相同的多，至少重复 M 次，就是计算相邻 M 个位置的字符所重复的字符长度，即 ComLen(pstr[i], pstr[i + kM])

int CmpPstr(const void *a, const void *b) {
  const char **p = (const char **)a;
  const char **q = (const char **)b;
  return strcmp(*p, *q);
}

int ComLen(char *p, char *q) {
  int i = 0;
  while (*p && (*p == *q)) {
    ++i;
    ++p;
    ++q;
  }
  return i;
}

#define kMaxN 500000
#define kM 1
char str[kMaxN];
char *pstr[kMaxN];
int main(int argc, char *argv[]) {
  int ch;
  int n = 0;
  while ((ch = getchar()) != EOF) {
    str[n] = ch;
    pstr[n] = &str[n];
    ++n;
  }
  str[n] = 0;
  qsort(pstr, n, sizeof(char *), CmpPstr);
  int maxlen = 0;
  int maxindex = 0;
  for (int i = 0; i < n - kM; ++i) {
    if (ComLen(pstr[i], pstr[i + kM]) > maxlen) {
      maxlen = ComLen(pstr[i], pstr[i + kM]);
      maxindex = i;
    }
  }
  printf("%.*s\n", maxlen, pstr[maxindex]);
  return 0;
}

Column15-9

找出两个文本中最长的共同字符串。

经典Longest common substring problem. 利用 Dynamic Programming 解决。复杂度 O(mn).

vector<string> LongestCommonString(const string &s, const string &t) {
  vector<vector<int> > array;
  int len_s = s.size();
  int len_t = t.size();
  int i, j;
  array.resize(len_s);
  for (i = 0; i < len_s; ++i) {
    array[i].resize(len_t);
  }
  int max_len = 0;
  vector<int> end_indexs;
  for (i = 0; i < len_s; ++i) {
    for (j = 0; j < len_t; ++j) {
      if (s[i] == t[j]) {
        if (i == 0 || j == 0) {
          array[i][j] = 1;
        } else {
          array[i][j] = array[i-1][j-1] + 1;
        }
        if (array[i][j] == max_len) {
          end_indexs.push_back(i);
        } else if (array[i][j] > max_len) {
          max_len = array[i][j];
          end_indexs.clear();
          end_indexs.push_back(i);
        }
      }
    }
  }
  vector<string> res;
  for (vector<int>::iterator it = end_indexs.begin(); it != end_indexs.end();
       ++it) {
    res.push_back(s.substr(*it - max_len + 1, max_len));
  }
  return res;
}

Column15-11

产生单词层次的 Markov 文本。

int main(int argc, char *argv[]) {
  const int kMax = 50000;
  const int kK = 5;
  const int kPrintlen = 1000;
  char str[kMax];
  int c, n;
  n = 0;
  while ((c = getchar()) != EOF) {
    str[n++] = c;
  }
  str[n] = 0;
  char *p, *q, *next_p;
  p = str;
  int i, eq_sofar, j;
  for (i = 0; i < kK; ++i) {
    printf("%c", str[i]);
  }
  for (i = 0; i < kPrintlen; ++i) {
    eq_sofar = 0;
    for (q = str; q < str + n - kK + 1; ++q) {
      for (j = 0; j < kK && *(p + j) == *(q + j); ++j) {
      }
      if (j == kK) {
        eq_sofar++;
        if (rand() % eq_sofar == 0) {
          next_p = q;
        }
      }
    }
    c = *(next_p + kK);
    if (c == 0) {
      break;
    }
    putchar(c);
    p = next_p + 1;
  }
  return 0;
}

Programming Pearls: Column11-12

2014-05-18T00:00:00+08:00

Book notes

QuickSort

void swap(int *array, int m, int n) {
  int temp;
  temp = array[m];
  array[m] = array[n];
  array[n] = temp;
}

int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}

void isort(int *array, int n) {
  int i, j, t;
  for (i = 1; i < n; ++i) {
    t = array[i];
    for (j = i; j >= 0  && array[j - 1] < t; --j) {
      array[j] = array[j - 1];
    }
    array[j - 1] = t;
  }
}

void qsort1(int *array, int l, int u) {
  /*use array[l] for the mid element */
  if (l >= u) {
    return;
  }
  int m;
  m = l;
  for (int i = l + 1; i <= u; ++i) {
    if (array[i] < array[l]) {
      swap(array, ++m, i);
    }
  }
  swap(array, l, m);
  qsort1(array, l, m - 1);
  qsort1(array, m + 1, u);
}

void qsort2(int *array, int l, int u) {
  /*use array[l] for the mid element, from back to start,
    always swap the first element */
  if (l >= u) {
    return;
  }
  int i, m;
  i = m = u + 1;
  do {
    do {
      --i;
    } while (array[i] < array[l]);
    swap(array, --m, i);
  } while (i > l);
  qsort2(array, l, m - 1);
  qsort2(array, m + 1, u);
}

void qsort3(int *array, int l, int u) {
  /* two-way partition, use array[l] for the mid element */
  if (l >= u) {
    return;
  }
  int t, i , j;
  t = array[l];
  i = l;
  j = u + 1;
  for (;;) {
    do {
      ++i;
    } while (i <= u && array[i] < t);
    do {
      --j;
    } while (array[j] > t);
    if (i > j) {
      break;
    }
    swap(array, i, j);
  }
  swap(array, l, j);
  qsort3(array, l, j - 1);
  qsort3(array, j + 1, u);
}

const int kCutOff = 50;
void qsort4(int *array, int l, int u) {
  /* qsort3 + randomization + isort small subarrays + swap inline */
  if (u - l < kCutOff) {
    return;
  }
  int t, i , j;
  swap(array, l, randint(l, u));
  t = array[l];
  i = l;
  j = u + 1;
  for (;;) {
    do {
      ++i;
    } while (i <= u && array[i] < t);
    do {
      --j;
    } while (array[j] > t);
    if (i > j) {
      break;
    }
    swap(array, i, j);
  }
  swap(array, l, j);
  qsort3(array, l, j - 1);
  qsort3(array, j + 1, u);
}

生成随机数

从 n 中生成不重复的 m 个随机数。

1

void GenerateSortedRand(int m, int n) {
  int select = m;
  int remaining = n;
  for (int i = 0; i < n && select > 0; ++i) {
    if (rand() % remaining < select) {
      cout << i << " ";
      --select;
    }
    --remaining;
  }
  cout << endl;
}

2

void GenKnuth(int m, int n) {
  for (int i = 0; i < n && m > 0; ++i) {
    if (rand() % (n - i) < m) {
      cout << i << " ";
      --m;
    }
  }
  cout << endl;
}

3

void GenSets(int m, int n) {
  set<int> num_set;
  while (num_set.size() < m) {
    num_set.insert(rand() % n);
  }
  for (set<int>::iterator it = num_set.begin(); it != num_set.end(); ++it) {
    cout << *it << " ";
  }
  cout << endl;
}

4

int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}

int compare(const void *a, const void *b) {
  return (*static_cast<const int*>(a) - *static_cast<const int*>(b));
}

void GenShuf(int m, int n) {
  int *x = new int[n];
  int i = 0;
  for (i = 0; i < n; ++i) {
    x[i] = i;
  }
  for (i = 0; i < m; ++i) {
    int j = randint(i, n - 1);
    int t = x[j];
    x[j] = x[i];
    x[i] = t;
  }
  qsort(x, m, sizeof(int), compare);
  for (i = 0; i < m; ++i) {
    cout << x[i] << " ";
  }
  cout << endl;
  delete x;
}

原则

理解问题。与用户讨论提出问题的有关场景。问题的陈述中往往包含问题的想法，和所有早期的想法一样，它们应该被考虑而不是与其他排斥。
指出一个抽象问题。一个清晰，整洁的问题陈述不旦帮助我们解决这个问题，并且能体现如何把这个解答应用到其他的问题上。
探索设计空间。不要急于立刻去解决问题，思考一分钟，花一天时间编程。应该思考一小时，编程一小时。使用通俗的上层语言帮助我们描述设计：伪代码描述控制六，抽象化表示关键数据结构的数据类型。
实现一种解答。我们应该追求以直接清晰的代码来实现选择的设计，使用最强大的能用的操作。
回顾。Polya 的How to Solve It能帮助任何程序员成为更好的问题解决者。在 15 页他说：”基本存在一些东西去做，随着足够的学习和突破，我们能改善每个解答，并且在任何情况下，我们都能经常改善我们对解答的理解。“

Problems

Column11-9

在数组 n 中以算法复杂度 O(n)找出第 k 个小的元素。

void swap(int *array, int m, int n) {
  int temp;
  temp = array[m];
  array[m] = array[n];
  array[n] = temp;
}

int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}

void SelectK(int *array, int l, int u, int k) {
  if (l >= u) {
    return;
  }
  int t, i, j;
  swap(array, l, randint(l, u));
  t = array[l];
  i = l;
  j = u + 1;
  for (;;) {
    do {
      ++i;
    } while (i <= u && array[i] < t);
    do {
      --j;
    } while (array[j] > t);
    if (i > j) {
      break;
    }
    swap(array, i, j);
  }
  swap(array, l, j);
  if (j < k) {
    SelectK(array, j + 1, u, k);
  }
  else if (j > k) {
    SelectK(array, l, j - 1, k);
  }
}

1

int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}

int bigrand() {
  return RAND_MAX * rand() + rand();
}

2

int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}

void GenerateM(int m, int n) {
  int i, t;
  i = randint(0, n - 1);
  for(int j = 0; j < m; ++j) {
    t = i + j;
    if (t >= n) {
      t -= n;
    }
    cout << t << " " << endl;
  }
  cout << endl;
}

8

0..n-1 中生成 m 个随机数。

int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}

int compare(const void *a, const void *b) {
  return (*static_cast<const int*>(a) - *static_cast<const int*>(b));
}

void GenShuf(int m, int n) {
  int *x = new int[n];
  int i = 0;
  for (i = 0; i < n; ++i) {
    x[i] = i;
  }
  for (i = 0; i < m; ++i) {
    int j = randint(i, n - 1);
    int t = x[j];
    x[j] = x[i];
    x[i] = t;
  }

  for (i = 0; i < m; ++i) {
    cout << x[i] << " ";
  }
  cout << endl;
  delete x;
}

如果允许有重复的数，如何生成排序的 m 个随机数。

void GenSets(int m, int n) {
  multiset<int> num_set;
  while (num_set.size() < m) {
    num_set.insert(rand() % n);
  }
  for (multiset<int>::iterator it = num_set.begin(); it != num_set.end();
       ++it) {
    cout << *it << " ";
  }
  cout << endl;
}

如果可以重复并顺序随机。

void GenM(int m, int n) {
  for (int i = 0; i < m; ++i) {
    cout << randint(0, n - 1) << " ";
  }
  cout << endl;
}

9

int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}

void GenSets(int m, int n) {
  set<int> num_set;
  int t;
  for (int i = n - m ; i < n; ++i) {
    t = randint(0, i);
    if (num_set.find(t) == num_set.end()) {
      num_set.insert(t);
    } else {
      num_set.insert(i);
    }
  }
  for (set<int>::iterator it = num_set.begin(); it != num_set.end(); ++it) {
    cout << *it << " ";
  }
  cout << endl;
}

10

int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}

int Select() {
  int res;
  int i = 0;
  res = object[i];
  ++i;
  while (IsEnd(object[i])) {
      int j = randint(0, i);
      if (j < 1) {
        res = object[i];
      }
      ++i;
  }
  return res;
}

More: 选 k 个

12

生成 N>1e6 组的 m 个随机数，计算生成每个随机数出现的概率，是不是符合预期，还是偏差很大而不是随机的。

Programming Pearls: Column8

2014-05-17T00:00:00+08:00

Book notes

问题与算法

给出数组中找出连续子数组最大和。

1

直接算每个子区间的和并比较得出最大值。算法复杂度 O(n³)。

float FindMaxSubvectorAlg1(const vector<float> &num) {
  int i, j, k;
  float sum, max_sofar;
  max_sofar = 0;
  for (i = 0; i < num.size(); ++i) {
    for (j = 0; j < num.size(); ++j) {
      sum = 0;
      for (k = i; k <= j; k++) {
        sum += num[k];
        if (sum > max_sofar) {
          max_sofar = sum;
        }
      }
    }
  }
  return max_sofar;
}

2

2.1

因为 x[i..j]直接的和可以基于 x[i..j-1]的和算出，不用重头开始算。算法复杂度 O(n²)。

float FindMaxSubvectorAlg2(const vector<float> &num) {
  int i, j;
  float sum, max_sofar;
  max_sofar = 0;
  for (i = 0; i < num.size(); ++i) {
    sum = 0;
    for (j = i; j < num.size(); ++j) {
      sum += num[j];
      if (sum > max_sofar) {
        max_sofar = sum;
      }
    }
  }
  return max_sofar;
}

2.2

先算出 x[0..i]区间的和为 cum_vector[i] ，那么 x[i..j]区间的和就是 cum_vector[j] - cum-vector[i-1]

float FindMaxSubvectorAlg2b(const vector<float> &num) {
  vector<float> cum_vector(num.size() + 1);
  int i, j;
  cum_vector[0] = 0;
  for (i = 1; i < cum_vector.size(); ++i) {
    cum_vector[i] = cum_vector[i - 1] + num[i];
  }
  float sum, max_sofar;
  max_sofar = 0;
  for (i = 1; i < cum_vector.size(); ++i) {
    for (j = i; j < cum_vector.size(); ++j) {
      sum = cum_vector[j] - cum_vector[i - 1];
      if (sum > max_sofar) {
        max_sofar = sum;
      }
    }
  }
  return max_sofar;
}

3

Divide-and-Conquer 算法。

求整个数组的子数组和，可以分成前面一半和后面一半

求出前半部分的最大子数组和后半部分的最大子数组和

求出两部分中间连着的子数组最大和

最后比较这 3 部分和就能得出整个个数组的子数组最大和

float FindMaxSubvectorAlg3Core(const vector<float> &num, int l, int u) {
  if (l > u) {
    return 0;
  }
  if (l == u) {
    return max<float>(num[l], 0);
  }
  int m;
  m = (l + u) / 2;
  float lmax, rmax, sum;
  lmax = sum = 0;
  for (int i = m; i >= l; --i) {
    sum += num[i];
    if (sum > lmax) {
      lmax = sum;
    }
  }
  rmax = sum = 0;
  for (int i = m + 1; i <= u; ++i) {
    sum += num[i];
    if (sum > rmax) {
      rmax = sum;
    }
  }
  return max(lmax + rmax, max(FindMaxSubvectorAlg3Core(num, l, m),
                              FindMaxSubvectorAlg3Core(num, m + 1, u)));
}

float FindMaxSubvectorAlg3(const vector<float> &num) {
  return FindMaxSubvectorAlg3Core(num, 0, num.size() - 1);
}

4

假定已经解决了 x[0..i-1]的情况，那么如何扩展到 x[0..i]的情况，只多了 x[i] 元素？

解决了 x[0..i-1]的情况，有这区间的最大子数组和 max_sofar ，和必须以 x[i-1]结尾的子数组最大和;
到 x[0..i]的情况，就要把必须以 x[i-1]结尾的子数组最大和与 x[i]相加，如果以 x[i-1]结尾的子数组为负数的话，加了反而减少总和。所以此种情况以 x[i]的和就是 x[i];
最后把以 x[i]与在区间 x[0..i-1]的最大子数组和 max_sofar 比较，就能解决 x[0..i]的情况;
如此一直扩展到 x[0..n]算出整个数组的最大子数组和。

只扫描一遍，算法复杂度 O(n)。

float FindMaxSubvectorAlg4(const vector<float> &num) {
  float max_sofar, max_ending_here;
  max_sofar = max_ending_here = 0;
  for (int i = 0; i < num.size(); ++i) {
    max_ending_here += num[i];
    if (max_ending_here < 0) {
      max_ending_here = 0;
    }
    if (max_ending_here > max_sofar) {
      max_sofar = max_ending_here;
    }
  }
  return max_sofar;
}

算法设计技巧

保存状态防止重复计算。
预处理信息到适当的数据结构中来加快之后的计算。比如先建立堆，先排序等。
分而治之，把大问题分成类似的小问题解决。
扫描算法。比如解出了 x[0..i-1]如何扩展到 x[0..i].
累积。
确定问题的算法复杂度下界。

Problems

10

初始化累积和数组 cum，使得 cum[i]=x[0]+x[1]...x[i] ，那么要 x[l..u] 区间的和为 0 的话，cum[l-1] = cum[u]
排序 cum 数组;
扫描排序好的数组 cum，找出最相近的相邻数组元素即得到结果。

算法复杂度 O(n) + O(nlogn) + O(n-1) = O(nlogn).

找出子数组和与一个特定值 r 最相近，算法类似，只是 step3 找出与 r 最相近的相邻数组元素。

11

累积收费和数组 cum，使得 cum[i]=x[0]+x[1]...x[i]
计算 l 和 u 关卡之间的收费 cum[u]-cum[l]

(learn&think)

C++11 Memory Model and Atomic

Table of Contents

C++11 Atomic1

atomic 类

atomic 类型的操作函数

atomic_flag 类

内存序列同步相关操作

Memory Model and Order

More

Footnotes:

Make Colorful Equations With Mathjax

加载 Mathjax Color extension

使用

浅谈C++11 Multithreading Programming

Overview

Compile

Threads

概要

thread::id 类

thread 类

Constructs a thread object

joinable

get_id

native_handle

hardware_concurrency (static)

swap

管理当前 thread 的函数

Mutual exclusion

概要

Lockable types

BasicLockable

Lockable

TimedLockable

mutex 类

recursive_mutex 类

timed_mutex 类

recursive_timed_mutex 类

Mutex Exception safety

lock_guard 类

unique_lock 类

class

Constructor

实例

lock_guard VS unique_lock

try_lock 和 lock

call_once

Condition variables

概要

cv_status

notify_all_at_thread_exit

condition_variable 类

condition_variable_any 类

condition_variable VS condition_variable_any

Future

概要

Error handling

future_error 类

future_errc

future_status

future_category

template promise

class

set_value and set_value_at_thread_exit

set_exception and set_exception_at_thread_exit

template packaged_task

class

construct and use

reset

template future 类

template shared_future 类

template async 函数

Launching policy for async

async

Header synopsis

其他资料

Books

Online resources

浅谈C++ Multithreading Programming

Overview

C++11 Atomic¹

`atomic` 类

`atomic` 类型的操作函数

`atomic_flag` 类

`thread::id` 类

`thread` 类

`get_id`

`native_handle`

`hardware_concurrency` (static)

`mutex` 类

`recursive_mutex` 类

`timed_mutex` 类

`recursive_timed_mutex` 类

`lock_guard` 类

`unique_lock` 类

`lock_guard` VS `unique_lock`

`try_lock` 和 `lock`

`call_once`

`cv_status`

`notify_all_at_thread_exit`

`condition_variable` 类

`condition_variable_any` 类

`condition_variable` VS `condition_variable_any`

`future_error` 类

`future_errc`

`future_status`

`future_category`

`template promise`

`set_value` and `set_value_at_thread_exit`

`set_exception` and `set_exception_at_thread_exit`

`template packaged_task`

`template future` 类

`template shared_future` 类

Thread 设计模式¹

Performance Analysis Tools²