C++11 Atomic 可简单分为 4 部分:
atomic
类
atomic
类型的操作函数
atomic_flag
类
atomic
类主要分为四种模板类:
std::atomic
template< class T > struct atomic;
template<> struct atomic<Integral>;
template<> struct atomic<bool>;
template< class T > struct atomic<T*>;
bool 和 integral 类型:
std::atomic_bool std::atomic<bool> std::atomic_char std::atomic<char> std::atomic_schar std::atomic<signed char> std::atomic_uchar std::atomic<unsigned char> std::atomic_short std::atomic<short> std::atomic_ushort std::atomic<unsigned short> std::atomic_int std::atomic<int> std::atomic_uint std::atomic<unsigned int> std::atomic_long std::atomic<long> std::atomic_ulong std::atomic<unsigned long> std::atomic_llong std::atomic<long long> std::atomic_ullong std::atomic<unsigned long long> std::atomic_char16_t std::atomic<char16_t> std::atomic_char32_t std::atomic<char32_t> std::atomic_wchar_t std::atomic<wchar_t> std::atomic_int8_t std::atomic<std::int8_t> std::atomic_uint8_t std::atomic<std::uint8_t> std::atomic_int16_t std::atomic<std::int16_t> std::atomic_uint16_t std::atomic<std::uint16_t> std::atomic_int32_t std::atomic<std::int32_t> std::atomic_uint32_t std::atomic<std::uint32_t> std::atomic_int64_t std::atomic<std::int64_t> std::atomic_uint64_t std::atomic<std::uint64_t> std::atomic_int_least8_t std::atomic<std::int_least8_t> std::atomic_uint_least8_t std::atomic<std::uint_least8_t> std::atomic_int_least16_t std::atomic<std::int_least16_t> std::atomic_uint_least16_t std::atomic<std::uint_least16_t> std::atomic_int_least32_t std::atomic<std::int_least32_t> std::atomic_uint_least32_t std::atomic<std::uint_least32_t> std::atomic_int_least64_t std::atomic<std::int_least64_t> std::atomic_uint_least64_t std::atomic<std::uint_least64_t> std::atomic_int_fast8_t std::atomic<std::int_fast8_t> std::atomic_uint_fast8_t std::atomic<std::uint_fast8_t> std::atomic_int_fast16_t std::atomic<std::int_fast16_t> std::atomic_uint_fast16_t std::atomic<std::uint_fast16_t> std::atomic_int_fast32_t std::atomic<std::int_fast32_t> std::atomic_uint_fast32_t std::atomic<std::uint_fast32_t> std::atomic_int_fast64_t std::atomic<std::int_fast64_t> std::atomic_uint_fast64_t std::atomic<std::uint_fast64_t> std::atomic_intptr_t std::atomic<std::intptr_t> std::atomic_uintptr_t 
std::atomic<std::uintptr_t> std::atomic_size_t std::atomic<std::size_t> std::atomic_ptrdiff_t std::atomic<std::ptrdiff_t> std::atomic_intmax_t std::atomic<std::intmax_t> std::atomic_uintmax_t std::atomic<std::uintmax_t>
基本模板类定义:
template < class T > struct atomic { bool is_lock_free() const volatile; bool is_lock_free() const; void store(T, memory_order = memory_order_seq_cst) volatile; void store(T, memory_order = memory_order_seq_cst); T load(memory_order = memory_order_seq_cst) const volatile; T load(memory_order = memory_order_seq_cst) const; operator T() const volatile; operator T() const; T exchange(T, memory_order = memory_order_seq_cst) volatile; T exchange(T, memory_order = memory_order_seq_cst); bool compare_exchange_weak(T &, T, memory_order, memory_order) volatile; bool compare_exchange_weak(T &, T, memory_order, memory_order); bool compare_exchange_strong(T &, T, memory_order, memory_order) volatile; bool compare_exchange_strong(T &, T, memory_order, memory_order); bool compare_exchange_weak(T &, T, memory_order = memory_order_seq_cst) volatile; bool compare_exchange_weak(T &, T, memory_order = memory_order_seq_cst); bool compare_exchange_strong(T &, T, memory_order = memory_order_seq_cst) volatile; bool compare_exchange_strong(T &, T, memory_order = memory_order_seq_cst); atomic() = default; constexpr atomic(T); atomic(const atomic &) = delete; atomic & operator=(const atomic &) = delete; atomic & operator=(const atomic &) volatile = delete; T operator=(T) volatile; T operator=(T); };
Integral 特有的函数:
integral fetch_add(integral, memory_order = memory_order_seq_cst) volatile; integral fetch_add(integral, memory_order = memory_order_seq_cst); integral fetch_sub(integral, memory_order = memory_order_seq_cst) volatile; integral fetch_sub(integral, memory_order = memory_order_seq_cst); integral fetch_and(integral, memory_order = memory_order_seq_cst) volatile; integral fetch_and(integral, memory_order = memory_order_seq_cst); integral fetch_or(integral, memory_order = memory_order_seq_cst) volatile; integral fetch_or(integral, memory_order = memory_order_seq_cst); integral fetch_xor(integral, memory_order = memory_order_seq_cst) volatile; integral fetch_xor(integral, memory_order = memory_order_seq_cst); integral operator++(int) volatile; integral operator++(int); integral operator--(int) volatile; integral operator--(int); integral operator++() volatile; integral operator++(); integral operator--() volatile; integral operator--(); integral operator+=(integral) volatile; integral operator+=(integral); integral operator-=(integral) volatile; integral operator-=(integral); integral operator&=(integral) volatile; integral operator&=(integral); integral operator|=(integral) volatile; integral operator|=(integral); integral operator^=(integral) volatile; integral operator^=(integral);
指针特有的函数
T* fetch_add(ptrdiff_t, memory_order = memory_order_seq_cst) volatile; T* fetch_add(ptrdiff_t, memory_order = memory_order_seq_cst); T* fetch_sub(ptrdiff_t, memory_order = memory_order_seq_cst) volatile; T* fetch_sub(ptrdiff_t, memory_order = memory_order_seq_cst); T* operator=(T*) volatile; T* operator=(T*); T* operator++(int) volatile; T* operator++(int); T* operator--(int) volatile; T* operator--(int); T* operator++() volatile; T* operator++(); T* operator--() volatile; T* operator--(); T* operator+=(ptrdiff_t) volatile; T* operator+=(ptrdiff_t); T* operator-=(ptrdiff_t) volatile; T* operator-=(ptrdiff_t);
atomic
类型的操作函数
除了 atomic
类的成员函数,也提供了对其操作的函数:
atomic_is_lock_free
: checks if the atomic type’s operations are
lock-free
atomic_store
and atomic_store_explicit
: atomically replaces the
value of the atomic object with a non-atomic argument
atomic_load
and atomic_load_explicit
: atomically obtains the
value stored in an atomic object
atomic_exchange
and atomic_exchange_explicit
: atomically
replaces the value of the atomic object with non-atomic argument and
returns the old value of the atomic
atomic_compare_exchange_weak
atomic_compare_exchange_weak_explicit
atomic_compare_exchange_strong
atomic_compare_exchange_strong_explicit
: atomically compares the
value of the atomic object with non-atomic argument and performs
atomic exchange if equal or atomic load if not
atomic_fetch_add
atomic_fetch_add_explicit
: adds a non-atomic value to an atomic
object and obtains the previous value of the atomic
atomic_fetch_sub
atomic_fetch_sub_explicit
: subtracts a non-atomic value from an
atomic object and obtains the previous value of the atomic
atomic_fetch_and
atomic_fetch_and_explicit
: replaces the atomic object with the
result of logical AND with a non-atomic argument and obtains the
previous value of the atomic
atomic_fetch_or
atomic_fetch_or_explicit
: replaces the atomic object with the
result of logical OR with a non-atomic argument and obtains the
previous value of the atomic
atomic_fetch_xor
atomic_fetch_xor_explicit
: replaces the atomic object with the
result of logical XOR with a non-atomic argument and obtains the
previous value of the atomic
atomic_flag
类
atomic_flag
是一种原子布尔类型,不同于 std::atomic<bool>
, 不提供 load
或 store 操作,只支持两种操作, test_and_set
和 clear
。
atomic_flag() noexcept = default; atomic_flag(const atomic_flag&) = delete;
std::atomic_flag
只有默认构造函数,拷贝构造函数已被禁用. 一般使用
ATOMIC_FLAG_INIT
初始化为 clear 状态.
memory_order
: defines memory ordering constraints for the given
atomic operation
enum memory_order { memory_order_relaxed, memory_order_consume, memory_order_acquire, memory_order_release, memory_order_acq_rel, memory_order_seq_cst };
kill_dependency
: removes the specified object from the
std::memory_order_consume
dependency tree
atomic_thread_fence
: Establishes memory synchronization ordering
of non-atomic and relaxed atomic accesses, as instructed by order,
without an associated atomic operation.
atomic_signal_fence
: Establishes memory synchronization ordering
of non-atomic and relaxed atomic accesses, as instructed by order,
between a thread and a signal handler executed on the same thread.
This is equivalent to std::atomic_thread_fence, except no CPU
instructions for memory ordering are issued. Only reordering of the
instructions by the compiler is suppressed as order instructs.
在浅谈 Memory Reordering 中提及编译器开发者和处理器制造商遵循的中心内存排序准则是: 不能改变单线程程序的行为. 从而产生了:
在多核多线程时代,当多线程共享某一变量时,不同线程对共享变量的读写就应该格外小心,不适当的乱序执行可能导致程序运行错误。所以必须对编译器和 CPU 作出一定的约束才能合理正确地优化你的程序,这个约束就是 内存模型 (Memory Model) .
或者说,程序转化成机器指令执行时并不按照之前的原始代码顺序执行,所以内存模型是程序员、编译器,CPU 之间的准则约束,遵守这一准则约束后,大家各自做优化, 从而尽可能提高程序的性能。
wiki 上的 Memory model给出一个比较抽象的描述: In computing, a memory model describes the interactions of threads through memory and their shared use of the data.
C++11 中规定了 6 种访存次序(Memory Order),如下:
enum memory_order { memory_order_relaxed, memory_order_consume, memory_order_acquire, memory_order_release, memory_order_acq_rel, memory_order_seq_cst };
上面 C++11 Atomic 涉及 memory_order
的接口, 默认值是
std::memory_order_seq_cst
.
可以把上述 6 种访存次序(内存序)分为 3 类,顺序一致性模型
(memory_order_seq_cst
),Acquire-Release 模型
(memory_order_consume
, memory_order_acquire
,
memory_order_release
, memory_order_acq_rel
) 和 Relax 模型
(memory_order_relaxed
).
memory_order_relaxed
: all reorderings are okay
memory_order_acquire
: guarantees that subsequent loads are not
moved before the current load or any preceding loads.
memory_order_release
: preceding stores are not moved past the
current store or any subsequent stores.
memory_order_acq_rel
: combines the two previous guarantees.
memory_order_consume
: potentially weaker form of
memory_order_acquire that enforces ordering of the current load
before other operations that are data-dependent on it (for instance,
when a load of a pointer is marked memory_order_consume, subsequent
operations that dereference this pointer won’t be moved before it
(yes, even that is not guaranteed on all platforms!).
memory_order_seq_cst
: 是 memory_order_acq_rel
的加强版,除了有
acq_rel
语义,还保证是 sequentially-consistent.
其中, $\mu$ 是分布的均值或期望值, 而 $\sigma$ 是它的标准差, $\sigma^2$ 则是方差.
1 2 3 4 5 6 7 8 9 10 11 12 |
|
使用 \definecolor
和 \color
来为公式添加颜色如下:
1 2 3 4 5 6 7 8 9 10 11 |
|
MathJax 的 \definecolor
不支持 HTML
的颜色颜色空间,所以手动在它们之间转换颜色,文字部分如下:
1 2 3 |
|
上一篇浅谈 C++ Multithreading Programming主要介绍时下规范好的 C++使用 Pthread 库和 Boost Thread 库实现 C++多线程编程.这里主要谈谈正在规范的 C++11 引入的 Thread 库和 Atomic 库,终于自带的 C++库能支持高效并可移植的 Multithreading 编程.分为 2 篇,这里先谈谈 C++11 的Thread 的库 (并包含对 C 的支持), 后一篇谈谈 C++11 的Atomic 操作的库.
C++11(之前被称为 C++0x)是编程语言 C++最新版本的标准.它由 ISO 在 2011 年 8 月 12 日被批准替代 C++03. C++11 标准正在规范中,从ISO 页面 可以知道如何获得进行中的草稿:
所以本文:
更多有关 C++参考最后的其他资料.
GCC 编译支持 C++11,使用编译选项 -std=c++11
或 -std=gnu++11
, 前者关闭
GNU 扩张支持.并加上 -pthread
选项.
g++ program.o -o program -std=c++11 -pthread
如果漏掉 -pthread
选项,编译能通过,但运行时出现如下错误:
terminate called after throwing an instance of 'std::system_error'
what(): Enable multithreading to use std::thread: Operation not permitted
<thread>
概要
头文件是 <thread>
, 分为两部分: thread
类和在 namespace
this_thread
用来管理当前 thread 的函数.具体见之后的Header <thread> synopsis.
thread::id
类
thread::id
类型的对象为每个执行的线程提供唯一的标识,并为所有并不表示线程执行(默认构造的线程对象)的所有线程对象提供一个唯一的值.
thread::id
类没有特别的东西,主要提供方便比较或打印等运算符重载.
namespace std { class thread::id { public: id() noexcept; }; bool operator==(thread::id x, thread::id y) noexcept; bool operator!=(thread::id x, thread::id y) noexcept; bool operator<(thread::id x, thread::id y) noexcept; bool operator<=(thread::id x, thread::id y) noexcept; bool operator>(thread::id x, thread::id y) noexcept; bool operator>=(thread::id x, thread::id y) noexcept; template<class charT, class traits> basic_ostream<charT, traits>& operator<< (basic_ostream<charT, traits>& out, thread::id id); // Hash support template <class T> struct hash; template <> struct hash<thread::id>; }
thread
类namespace std { class thread { public: // types: class id; typedef implementation-defined native_handle_type; // See 30.2.3 // construct/copy/destroy: thread() noexcept; template <class F, class ...Args> explicit thread(F&& f, Args&&... args); ~thread(); thread(const thread&) = delete; thread(thread&&) noexcept; thread& operator=(const thread&) = delete; thread& operator=(thread&&) noexcept; // members: void swap(thread&) noexcept; bool joinable() const noexcept; void join(); void detach(); id get_id() const noexcept; native_handle_type native_handle(); // See 30.2.3 // static members: static unsigned hardware_concurrency() noexcept; }; }
从如上的 thread
类知道, 构造 thread 对象:
args..
到 thread 可访问的内存通过如下函数:
template <class T> typename decay<T>::type decay_copy(T&& v) { return std::forward<T>(v); }
求值和复制/移动参数过程丢出的任何 exceptions 仅在当前线程丢出,不在新线程中.
实例:
#include <iostream> // NOLINT #include <utility> #include <thread> #include <functional> using std::cout; using std::endl; void Thread1Fun(int n) { for (int i = 0; i < n; ++i) { cout << "Thread 1 executing" << endl; } } void Thread2Fun(const int& n) { for (int i = 0; i < n; ++i) { std::cout << "Thread 2 executing\n"; } } int main() { const int kLoops = 5; std::thread t1; // t1 is not a thread std::thread t2(Thread1Fun, kLoops + 1); // pass by value std::thread t3(Thread2Fun, std::ref(kLoops)); // pass by reference std::thread t4(std::move(t3)); // t4 is now running f2(). t3 is no longer a thread t2.join(); t4.join(); return 0; }
用来检查一个线程对象是否是正在执行的线程.若是,返回 true
. 所以默认构造 thread 对象是不可 joinable.
实例:
#include <iostream> // NOLINT #include <thread> #include <chrono> using std::cout; using std::endl; void ThreadFun() { std::this_thread::sleep_for(std::chrono::seconds(1)); } int main() { std::thread t; cout << "default construct, joinable: " << t.joinable() << endl; t = std::thread(ThreadFun); cout << "initial construct, joinable: " << t.joinable() << endl; t.join(); return 0; }
结果:
default construct, joinable: 0 initial construct, joinable: 1
get_id
返回 thread 对象的 std::thread::id
值.
实例:
#include <iostream> // NOLINT #include <thread> #include <chrono> using std::cout; using std::endl; void ThreadFun() { std::this_thread::sleep_for(std::chrono::seconds(1)); } int main() { std::thread t1(ThreadFun); std::thread::id id_t1 = t1.get_id(); cout << "thread1's id: " << id_t1 << endl; t1.join(); return 0; }
native_handle
这个函数是 implementation-defined. 它允许提供底层实现细节的访问.但实际使用它是 non-portable.
实例: 使用 native_handle
打开在 POSIX 系统上 C++线程的实时调度.
#include <pthread.h> #include <thread> #include <mutex> #include <iostream> // NOLINT #include <chrono> #include <cstring> using std::cout; using std::endl; std::mutex iomutex; void ThreadFun(int thread_id) { std::this_thread::sleep_for(std::chrono::seconds(1)); sched_param sch; int policy; pthread_getschedparam(pthread_self(), &policy, &sch); std::lock_guard<std::mutex> lk(iomutex); cout << "Thread " << thread_id << " is executing at priority " << sch.sched_priority << endl; } int main() { std::thread t1(ThreadFun, 1), t2(ThreadFun, 2); sched_param sch; int policy; pthread_getschedparam(t1.native_handle(), &policy, &sch); sch.sched_priority = 20; if (pthread_setschedparam(t1.native_handle(), SCHED_FIFO, &sch)) { cout << "Failed to setschedparam: " << std::strerror(errno) << endl; } t1.join(); t2.join(); return 0; }
使用 Super User,结果:
$ sudo ./test Thread 1 is executing at priority 20 Thread 2 is executing at priority 0
hardware_concurrency
(static)返回硬件支持的 thread 数.这个值仅作为参考.如果这个值不可计算或没有良好定义,那么实现返回 0.
#include <iostream> // NOLINT #include <thread> int main() { unsigned int num = std::thread::hardware_concurrency(); std::cout << num << " concurrent threads are supported." << std::endl; }
swap
操作用来交换 2 个线程对象的底层句柄.有 2 种可选,thread 类的成员函数和在 std 下的全局函数.
实例:
#include <iostream> // NOLINT #include <thread> #include <chrono> void Thread1Fun() { std::this_thread::sleep_for(std::chrono::seconds(1)); } void Thread2Fun() { std::this_thread::sleep_for(std::chrono::seconds(1)); } int main() { std::thread t1(Thread1Fun); std::thread t2(Thread2Fun); std::cout << "thread 1 id: " << t1.get_id() << std::endl; std::cout << "thread 2 id: " << t2.get_id() << std::endl; std::swap(t1, t2); std::cout << "after std::swap(t1, t2):" << std::endl; std::cout << "thread 1 id: " << t1.get_id() << std::endl; std::cout << "thread 2 id: " << t2.get_id() << std::endl; t1.swap(t2); std::cout << "after t1.swap(t2):" << std::endl; std::cout << "thread 1 id: " << t1.get_id() << std::endl; std::cout << "thread 2 id: " << t2.get_id() << std::endl; t1.join(); t2.join(); return 0; }
在 thread 的头文件中,加了一个新的 namespace this_thread
用来包含一些管理操作当前 thread 的一些函数.
void yield();
重新调度线程的执行,让其他线程运行.具体行为依赖于实现,与 OS 的调度机制有关.
std::thread::id get_id();
返回当前线程的 thread::id
类型的对象.
template< class Rep, class Period > void sleep_for( const std::chrono::duration<Rep, Period>& sleep_duration );
阻塞当前线程的执行至少相对时间 sleep_duration
.
template< class Clock, class Duration > void sleep_until( const std::chrono::time_point<Clock,Duration>& sleep_time );
阻塞当前线程的执行直到绝对时间 sleep_time
到达.
实例:
#include <iostream> // NOLINT #include <thread> #include <chrono> #include <mutex> #include <atomic> #include <ctime> using std::cout; using std::endl; using std::chrono::system_clock; std::atomic<bool> ready(false); void Thread1Fun() { while (!ready) { std::this_thread::yield(); } std::thread::id id = std::this_thread::get_id(); cout << "thread " << id << "go to sleep" << endl; std::this_thread::sleep_for(std::chrono::seconds(1)); } void Thread2Fun() { std::thread::id id = std::this_thread::get_id(); cout << "thread " << id << "is running" << endl; ready = true; std::time_t tt = system_clock::to_time_t(system_clock::now()); struct std::tm *ptm = std::localtime(&tt); ptm->tm_sec += 2; std::this_thread::sleep_until(system_clock::from_time_t(mktime(ptm))); } int main() { std::thread t1(Thread1Fun); std::thread t2(Thread2Fun); t1.join(); t2.join(); return 0; }
<mutex>
概要
头文件 <mutex>
分为: mutexes,locks 和一些特殊函数.
具体见之后的Header <mutex> synopsis.
mutex
,
recursive_mutex
, timed_mutex
, recursive_timed_mutex
.
lock_guard
, unique_lock
.
try_lock
, lock
),并使某个函数只被调用一次(call_once
).
BasicLockable 概念描述了最少特性类型,也就是满足(若 m 是 BasicLockable 类型 ):
m.lock()
m.unlock()
所以所有 mutex 都满足 BasicLockable 类型: mutex
, recursive_mutex
,
timed_mutex
, recursive_timed_mutex
, unique_lock
.
Lockable 概念扩展了 BasicLockable 概念,并支持 try_lock
.
所以这些 mutex 满足 Lockable 类型: mutex
, recursive_mutex
,
timed_mutex
, recursive_timed_mutex
.
TimedLockable 概念扩展了 Lockable 概念,并支持 try_lock_for
和
try_lock_until
.
所以这些 mutex 满足 TimedLockable 类型: timed_mutex
,
recursive_timed_mutex
.
mutex
类
mutex
类提供了一个不可递归的排它锁.基本接口可以从如下类中参考.
namespace std { class mutex { public: constexpr mutex() noexcept; ~mutex(); mutex(const mutex&) = delete; mutex& operator=(const mutex&) = delete; void lock(); bool try_lock(); void unlock(); typedef implementation-defined native_handle_type; // See 30.2.3 native_handle_type native_handle(); // See 30.2.3 }; }
实例:
#include <iostream> // NOLINT #include <vector> #include <thread> #include <mutex> using std::cout; using std::endl; using std::vector; int g_value = 0; std::mutex count_mutex; void Increase() { const int kLoops = 100; for (int i = 0; i < kLoops; ++i) { count_mutex.lock(); g_value++; count_mutex.unlock(); } } int main(int argc, char *argv[]) { const int kNumThreads = 5; vector<std::thread> threads; for (int i = 0; i < kNumThreads; ++i) { threads.push_back(std::thread(Increase)); } for (auto &thread : threads) { thread.join(); } cout << "value = " << g_value << endl; return 0; }
recursive_mutex
类namespace std { class recursive_mutex { public: recursive_mutex(); ~recursive_mutex(); recursive_mutex(const recursive_mutex&) = delete; recursive_mutex& operator=(const recursive_mutex&) = delete; void lock(); bool try_lock() noexcept; void unlock(); typedef implementation-defined native_handle_type; // See 30.2.3 native_handle_type native_handle(); // See 30.2.3 }; }
timed_mutex
类namespace std { class timed_mutex { public: timed_mutex(); ~timed_mutex(); timed_mutex(const timed_mutex&) = delete; timed_mutex& operator=(const timed_mutex&) = delete; void lock(); bool try_lock(); template <class Rep, class Period> bool try_lock_for(const chrono::duration<Rep, Period>& rel_time); template <class Clock, class Duration> bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time); void unlock(); typedef implementation-defined native_handle_type; // See 30.2.3 native_handle_type native_handle(); // See 30.2.3 }; }
recursive_timed_mutex
类namespace std { class recursive_timed_mutex { public: recursive_timed_mutex(); ~recursive_timed_mutex(); recursive_timed_mutex(const recursive_timed_mutex&) = delete; recursive_timed_mutex& operator=(const recursive_timed_mutex&) = delete; void lock(); bool try_lock() noexcept; template <class Rep, class Period> bool try_lock_for(const chrono::duration<Rep, Period>& rel_time); template <class Clock, class Duration> bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time); void unlock(); typedef implementation-defined native_handle_type; // See 30.2.3 native_handle_type native_handle(); // See 30.2.3 }; }
基本保证: 当 exception 被以上 mutex 的成员函数抛出时,这些 mutex 对象保持有效状态. 如果 lock
操作抛出了 exception, 该 lock 不会被抛出 exception 的线程所拥有.
抛出的是一个 system_error
exception, 导致的基本情况是:
exception 类型 | error 情况 | 描述 |
---|---|---|
system_error |
errc::resource_deadlock_would_occur |
deadlock 被检测到 |
system_error |
errc::operation_not_permitted |
线程没有权利做这个操作 |
system_error |
errc::device_or_resource_busy |
native handle 已经被锁 |
lock_guard
类之前的 mutex 必须写明 lock 和 unlock 调用,如果在 lock 和 unlock 之间产生 exception,那么必须在 exception 处理中不能忘记处理 unlock.当只是在一个关键区域内需要 mutex 保护,使用这样的 mutex 既不方便也容易忘记 unlock 而造成死锁.
引入对之前的 mutex 的封装后的 lock_guard
和 unique_lock
,提供易用性的 RAII-style 机制来获取锁在一段区域内.
lock guard 是一个用来管理一个 mutex 对象,并保持锁住它的对象.
在构造时,mutex 对象被调用的线程锁住,然后在析构时,mutex 被解锁.它是最简单的 lock,并且作为自动作用范围直到它的作用区域结束时特别有用.通过这种方法,它保证 mutex 对象得到解锁即使在 exception 被抛出时.
namespace std { template <class Mutex> class lock_guard { public: typedef Mutex mutex_type; explicit lock_guard(mutex_type& m); lock_guard(mutex_type& m, adopt_lock_t); ~lock_guard(); lock_guard(lock_guard const&) = delete; lock_guard& operator=(lock_guard const&) = delete; private: mutex_type& pm; // exposition only }; }
实例:
#include <iostream> // NOLINT #include <thread> #include <mutex> #include <stdexcept> std::mutex mtx; void PrintEven(int x) { if (x % 2 == 0) { std::cout << x << " is even\n"; } else { throw(std::logic_error("not even")); } } void PrintThreadEvenId(int id) { try { std::lock_guard<std::mutex> lck(mtx); PrintEven(id); } catch (std::logic_error&) { std::cout << "[exception caught]" << std::endl; } } int main() { std::thread threads[10]; for (int i = 0; i < 10; ++i) { threads[i] = std::thread(PrintThreadEvenId, i+1); } for (auto& th : threads) { th.join(); } return 0; }
unique_lock
类
unique_lock
与上面的 lock_guard
基本差不多,同样是 RAII-style 机制来获取锁在一段区域内的对象.
但 lock_guard
非常简单,只提供构造自动拥有锁和析构释放锁,如果需要一些其他的操作,那么就需要更复杂和接口更多的类来处理, unique_lock
能满足如此要求. 其基本接口如下.
namespace std { template <class Mutex> class unique_lock { public: typedef Mutex mutex_type; // 30.4.2.2.1, construct/copy/destroy: unique_lock() noexcept; explicit unique_lock(mutex_type& m); unique_lock(mutex_type& m, defer_lock_t) noexcept; unique_lock(mutex_type& m, try_to_lock_t); unique_lock(mutex_type& m, adopt_lock_t); template <class Clock, class Duration> unique_lock(mutex_type& m, const chrono::time_point<Clock, Duration>& abs_time); template <class Rep, class Period> unique_lock(mutex_type& m, const chrono::duration<Rep, Period>& rel_time); ~unique_lock(); unique_lock(unique_lock const&) = delete; unique_lock& operator=(unique_lock const&) = delete; unique_lock(unique_lock&& u) noexcept; unique_lock& operator=(unique_lock&& u) noexcept; // 30.4.2.2.2, locking: void lock(); bool try_lock(); template <class Rep, class Period> bool try_lock_for(const chrono::duration<Rep, Period>& rel_time); template <class Clock, class Duration> bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time); void unlock(); // 30.4.2.2.3, modifiers: void swap(unique_lock& u) noexcept; mutex_type *release() noexcept; // 30.4.2.2.4, observers: bool owns_lock() const noexcept; explicit operator bool () const noexcept; mutex_type* mutex() const noexcept; private: mutex_type *pm; // exposition only bool owns; // exposition only }; template <class Mutex> void swap(unique_lock<Mutex>& x, unique_lock<Mutex>& y) noexcept; }
在mutex header 概要中可以看到有不同的构造函数,其中一类 unique_lock
构造传入不同的类型:
defer_lock
: 不去获取 mutex,只有要和 mutex 一样,手动去 lock 它.
try_to_lock
: 相当于在构造时,调用 try_lock
, 不阻塞,之后可通过成员函数 bool owns_lock()
或直接操作符 explicit operator bool()
const
判断是否获取锁成功.
adopt_lock_t
: 认为调用的线程已经占有这个锁 m.已经占有这个锁了,为什么要去创建一个 unique_lock
去包含它呢? 因为可以利用 unique_lock
中途接手管理这个锁 m, 比如想用 RAII-style 机制管理它,使它 exception
safe 等.
这些类型在源代码定义基本如下:
struct defer_lock_t { }; struct try_to_lock_t { }; struct adopt_lock_t { }; constexpr std::defer_lock_t defer_lock = std::defer_lock_t(); constexpr std::try_to_lock_t try_to_lock = std::try_to_lock_t(); constexpr std::adopt_lock_t adopt_lock = std::adopt_lock_t();
余下的构造:
unique_lock();
:仅仅创建一个 unique_lock
对象,不和任何 mutex 相关联.
unique_lock(unique_lock&& other);
: 通过 other 的内容来构造
: 通过 other 的内容来构造
unique_lock
对象,使得 other 不和任何 mutex 相关联.
explicit unique_lock(mutex_type& m);
: 通过 m.lock()
来构造与 m 相关联的 unique_lock
对象.
unique_lock(mutex_type& m, const std::chrono::duration<Rep,Period>&
timeout_duration);
: 通过 m.try_lock_for(timeout_duration)
来构造与 m 相关联的 unique_lock
对象.
unique_lock( mutex_type& m, const
std::chrono::time_point<Clock,Duration>& timeout_time);
: 通过
m.try_lock_until(timeout_time)
来构造与 m 相关联的 unique_lock
对象.
利用 defer_lock
, 不去获取 mutex, 只创建与它相关联的 unique_lock
对象,之后用 lock()
同时去获取两个锁,防止死锁.
#include <iostream> // NOLINT #include <mutex> #include <thread> #include <chrono> using std::cout; using std::endl; struct Box { explicit Box(int num) : num_things{num} {} int num_things; std::mutex m; }; void Transfer(Box *from, Box *to, int num) { // don't actually take the locks yet std::unique_lock<std::mutex> lock1(from->m, std::defer_lock); std::unique_lock<std::mutex> lock2(to->m, std::defer_lock); // lock both unique_locks without deadlock std::lock(lock1, lock2); from->num_things -= num; to->num_things += num; // 'from.m' and 'to.m' mutexes unlocked in 'unique_lock' dtors } int main() { Box acc1(100); Box acc2(50); cout << "acc1 num = " << acc1.num_things << " ,acc2 num = " << acc2.num_things << endl; std::thread t1(Transfer, &acc1, &acc2, 10); std::thread t2(Transfer, &acc2, &acc1, 5); t1.join(); t2.join(); cout << "after transfer: " << "acc1 num = " << acc1.num_things << " ,acc2 num = " << acc2.num_things << endl; return 0; }
lock_guard
VS unique_lock
lock_guard
和 unique_lock
很大程序上很相似,都是 RAII-style 机制来封装一个 mutex 的锁, lock_guard
可以说是 unique_lock
更严格并拥有限制的接口的版本.
如何合适的选择两者的使用呢? 如果 lock_guard
对于情况 A 足够,那么就使用它. 不仅仅是从效率(efficiency)考虑,更是从想要表达的功能(functionality)
考虑. 使用 lock_guard
不仅避免了不需要的其他接口的开销,更是对读代码者表达它的意图,你将永远都不需要解锁这个 guard.
所以你先考虑使用 lock_guard
, 除非你需要 unique_lock
的功能. 比如
condition_variable
就需要传入一个 unique_lock
对象.
try_lock
和 lock
template< class Lockable1, class Lockable2, class LockableN... > int try_lock(Lockable1& lock1, Lockable2& lock2, LockableN& lockn... );
按对象 lock1, lock2, …, lockn 从头到尾的顺序尝试去获取每个锁. 如果某个 try_lock
失败, unlock 所有对象并返回. 返回值:
template< class Lockable1, class Lockable2, class LockableN... > void lock( Lockable1& lock1, Lockable2& lock2, LockableN& lockn... );
占有传入的锁 lock1, lock2, …, lockn,使用防止死锁的算法来防止死锁.
对于传入对象按照不特定的顺序调用它们的成员函数 lock
, try_lock
,
unlock
,确保最后所有的锁被获取成功在函数返回时.
call_once
class once_flag; template< class Callable, class... Args > void call_once( std::once_flag& flag, Callable&& f, Args&&... args );
为了让一段代码只被多个线程只执行一次, mutex 文件中中包含了这个保证只调用一次的接口.
once_flag
对象是辅助 call_once
的,作为多个线程共同执行这段的标识,
所以这些个线程必须传入同一个 once_flag
对象.
它并对 exception 做一定的处理,如果 call_once
执行的函数以 exception
退出,那么 exception 会抛给调用者.这次以 exception 退出的执行并不算一次,之后其他线程仍可以继续执行它一次.
如下的实例, t1 和 t2 线程抛出 exception, t3 仍然运行一次, t4 无论是怎样,都得不到运行.
#include <iostream> // NOLINT #include <thread> #include <mutex> using std::cout; using std::endl; std::once_flag flag; inline void MayThrowFunction(bool do_throw) { // only one instance of this function can be run simultaneously if (do_throw) { cout << "throw" << endl; // this message may be printed from 0 to 3 times // if function exits via exception, another function selected throw std::exception(); } cout << "once" << endl; // printed exactly once, it's guaranteed that // there are no messages after it } inline void DoOnce(bool do_throw) { try { std::call_once(flag, MayThrowFunction, do_throw); } catch (...) { } } int main() { std::thread t1(DoOnce, true); std::thread t2(DoOnce, true); std::thread t3(DoOnce, false); std::thread t4(DoOnce, true); t1.join(); t2.join(); t3.join(); t4.join(); return 0; }
<condition_variable>
概要
<condition_variable>
头文件主要包含两个 condition_variable
类, 一个全局函数.
namespace std { class condition_variable; class condition_variable_any; void notify_all_at_thread_exit(condition_variable& cond, unique_lock<mutex> lk); enum class cv_status { no_timeout, timeout }; }
cv_status
Condition variables 与 mutex 之类在等待 timeout 时,返回的不一样,mutex 之类返回 bool
类型, 而 Condition variables 特意为它定义了 enum
类型:
no_timeout
和 timeout
, 来判断等待是否成功.
enum class cv_status { no_timeout, timeout };
cv_status::no_timeout
The function returned without a timeout (i.e.,
it was notified).
cv_status::timeout
The function returned because it reached its
time limit (timeout).
notify_all_at_thread_exit
void notify_all_at_thread_exit(std::condition_variable& cond, std::unique_lock<std::mutex> lk);
<condition_variable>
头文件中有这个函数,它提供机制 notify 其他线程在调用这个函数的线程退出时. 它相当于操作(并包括清理所有 thread_local
对象):
lk.unlock(); cond.notify_all();
虽然可以在调用线程的最后同样调用如上两句代码,但这样没有表现出 "cond 的 notify 必须在线程退出时调用" 的意图,
后面的维护者可能会在这之后继续添加代码.
notify_all_at_thread_exit
用一句调用替代两个调用,既不用在函数最后去调用它,而且表明它的意图.
它的操作流程如下:
lk.unlock(); cond.notify_all();
Notes
lk.mutex()
没有被当前线程锁住,调用此函数导致 undefined behavior.
lk.mutex()
的 mutex 不是其他线程使用来等待 condition variable
的同一个的话, 调用此函数导致 undefined behavior.
condition_variable
类namespace std { class condition_variable { public: condition_variable(); ~condition_variable(); condition_variable(const condition_variable&) = delete; condition_variable& operator=(const condition_variable&) = delete; void notify_one() noexcept; void notify_all() noexcept; void wait(unique_lock<mutex>& lock); template <class Predicate> void wait(unique_lock<mutex>& lock, Predicate pred); template <class Clock, class Duration> cv_status wait_until(unique_lock<mutex>& lock, const chrono::time_point<Clock, Duration>& abs_time); template <class Clock, class Duration, class Predicate> bool wait_until(unique_lock<mutex>& lock, const chrono::time_point<Clock, Duration>& abs_time, Predicate pred); template <class Rep, class Period> cv_status wait_for(unique_lock<mutex>& lock, const chrono::duration<Rep, Period>& rel_time); template <class Rep, class Period, class Predicate> bool wait_for(unique_lock<mutex>& lock, const chrono::duration<Rep, Period>& rel_time, Predicate pred); typedef implementation-defined native_handle_type; // See 30.2.3 native_handle_type native_handle(); // See 30.2.3 }; }
Condition Variable 的基本概念可以从之前篇浅谈 C++ Multithreading Programming中获取.
condition_variable
类的 void wait(unique_lock<mutex>& lock,
Predicate pred);
接口:
unique_lock
.
pred
函数, 如果 predicate 返回 false
,等待. 相当于:
while (!pred()) {
wait(lock);
}
实例:
#include <iostream>  // NOLINT
#include <string>
#include <thread>
#include <mutex>
#include <condition_variable>

using std::string;
using std::cout;
using std::endl;

std::mutex m;
std::condition_variable cv;
string data;
bool g_ready = false;
bool g_processed = false;

// Worker: blocks until main() publishes g_ready, transforms the shared
// string, then publishes g_processed back to main().
void WorkerThread() {
  std::unique_lock<std::mutex> guard(m);
  // Equivalent to: while (!g_ready) cv.wait(guard);
  cv.wait(guard, [] { return g_ready; });

  // The predicate returned true, so we hold the lock here.
  cout << "Worker thread is processing data" << endl;
  data += " after processing";

  g_processed = true;
  cout << "Worker thread signals data processing completed" << endl;

  // Unlock before notifying so the woken thread does not immediately
  // block on the mutex again.
  guard.unlock();
  cv.notify_one();
}

int main() {
  std::thread worker(WorkerThread);
  data = "Example data";

  // Publish the data to the worker thread.
  {
    std::lock_guard<std::mutex> guard(m);
    g_ready = true;
    cout << "main() signals data ready for processing" << endl;
  }
  cv.notify_one();

  // Block until the worker reports completion.
  {
    std::unique_lock<std::mutex> guard(m);
    cv.wait(guard, [] { return g_processed; });
  }
  cout << "Back in main(), data = " << data << '\n';

  worker.join();
  return 0;
}
condition_variable_any
类namespace std { class condition_variable_any { public: condition_variable_any(); ~condition_variable_any(); condition_variable_any(const condition_variable_any&) = delete; condition_variable_any& operator=(const condition_variable_any&) = delete; void notify_one() noexcept; void notify_all() noexcept; template <class Lock> void wait(Lock& lock); template <class Lock, class Predicate> void wait(Lock& lock, Predicate pred); template <class Lock, class Clock, class Duration> cv_status wait_until(Lock& lock, const chrono::time_point<Clock, Duration>& abs_time); template <class Lock, class Clock, class Duration, class Predicate> bool wait_until(Lock& lock, const chrono::time_point<Clock, Duration>& abs_time, Predicate pred); template <class Lock, class Rep, class Period> cv_status wait_for(Lock& lock, const chrono::duration<Rep, Period>& rel_time); template <class Lock, class Rep, class Period, class Predicate> bool wait_for(Lock& lock, const chrono::duration<Rep, Period>& rel_time, Predicate pred); }; }
condition_variable_any
是 condition_variable
的一个通用版,它可以等待任何满足 BasicLockable 要求 Lock 类型的对象.其他与 condition_variable
一样.
实例:
#include <iostream>  // NOLINT
#include <condition_variable>
#include <mutex>  // std::mutex / std::unique_lock / std::lock_guard — was
                  // missing; relying on another header to pull it in is
                  // not portable
#include <thread>
#include <chrono>
#include <vector>

using std::cout;
using std::endl;

std::condition_variable_any cv;
std::mutex cv_m;  // This mutex is used for three purposes:
                  // 1) to synchronize accesses to g_wait_val
                  // 2) to synchronize accesses to std::cout
                  // 3) for the condition variable cv
int g_wait_val = 0;

// Waits on cv until g_wait_val becomes 1.  The predicate form makes the
// waiter immune to both spurious wakeups and the first (value-less)
// notify_all() below.
void WaitVal(int id) {
  std::unique_lock<std::mutex> lk(cv_m);
  cout << "thread " << id << " Waiting... " << endl;
  cv.wait(lk, []{return g_wait_val == 1;});
  cout << "...finished waiting," << "thread " << id << endl;
}

// Notifies twice: after the first notify the predicate is still false,
// so the waiters go back to sleep; only the second notify (after
// setting g_wait_val) releases them.
void Signals() {
  std::this_thread::sleep_for(std::chrono::seconds(1));
  {
    std::lock_guard<std::mutex> lk(cv_m);
    cout << "Notifying..." << endl;
  }
  cv.notify_all();
  std::this_thread::sleep_for(std::chrono::seconds(1));
  {
    std::lock_guard<std::mutex> lk(cv_m);
    g_wait_val = 1;
    cout << "Notifying again..." << endl;
  }
  cv.notify_all();
}

int main() {
  std::vector<std::thread> threads;
  for (int i = 0; i < 3; ++i) {
    threads.emplace_back(WaitVal, i);
  }
  threads.emplace_back(Signals);
  for (auto& t : threads) {
    t.join();
  }
  return 0;
}
condition_variable
VS condition_variable_any
引自 N3690 §30.5[thread.condition]:
Class condition_variable
provides a condition variable that can only
wait on an object of type unique_lock<mutex>
, allowing maximum
efficiency on some platforms. Class condition_variable_any
provides a
general condition variable that can wait on objects of user-supplied
lock types.
condition_variable
只与 unique_lock<mutex>
类型对象关联,在某些平台上,它可以更好的得到特定的优化,如果不需要
condition_variable_any
的灵活性, 选更高效的 condition_variable
对象使用.
<future>
概要
如果要异步的获取一个函数的运行结果, 可以创建一个线程,并利用 Condition
variables 来同步线程,使得另外线程正确获取到这个结果. 但 C++11 的
future
库使得这一过程更方便, 它提供接口使程序在一个线程中获取一个在同一个或其他线程中运行的函数的结果(值或异常), (这些类使用并不限制在
multi-threaded 程序中,同样可以在 single-threaded 中使用).
future 的概要主要分为:
promise
和 packaged_task
.
future
和 shared_future
.
future_error
, future_errc
等.
async
.
future_error
类
future_error
类定义对 future 对象非法操作抛出异常的对象类型. 也就是专门为 future 库中接口出现异常提供特定的异常类.
从上图类图可知,这个类继承自 logic_error
, 并添加获取 error_code
的成员函数 code
, 获取 exception 信息的 what
成员函数.
namespace std { class future_error : public logic_error { public: future_error(error_code ec); // exposition only const error_code& code() const noexcept; const char* what() const noexcept; }; } const error_code& code() const noexcept;
实例:
#include <future>
#include <iostream>  // NOLINT

// Calling get() on a default-constructed future (one with no shared
// state) throws std::future_error with the no_state error code.
int main() {
  std::future<int> empty;
  try {
    // The result is discarded on purpose: the call itself is what
    // throws.  (The original stored it in an unused local `n`, which
    // draws a -Wunused-variable warning.)
    empty.get();
  } catch (const std::future_error& e) {
    std::cout << "Caught a future_error with code \"" << e.code()
              << "\"\nMessage: \"" << e.what() << "\"\n";
  }
  return 0;
}
future_errc
enum class future_errc { broken_promise = /* implementation-defined */, future_already_retrieved = /* implementation-defined */, promise_already_satisfied = /* implementation-defined */, no_state = /* implementation-defined */ };
这个 enum class 定义了 future 抛出异常的error condition. future_errc
的值可以用来创建 error_condition
对象, 并与 future_error
的成员函数
code
返回的值对比, 决定所抛出异常的类型.
所以 <future>
另外有两个函数提供它们之间的转换:
std::error_code make_error_code( std::future_errc e ); std::error_condition make_error_condition( std::future_errc e ); template<> struct is_error_condition_enum<std::future_errc> : std::true_type;
实例:
#include <iostream>  // NOLINT
#include <future>

// Demonstrates matching a future_error's code against a future_errc
// value: calling promise::get_future() twice raises
// future_already_retrieved.
int main() {
  std::promise<int> prom;
  try {
    prom.get_future();
    prom.get_future();  // throws std::future_error with future_already_retrieved
  } catch (std::future_error& e) {
    const bool retrieved_twice =
        e.code() ==
        std::make_error_condition(std::future_errc::future_already_retrieved);
    if (retrieved_twice) {
      std::cerr << "[future already retrieved]\n";
    } else {
      std::cerr << "[unknown exception]\n";
    }
  }
  return 0;
}
future_status
enum class future_status { ready, timeout, deferred };
future
和 shared_future
类中属于 wait 类型的接口返回的状态.
future_category
用来识别 future error 种类.
const std::error_category& future_category();
这个函数返回一个 error_category
类型的静态对象,拥有如下特性:
name
成员函数返回指向字符串 "future" 的指针.
实例:
#include <iostream>  // NOLINT
#include <future>

// Shows that exceptions thrown by <future> operations carry error codes
// belonging to the category returned by std::future_category().
int main() {
  std::promise<int> prom;
  try {
    prom.get_future();
    prom.get_future();  // throws a std::future_error of the future category
  } catch (std::future_error& e) {
    const std::error_category& category = e.code().category();
    if (category == std::future_category()) {
      std::cerr << "future_error of the future category thrown\n";
    }
  }
  return 0;
}
template promise
模版类 promise 提供一种方便的方法存储一个值或异常,之后可以异步的被 future 对象获取(同一个或其他线程).
promise 对象在共享状态(shared state)存储值的操作 synchronizes-with 在其他函数中成功获取这个共享状态的返回值(如 future::get
).
namespace std { template <class R> class promise { public: promise(); template <class Allocator> promise(allocator_arg_t, const Allocator& a); promise(promise&& rhs) noexcept; promise(const promise& rhs) = delete; ~promise(); // assignment promise& operator=(promise&& rhs) noexcept; promise& operator=(const promise& rhs) = delete; void swap(promise& other) noexcept; // retrieving the result future<R> get_future(); // setting the result void set_value(see below ); void set_exception(exception_ptr p); // setting the result with deferred notification void set_value_at_thread_exit(const R& r); void set_value_at_thread_exit(see below ); void set_exception_at_thread_exit(exception_ptr p); }; template <class R> void swap(promise<R>& x, promise<R>& y) noexcept; template <class R, class Alloc> struct uses_allocator<promise<R>, Alloc>; }
set_value
and set_value_at_thread_exit
set_value
接口存储值到 shared state,并使 state 准备好.这个操作是原子性的. 而 set_value_at_thread_exit
接口如名字,调用后不会马上设置值到
shared state 中,只在当前函数退出时.
使用 get_future
返回与它相关联同一个 shared state 的 future 对象.
实例:
#include <iostream>  // NOLINT
#include <functional>
#include <thread>
#include <future>

// Blocks on the future until the associated promise stores a value
// (promise::set_value synchronizes with future::get).
void Print(std::future<int>& fut) {
  int value = fut.get();
  std::cout << "value: " << value << std::endl;
}

int main() {
  std::promise<int> prom;
  std::future<int> result = prom.get_future();

  // The consumer runs concurrently and sleeps inside get() until the
  // value below is published.
  std::thread consumer(Print, std::ref(result));
  prom.set_value(10);  // fulfill promise

  consumer.join();
  return 0;
}
set_exception
and set_exception_at_thread_exit
这两个接口与上面 set_value
和 set_value_at_thread_exit
一样, 只是保存的是 exception.
实例:
#include <iostream>  // NOLINT
#include <thread>
#include <future>

// A worker thread stores an exception (instead of a value) into the
// promise; the reader sees it re-thrown from future::get().
int main() {
  std::promise<int> result;

  std::thread worker([&] {
    try {
      throw std::runtime_error("Example");
    } catch (...) {
      try {
        // store anything thrown in the promise
        result.set_exception(std::current_exception());
      } catch (...) {
        // set_exception() may throw too
      }
    }
  });

  try {
    std::cout << result.get_future().get();
  } catch (const std::exception& e) {
    std::cout << "Exception from the thread: " << e.what() << std::endl;
  }

  worker.join();
  return 0;
}
template packaged_task
packaged_task
与 promise
类似,都是提供异步获取值的方法,不同是
promise
直接设置值, 而 packaged_task
封装一个可调用的元素,并把这个可调用任务的返回值异步到 shared state 中.
namespace std { template<class> class packaged_task; // undefined template<class R, class... ArgTypes> class packaged_task<R(ArgTypes...)> { public: // construction and destruction packaged_task() noexcept; template <class F> explicit packaged_task(F&& f); template <class F, class Allocator> explicit packaged_task(allocator_arg_t, const Allocator& a, F&& f); ~packaged_task(); // no copy packaged_task(const packaged_task&) = delete; packaged_task& operator=(const packaged_task&) = delete; // move support packaged_task(packaged_task&& rhs) noexcept; packaged_task& operator=(packaged_task&& rhs) noexcept; void swap(packaged_task& other) noexcept; bool valid() const noexcept; // result retrieval future<R> get_future(); // execution void operator()(ArgTypes... ); void make_ready_at_thread_exit(ArgTypes...); void reset(); }; template <class R, class... ArgTypes> void swap(packaged_task<R(ArgTypes...)>& x, packaged_task<R(ArgTypes...)>& y) noexcept; template <class R, class Alloc> struct uses_allocator<packaged_task<R>, Alloc>; }
packaged_task
的创建与 thread
类似, 它可以:
运行:
()
, 可以直接运行如: task()
.
move
给一个线程运行.
实例:
#include <iostream>  // NOLINT
#include <cmath>
#include <thread>
#include <future>
#include <functional>

// unique function to avoid disambiguating the std::pow overload set
int FunPow(int x, int y) { return std::pow(x, y); }

// packaged_task wrapping a lambda, invoked directly in this thread.
void TaskLambda() {
  std::packaged_task<int(int, int)> task([](int a, int b) {
    return std::pow(a, b);
  });
  std::future<int> result = task.get_future();
  task(2, 9);
  std::cout << "task_lambda:\t" << result.get() << '\n';
}

// packaged_task wrapping a std::bind expression with pre-bound args.
void TaskBind() {
  std::packaged_task<int()> task(std::bind(FunPow, 2, 11));
  std::future<int> result = task.get_future();
  task();
  std::cout << "task_bind:\t" << result.get() << '\n';
}

// packaged_task moved into a std::thread; the result is still read
// through the future obtained before the move.
void TaskThread() {
  std::packaged_task<int(int, int)> task(FunPow);
  std::future<int> result = task.get_future();
  std::thread runner(std::move(task), 2, 10);
  runner.join();
  std::cout << "task_thread:\t" << result.get() << '\n';
}

int main() {
  TaskLambda();
  TaskBind();
  TaskThread();
}
packaged_task
的 reset
接口, 重置状态,舍弃之前运行的结果.相当于: *this = packaged_task(std::move(f))
.
实例:
#include <iostream>  // NOLINT
#include <cmath>
#include <thread>
#include <future>

// packaged_task::reset() discards the old shared state so the same
// stored callable can be run again for a fresh future.
int main() {
  std::packaged_task<int(int, int)> task([](int a, int b) {
    return std::pow(a, b);
  });

  std::future<int> result = task.get_future();
  task(2, 9);
  std::cout << "2^9 = " << result.get() << '\n';

  task.reset();  // equivalent to *this = packaged_task(std::move(f))
  result = task.get_future();
  std::thread runner(std::move(task), 2, 10);
  runner.join();
  std::cout << "2^10 = " << result.get() << '\n';
}
template future
类
模版类 future
是用来异步获取共享状态里的结果. future
类是独占的,不能与其他 future
共享异步的获取结果. 若要多个 future
共享异步结果,
使用之后的 shared_future
类.
有效的与共享状态相关联的 future 对象,由如下函数构造:
async
.
promise::get_future
.
packaged_task::get_future
.
它的接口:
share
: 转换 shared state 从 *this 到一个 shared_future
对象.
get
: 返回 shared state 的值, 若未准备好,调用者阻塞等待它准备好.
wait
: 阻塞等待结果直到有效.
wait_for
和 wait_until
: 等待一段时间, 并通过 future_status
判断等待后的状态.
namespace std { template <class R> class future { public: future() noexcept; future(future &&) noexcept; future(const future& rhs) = delete; ~future(); future& operator=(const future& rhs) = delete; future& operator=(future&&) noexcept; shared_future<R> share(); // retrieving the value see below get(); // functions to check state bool valid() const noexcept; void wait() const; template <class Rep, class Period> future_status wait_for(const chrono::duration<Rep, Period>& rel_time) const; template <class Clock, class Duration> future_status wait_until(const chrono::time_point<Clock, Duration>& abs_time) const; }; }
template shared_future
类
模版类 shared_future
与 future
基本一样, 不同就是多个
shared_future
对象可以共享异步结果.
namespace std { template <class R> class shared_future { public: shared_future() noexcept; shared_future(const shared_future& rhs); shared_future(future<R>&&) noexcept; shared_future(shared_future&& rhs) noexcept; ~shared_future(); shared_future& operator=(const shared_future& rhs); shared_future& operator=(shared_future&& rhs) noexcept; // retrieving the value see below get() const; // functions to check state bool valid() const noexcept; void wait() const; template <class Rep, class Period> future_status wait_for(const chrono::duration<Rep, Period>& rel_time) const; template <class Clock, class Duration> future_status wait_until(const chrono::time_point<Clock, Duration>& abs_time) const; }; }
实例:
#include <iostream>  // NOLINT
#include <future>
#include <chrono>

// Two async tasks share one shared_future used as a start signal; each
// reports how long after main() started the clock it observed it.
int main() {
  std::promise<void> ready_promise, t1_ready_promise, t2_ready_promise;
  std::shared_future<void> ready_future(ready_promise.get_future());

  std::chrono::time_point<std::chrono::high_resolution_clock> start;

  // Reading `start` after ready_future.wait() is safe: set_value()
  // below synchronizes-with the wait.
  auto measure1 = [&]() -> std::chrono::duration<double, std::milli> {
    t1_ready_promise.set_value();
    ready_future.wait();  // waits for the signal from main()
    return std::chrono::high_resolution_clock::now() - start;
  };
  auto measure2 = [&]() -> std::chrono::duration<double, std::milli> {
    t2_ready_promise.set_value();
    ready_future.wait();  // waits for the signal from main()
    return std::chrono::high_resolution_clock::now() - start;
  };

  auto result1 = std::async(std::launch::async, measure1);
  auto result2 = std::async(std::launch::async, measure2);

  // wait for the threads to become ready
  t1_ready_promise.get_future().wait();
  t2_ready_promise.get_future().wait();

  // the threads are ready, start the clock
  start = std::chrono::high_resolution_clock::now();

  // signal the threads to go
  ready_promise.set_value();

  std::cout << "Thread 1 received the signal "
            << result1.get().count() << " ms after start\n"
            << "Thread 2 received the signal "
            << result2.get().count() << " ms after start\n";
  return 0;
}
enum class launch : /* unspecified */ { async = /* unspecified */, deferred = /* unspecified */, /* implementation-defined */ };
函数 async
有不同的策略来运行函数:
launch::async
:创建一个新的线程来调用函数f.
launch::deferred
:调用函数 f 延迟(deferred)到返回的 future 的 shared
state 被访问时(wait 或 get).
launch::async|launch::deferred
:函数自动选择策略运行.与系统的库实现有关.
template <class F, class... Args> future<typename result_of<typename decay<F>::type(typename decay<Args>::type...)>::type> async(F&& f, Args&&... args); template <class F, class... Args> future<typename result_of<typename decay<F>::type(typename decay<Args>::type...)>::type> async(launch policy, F&& f, Args&&... args);
第一个接口没有 policy
作为传入参数, 相当于
async(std::launch::async | std::launch::deferred, f, args...)
实例:
#include <iostream>  // NOLINT
#include <vector>
#include <algorithm>
#include <numeric>
#include <future>

// Divide-and-conquer sum: the upper half of a large range is summed in
// a new thread via std::async while this thread sums the lower half.
template <typename RAIter>
int ParallelSum(RAIter beg, RAIter end) {
  auto len = std::distance(beg, end);
  if (len < 1000) {
    // Small ranges are cheaper to sum serially than to fork for.
    return std::accumulate(beg, end, 0);
  }
  RAIter mid = beg + len / 2;
  auto upper = std::async(std::launch::async, ParallelSum<RAIter>, mid, end);
  int lower = ParallelSum(beg, mid);
  return lower + upper.get();
}

int main() {
  std::vector<int> v(10000, 1);
  std::cout << "The sum is " << ParallelSum(v.begin(), v.end()) << '\n';
}
<thread>
基本概要如下(§30.3 [thread.threads] of N3690):
// Header <thread> synopsis namespace std { class thread; void swap(thread& x, thread& y) noexcept; namespace this_thread { thread::id get_id() noexcept; void yield() noexcept; template <class Clock, class Duration> void sleep_until(const chrono::time_point<Clock, Duration>& abs_time); template <class Rep, class Period> void sleep_for(const chrono::duration<Rep, Period>& rel_time); } }
<mutex>
// Header <mutex> synopsis namespace std { class mutex; class recursive_mutex; class timed_mutex; class recursive_timed_mutex; struct defer_lock_t { }; struct try_to_lock_t { }; struct adopt_lock_t { }; constexpr defer_lock_t defer_lock { }; constexpr try_to_lock_t try_to_lock { }; constexpr adopt_lock_t adopt_lock { }; template <class Mutex> class lock_guard; template <class Mutex> class unique_lock; template <class Mutex> void swap(unique_lock<Mutex>& x, unique_lock<Mutex>& y) noexcept; template <class L1, class L2, class... L3> int try_lock(L1&, L2&, L3&...); template <class L1, class L2, class... L3> void lock(L1&, L2&, L3&...); struct once_flag { constexpr once_flag() noexcept; once_flag(const once_flag&) = delete; once_flag& operator=(const once_flag&) = delete; }; template<class Callable, class ...Args> void call_once(once_flag& flag, Callable func, Args&&... args); }
<future>
namespace std { enum class future_errc { broken_promise = implementation-defined , future_already_retrieved = implementation-defined , promise_already_satisfied = implementation-defined , no_state = implementation-defined }; enum class launch : unspecified { async = unspecified , deferred = unspecified , implementation-defined }; enum class future_status { ready, timeout, deferred }; template <> struct is_error_code_enum<future_errc> : public true_type { }; error_code make_error_code(future_errc e) noexcept; error_condition make_error_condition(future_errc e) noexcept; const error_category& future_category() noexcept; class future_error; template <class R> class promise; template <class R> class promise<R&>; template <> class promise<void>; template <class R> void swap(promise<R>& x, promise<R>& y) noexcept; template <class R, class Alloc> struct uses_allocator<promise<R>, Alloc>; template <class R> class future; template <class R> class future<R&>; template <> class future<void>; template <class R> class shared_future; template <class R> class shared_future<R&>; template <> class shared_future<void>; template <class> class packaged_task; // undefined template <class R, class... ArgTypes> class packaged_task<R(ArgTypes...)>; template <class R> void swap(packaged_task<R(ArgTypes...)>&, packaged_task<R(ArgTypes...)>&) noexcept; template <class R, class Alloc> struct uses_allocator<packaged_task<R>, Alloc>; template <class F, class... Args> future<typename result_of<typename decay<F>::type(typename decay<Args>::type...)>::type> async(F&& f, Args&&... args); template <class F, class... Args> future<typename result_of<typename decay<F>::type(typename decay<Args>::type...)>::type> async(launch policy, F&& f, Args&&... args); }
随着多核 CPU 随处可见,多线程(multithreading)可以被用来实现并行,提高 CPU 的利用率和性能显著的提高.掌握多线程编程也成为现代实现软件的基本要求技能之一.Introduction to Parallel Computing详细的介绍了 Parallel Computing; 为什么使用它;Parallel Computing 的分类;Parallel Computing 的 limits 和 costs; Parallel Computing 的程序模型;如何设计 Parallel 程序等.
这里先介绍多线程的概念,多线程中涉及的基本概念,然后用实例介绍 Pthread 库的使用,并介绍 Google Code 中如何把它封装成 C++类,最后介绍可移植并大量使用的 Boost Thread 库.
还有一些其他的 Thread 库:
A thread is defined as an independent stream of instructions that can be scheduled to run as such by the operating system.所以它是在程序中独立于其他代码可由操作系统调度的一段指令.
那么是操作系统是如何具体实现这一独立性呢?
要理解 thread,必须先明白 process.进程由操作系统创建来运行相应的程序,进程包含程序资源和程序执行状态的信息.以 Linux 的进程为例包含:
Thread 使用 Process 的资源,并且能成为独立的元件被操作系统调度,是因为它仅重复那些使得它们能成为独立运行代码的必要资源.Thread 维护它自己如下的信息:
与 Process 比较,Thread 可以总结如下:
Posix Thread 基本模型如下图,一些有关其中 Thread 的术语:
Threads 能为合适的应用提供益处.所以 thread 的并行性对于应用来说也有它的限制.
Amdahl 法则 陈述到潜在的程序加速由能被并行的代码率 P 定义为:
$$ \begin{align} speedup = \dfrac{1}{1-P} \end{align} $$引入能并行的处理器个数,那么进一步可以定义为:
$$ \begin{align} speedup = \dfrac{1}{\dfrac{P}{N} + (1-P)} \quad \text{其中 } P \text{ 为并行率}, N \text{ 为处理器个数} \end{align} $$Pareto 原则 陈述到 80%的处理器时间花在 20%的代码中.所以仔细分析代码,不要把时间花在并行/优化那部分不重要的代码.
在程序中有不同的方法使用线程,这里讨论 3 种线程设计模式,没有哪一种模式最好,每种模式都有相应适合的应用场合.
如上图,一个 Boss 线程创建其他 Worker 线程,并给它们分配任务,必要的话,并等待其他线程运行结束.通常 Boss 线程会在初始建立 Thread Pool 来为之后分配.尽管线程是轻量级的,但是创建它们仍是有开销的.
Peer 模式又叫做 workcrew 模式,一个 thread 创建其他 peer threads 当程序开始,但是如上图,与 Boss/worker 模式不同,这个 thread 之后也变成 peer thread 去处理自己的任务.
Pipeline 模式假定:
如上图, Pipeline 就像流水线一般,每个 thread 是一个长链中的一部分.每个 thread 处理由之前 thread 过的数据.
如上线程中的定义,线程们共享进程中的全局变量或资源,它们可以并行同时对这些数据和资源操作,如果没有一定的机制协调它们,那么数据或资源将处于一个不安全状态,引起诸如如下的一些问题:
所以我们需要如下的一些线程同步原语满足不同的线程间同步需求.
Mutex 又被称为 Lock,所以它就像一把 Lock,一个线程 Lock 住一段资源,那么其他线程就不能去访问那段资源,只有等到第一个线程 Unlock 那么资源,它才能访问.
在 Lock 和 Unlock 之间的代码,一般被称为 critical section.
Mutex 也包含一些复杂的类型,如下:
但 Mutex 也会引入其他一些问题,如deadlock 和 priority inversion.
在 Blog 中之前浅谈 Mutex (Lock)中可以看到更多有关 Mutex 的性能和开销分析,并如何实现一个轻量级的 Mutex.
线程 join 机制能让一个线程 join 到另外一个线程中.比如一个子线程 join 回主线程,那么主线程就会等待子线程运行结束.从而达到线程间等待的同步机制.
Condition variable 允许线程同步到某个共享资源的某个值.
比如,程序有一个计数器,当计数器达到某一个值时去激活某个线程运行.把计数器当成一个 Condition variable.这个线程可以等待这个 Condition variable,其他 active 线程操作完这个 Condition variable,可以通过 signal/broadcast 去唤醒那些等待这个 Condition variable 睡眠的线程.
Barrier 是一种能让一系列线程在某个点得到同步的方法,通过让参与 barrier 的线程等待直到所有参与线程都调用了这个 barrier 函数.本质上就是,阻塞所有参与 barrier 的线程直到最慢的那个参与线程调用 barrier.
Spinlock 与 mutex 类似,是种锁,但当获取锁失败时,spinlock 不会让线程进入睡眠,而是不断 poll 去获取这个锁直到获取成功.更多Mutex 与 Spinlock 的区别.
当某些资源具有多个时,简单的 Mutex 不能满足,引入 Semphore,Semphore 可以根据资源个数初始化为任意值.当线程们占有所有资源,使得 Semphore 为 0,那么其他线程再获取资源只有等待.当 Semphore 值只能是 1 或 0 时,它相当于简单的 Mutex.
原始的 Pthread API 由 ANSI/IEEE POSIX 1003.1 - 1995 standard 定义.POSIX 标准也随着时间不断改进.
接下来主要把 Pthread API 分成如下主要 5 部分:
如果想把 Pthread 封装成类对象或 Scoped Lock,可以参考之后 Google wrap the Pthread,或直接使用之后介绍的Boost thread library.
如果更全面的 API 参考文章最后的Pthread Library Routines Reference.更多有关资料参考文章后的其他资料.
对于 POSIX 系统,包含头文件 pthread.h
. 如果使用 semaphore
, 包含
semaphore.h
.
#include <pthread.h> #include <semaphore.h>
对于 Gcc 编译器,使用选项 -l
,如下:
gcc Program.o -o Program -lpthread
int pthread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine)(void*), void *arg); void pthread_exit(void *value_ptr); int pthread_cancel(pthread_t thread); int pthread_attr_init(pthread_attr_t *attr); int pthread_attr_destroy(pthread_attr_t *attr);
pthread_create
创建一个新的线程并运行它.它能在代码的任何处被多次调用.
pthread_create
的参数:
thread
:返回新 thread 程的唯一标识.
attr
:设置 thread 的性质.NULL 为默认性质.
start_routine
: 新 thread 运行的函数指针.
arg
:传给 start_routine
的参数,必须强制转换成 void *
.NULL 为没有参数传入.
Process 能创建的最大 thread 个数由系统配置决定.如下 Ubuntu 打印出的结果:
$ limit cputime unlimited filesize unlimited datasize unlimited stacksize 8MB coredumpsize 0kB memoryuse unlimited maxproc 62694 descriptors 1024 memorylocked 64kB addressspace unlimited maxfilelocks unlimited sigpending 62694 msgqueue 819200 nice 0 rt_priority 0 rt_time unlimited
pthread_attr_init
和 pthread_attr_destroy
被用来初始化/销毁 thread
性质对象.
性质包括:
Pthread APIs 并没有提供 binding threads 到特定 cpus/cores 的接口.但不同系统可能包含这功能,比如提供非标准的pthread_setaffinity_np
接口.
比如设置两个线程都在 core0 上运行,如下设置:
cpu_set_t cpus; CPU_ZERO(&cpus); CPU_SET(0, &cpus); pthread_setaffinity_np(thread[0], sizeof(cpu_set_t), &cpus); pthread_setaffinity_np(thread[1], sizeof(cpu_set_t), &cpus);
一个线程有很多种方法终止:
pthread_exit
无论它的工作完成否.
pthread_cancel
来取消.
exec()
或 exit()
.
main()
函数先完成,没有调用 pthread_exit
.
pthread_exit()
允许指定一个可选的终止 status parameter
.这个可选参数一般返回给线程”joining”到这个终止线程.
pthread_exit()
不关闭文件,在线程打开的任何文件将继续打开在线程终止后.
在 main()
调用 pthread_exit()
:
main()
在它创建的 threads 之前终止,并没有显式地调用
pthread_exit()
,这将是个问题.所有创建的线程将终止因为 main()结束,不再存在支持这些线程.
pthread_exit()
, main()将阻塞并保持存活来支持它创建的线程运行直到它们完成.
如果注释掉 main()中最后的 pthread_exit(NULL);
,那么它创建的线程将会完成不了所有的打印而被强制退出.
#include <pthread.h>
#include <cstdio>
#include <cstdlib>

// Thread body: prints ten progress lines tagged with the id passed in
// through the void* parameter, then terminates this thread only.
void *ThreadProc(void *param) {
  int id = *(static_cast<int *>(param));
  for (int i = 0; i < 10; ++i) {
    printf("thread %d: run %d \n", id, i);
  }
  pthread_exit(NULL);
}

int main(int argc, char *argv[]) {
  const int kNumThreads = 4;
  pthread_t threads[kNumThreads];
  int thread_ids[kNumThreads];

  for (int i = 0; i < kNumThreads; ++i) {
    thread_ids[i] = i;
    int rt = pthread_create(&threads[i], NULL, ThreadProc,
                            static_cast<void *>(&thread_ids[i]));
    if (rt) {
      printf("ERROR: pthread_create failed, rt=%d\n", rt);
      exit(1);
    }
  }

  // Leaving main() via pthread_exit() keeps the process alive until the
  // worker threads finish; a plain return would tear them down.
  pthread_exit(NULL);
}
int pthread_join(pthread_t thread, void **value_ptr); int pthread_detach(pthread_t thread); int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate); int pthread_attr_getdetachstate(const pthread_attr_t *attr, int *detachstate);
Joining 是同步不同线程的方法之一,原理如下图:
pthread_join()
阻塞调用它的线程直到指定的 threadid
的线程终止.
status
只要目标线程调用
pthread_exit()
.
pthread_create()
中的 attr
参数.典型的步骤是:
pthread_attr_t
类型的 pthread 属性;
pthread_attr_init()
初始化属性变量;
pthread_attr_setdetachstate()
设置 detached 属性;
pthread_attr_destroy()
释放属性使用的资源.
pthread_detach()
能显式地 detach 一个线程,即使它是以可 join 方式创建的.
#include <pthread.h>
#include <cstdio>
#include <cstdlib>

// Thread body: prints ten lines, then returns its own parameter as the
// exit status so the joining thread can read it back.
void *ThreadProc(void *param) {
  int id = *(static_cast<int *>(param));
  for (int i = 0; i < 10; ++i) {
    printf("thread %d: run %d \n", id, i);
  }
  pthread_exit(param);
}

int main(int argc, char *argv[]) {
  const int kNumThreads = 4;
  pthread_t threads[kNumThreads];
  int thread_ids[kNumThreads];

  // Explicitly request joinable threads (the portable default).
  pthread_attr_t attr;
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

  for (int i = 0; i < kNumThreads; ++i) {
    thread_ids[i] = i;
    int rt = pthread_create(&threads[i], &attr, ThreadProc,
                            static_cast<void *>(&thread_ids[i]));
    if (rt) {
      printf("ERROR: pthread_create failed, rt=%d\n", rt);
      exit(1);
    }
  }

  for (int i = 0; i < kNumThreads; ++i) {
    void *status;
    int rt = pthread_join(threads[i], &status);
    if (rt) {
      printf("ERROR: pthread_join failed, rt=%d\n", rt);
      exit(1);
    }
    printf("completed join with thread %d having a status of %d\n",
           i, *static_cast<int *>(status));
  }

  pthread_exit(NULL);
}
int pthread_attr_getstacksize(const pthread_attr_t *restrict attr, size_t *restrict stacksize); int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize); int pthread_attr_getstackaddr(const pthread_attr_t *restrict attr, void **restrict stackaddr); int pthread_attr_setstackaddr(pthread_attr_t *attr, void *stackaddr);
每个线程都有各自独立的 stack, pthread_attr_getstackaddr
和
pthread_attr_setstackaddr
分别获取和设置线程的 stack 属性.
#include <pthread.h>
#include <cstdio>
#include <cstdlib>

// Shared attribute object: workers read the configured stack size from
// it, so it must stay alive (undestroyed) while they run.
pthread_attr_t attr;

// Thread body: reports the stack size it was created with, then prints
// ten progress lines.
void *ThreadProc(void *param) {
  int id = *(static_cast<int *>(param));
  size_t thread_stack_size;
  pthread_attr_getstacksize(&attr, &thread_stack_size);
  // %zu is the correct conversion for size_t; the original used %d,
  // which is undefined behavior where size_t is wider than int.
  printf("thread %d: stack size = %zu\n", id, thread_stack_size);
  for (int i = 0; i < 10; ++i) {
    printf("thread %d: run %d \n", id, i);
  }
  pthread_exit(NULL);
}

int main(int argc, char *argv[]) {
  const int kNumThreads = 4;
  const int kThround = 1000;
  pthread_t threads[kNumThreads];
  int thread_ids[kNumThreads];
  size_t stack_size;

  pthread_attr_init(&attr);
  pthread_attr_getstacksize(&attr, &stack_size);
  printf("Default stack size = %zu\n", stack_size);

  stack_size = sizeof(double) * kThround * kThround;
  printf("Setting stack size = %zu\n", stack_size);
  pthread_attr_setstacksize(&attr, stack_size);

  for (int i = 0; i < kNumThreads; ++i) {
    thread_ids[i] = i;
    int rt = pthread_create(&threads[i], &attr, ThreadProc,
                            static_cast<void *>(&thread_ids[i]));
    if (rt) {
      printf("ERROR: pthread_create failed, rt=%d\n", rt);
      exit(1);
    }
  }

  // Join instead of pthread_exit(): the original called
  // pthread_attr_destroy() *after* pthread_exit(NULL), so the cleanup
  // (and return 0) could never execute.  Joining waits for the workers
  // and then lets the attribute be destroyed safely.
  for (int i = 0; i < kNumThreads; ++i) {
    pthread_join(threads[i], NULL);
  }
  pthread_attr_destroy(&attr);
  return 0;
}
pthread_t pthread_self(void); int pthread_equal(pthread_t t1, pthread_t t2); int pthread_once(pthread_once_t *once_control, void (*init_routine)(void)); pthread_once_t once_control = PTHREAD_ONCE_INIT;
pthread_self
返回调用线程的唯一 thread ID.
pthread_equal
比较两个线程 ID 是否相等.
pthread_once
只执行 init_routine
仅仅一次在进程中.
Mutex 以”mutual exclusion”(互斥)简称.
Mutex variable 就像一把”锁”一样保护共享数据资源.mutex 的基本概念就是,只有一个线程能 lock 一个 mutex 变量在任何时候.所以,即使很多线程尝试去锁一个 mutex,也仅仅只有一个线程能成功.
典型使用 mutex 的顺序如下:
int pthread_mutex_destroy(pthread_mutex_t *mutex); int pthread_mutex_init(pthread_mutex_t *restrict mutex, const pthread_mutexattr_t *restrict attr); pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; int pthread_mutexattr_destroy(pthread_mutexattr_t *attr); int pthread_mutexattr_init(pthread_mutexattr_t *attr);
Mutex 变量由 pthread_mutex_t
声明定义,而且必须初始化在使用前.两种方法初始:
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_init()
函数,并能设置 mutex 的属性 attr
.
attr
用来设置 mutex 变量的属性,必须是 pthread_mutexattr_t
类型.Pthread 标准中定义的 3 种可选 mutex 属性:
int pthread_mutex_lock(pthread_mutex_t *mutex); int pthread_mutex_trylock(pthread_mutex_t *mutex); int pthread_mutex_unlock(pthread_mutex_t *mutex);
pthread_mutex_lock()
函数被用来获取传入的 mutex 变量,如果 mutex 已经被其他线程占用,那么这个调用就阻塞调用线程,使它进入睡眠等待这个 mutex 直到它被释放.
pthread_mutex_trylock()
仅尝试获取锁,若不成功也立即返回’busy’信号.
#include <pthread.h>
#include <cstdio>
#include <cstdlib>

// Per-thread argument bundle: a thread id plus the value this thread
// adds to the shared counter.
struct ThreadData {
  int tid;
  int data;
};

int shared_x;          // shared counter, guarded by `lock`
pthread_mutex_t lock;  // serializes access to shared_x

// Thread body: adds its payload to shared_x inside the critical section.
void *ThreadProc(void *param) {
  ThreadData *data = static_cast<ThreadData *>(param);
  printf("begin from thread id: %d\n", data->tid);

  pthread_mutex_lock(&lock);
  shared_x += data->data;
  printf("thread %d: x = %d\n", data->tid, shared_x);
  pthread_mutex_unlock(&lock);

  pthread_exit(NULL);
}

int main(int argc, char *argv[]) {
  const int kNumThreads = 4;
  pthread_t threads[kNumThreads];
  ThreadData threads_data[kNumThreads];
  pthread_attr_t attr;

  shared_x = 0;
  pthread_mutex_init(&lock, NULL);
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

  for (int i = 0; i < kNumThreads; ++i) {
    threads_data[i].tid = i;
    threads_data[i].data = i * i;
    int rt = pthread_create(&threads[i], &attr, ThreadProc,
                            static_cast<void *>(&threads_data[i]));
    if (rt) {
      printf("ERROR: pthread_create failed, rt=%d\n", rt);
      exit(1);
    }
  }

  for (int i = 0; i < kNumThreads; ++i) {
    void *status;
    pthread_join(threads[i], &status);
  }

  pthread_attr_destroy(&attr);
  pthread_exit(NULL);
  return 0;
}
Mutex 变量如锁一般防止多个线程访问共享数据资源,如果某个线程等待某个共享数据达到某个数值才进行相应的操作,那么这个线程需要不断的去 poll,查看是否满足需要的值,这样开销很大,因为线程需要一直处于忙状态.
引入 Condition Variables 来完成这样的同步到某个实际数据值而不要不断 poll.
Condition 变量一般与 mutex 一起使用.锁住查看的共享数据资源.
使用 Condition 的一般步骤如下:
int pthread_cond_destroy(pthread_cond_t *cond); int pthread_cond_init(pthread_cond_t *restrict cond, const pthread_condattr_t *restrict attr); int pthread_condattr_destroy(pthread_condattr_t *attr); int pthread_condattr_init(pthread_condattr_t *attr);
Condition 变量由 pthread_cond_t
声明定义,而且必须初始化在使用前.两种方法初始:
pthread_cond_t convar = PTHREAD_COND_INITIALIZER;
pthread_cond_init()
函数,并能设置 condition 的属性 attr
.
attr
用来设置 condition 变量的属性,必须是 pthread_condattr_t
类型.只有一种属性可选:是否进程共享,也就是允许其他进程中的线程也能看到它.
int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex); int pthread_cond_signal(pthread_cond_t *cond); int pthread_cond_broadcast(pthread_cond_t *cond);
pthread_cond_wait()
阻塞调用它的线程直到其中 cond
被 signal.这个函数需要在占有 mutex 时被调用,而它将 自动释放 mutex 当它等待时.等到
signal 收到,线程被唤醒, mutex 将 自动被占有 .最后当线程完成
condition 的操作,要负责对 mutex 解锁.
pthread_cond_signal()
用来 signal 其他等待这个 cond
的线程.它需要在占有 mutex 时被调用.然后必须对 mutex 解锁来完成 pthread_cond_wait
的等待.
如果有多余一个线程处于等待 cond
而阻塞, 应该用
pthread_cond_broadcast()
替换 pthread_cond_signal()
.
#include <pthread.h>
#include <unistd.h>

#include <cstdio>
#include <cstdlib>

const int kNumThreads = 3;   // one watcher + two incrementer threads
const int kLoops = 10;       // increments performed by each incrementer
const int kCountLimit = 15;  // watcher is released once g_count reaches this

int g_count;                  // shared counter, guarded by count_mutex
pthread_mutex_t count_mutex;  // protects g_count
pthread_cond_t count_cv;      // signaled when g_count hits kCountLimit

// Incrementer thread: bumps g_count kLoops times and signals the
// condition variable exactly once, on the increment that reaches
// kCountLimit (increments are serialized by the mutex, so the counter
// passes through the limit exactly once).
void *IncreaseCount(void *param) {
  int id = *(static_cast<int *>(param));
  for (int i = 0; i < kLoops; ++i) {
    pthread_mutex_lock(&count_mutex);
    g_count++;
    if (g_count == kCountLimit) {
      // Signaling while holding the mutex is legal; the waiter simply
      // re-acquires the mutex when it wakes up.
      pthread_cond_signal(&count_cv);
      // BUG FIX: corrected "increse" -> "increase" in the log messages.
      printf("increase thread %d: count = %d, signal cond\n", id, g_count);
    }
    printf("increase thread %d: count = %d, unlock mutex\n", id, g_count);
    pthread_mutex_unlock(&count_mutex);
    sleep(1);
  }
  pthread_exit(NULL);
}

// Watcher thread: sleeps on the condition variable until g_count has
// reached kCountLimit. The predicate is checked in a while loop so that
// spurious wakeups are harmless, and checked before the first wait so a
// signal cannot be missed.
void *WatchCount(void *param) {
  int id = *(static_cast<int *>(param));
  pthread_mutex_lock(&count_mutex);
  while (g_count < kCountLimit) {
    pthread_cond_wait(&count_cv, &count_mutex);
    printf("watch thread %d: count = %d, receive signal\n", id, g_count);
  }
  pthread_mutex_unlock(&count_mutex);
  pthread_exit(NULL);
}

int main(int argc, char *argv[]) {
  pthread_t threads[kNumThreads];
  int thread_ids[kNumThreads];
  pthread_attr_t attr;

  pthread_mutex_init(&count_mutex, NULL);
  pthread_cond_init(&count_cv, NULL);
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

  for (int i = 0; i < kNumThreads; ++i) {
    thread_ids[i] = i;
  }

  int rt;
  rt = pthread_create(&threads[0], &attr, WatchCount,
                      static_cast<void *>(&thread_ids[0]));
  if (rt) {
    printf("ERROR: pthread_create failed, rt=%d\n", rt);
    exit(1);
  }
  rt = pthread_create(&threads[1], &attr, IncreaseCount,
                      static_cast<void *>(&thread_ids[1]));
  if (rt) {
    printf("ERROR: pthread_create failed, rt=%d\n", rt);
    exit(1);
  }
  rt = pthread_create(&threads[2], &attr, IncreaseCount,
                      static_cast<void *>(&thread_ids[2]));
  if (rt) {
    printf("ERROR: pthread_create failed, rt=%d\n", rt);
    exit(1);
  }

  for (int i = 0; i < kNumThreads; ++i) {
    pthread_join(threads[i], NULL);
  }

  pthread_attr_destroy(&attr);
  pthread_cond_destroy(&count_cv);
  pthread_mutex_destroy(&count_mutex);
  // BUG FIX: the original ended main with pthread_exit(NULL); all
  // threads are already joined, so return normally instead.
  return 0;
}
Barrier 就是栅栏一样,调用等待 barrier 的线程需要等待直到满足调用 barrier 的线程个数达到要求的 count
.
int pthread_barrier_init(pthread_barrier_t *barrier, const pthread_barrierattr_t *attr, unsigned count); pthread_barrier_t barrier = PTHREAD_BARRIER_INITIALIZER(count); int pthread_barrier_destroy(pthread_barrier_t *barrier); int pthread_barrierattr_init(pthread_barrierattr_t *attr); int pthread_barrierattr_destroy(pthread_barrierattr_t *attr); int pthread_barrier_wait(pthread_barrier_t *barrier);
Barrier 变量由 pthread_barrier_t
声明定义,而且必须初始化在使用前.需要传入满足 barrier 等待的个数 count
, 两种方法初始:
pthread_barrier_t barrier = PTHREAD_BARRIER_INITIALIZER(count);
pthread_barrier_init()
函数,并能设置 barrier 的属性 attr
.
线程调用 barrier,只需要调用 pthread_barrier_wait
来等待 barrier 达到满足条件.
Google api 的 base 包里封装了 Mutex 类和 CondVar 类.
namespace base { enum LinkerInitialized { LINKER_INITIALIZED }; } class LOCKABLE PThreadMutex { public: explicit PThreadMutex(base::LinkerInitialized) { pthread_mutex_init(&mutex_, NULL); } PThreadMutex() { pthread_mutex_init(&mutex_, NULL); } ~PThreadMutex() { pthread_mutex_destroy(&mutex_); } void Lock() { CHECK_EQ(0, pthread_mutex_lock(&mutex_)); } void Unlock() { CHECK_EQ(0, pthread_mutex_unlock(&mutex_)); } private: friend class PThreadCondVar; pthread_mutex_t mutex_; DISALLOW_COPY_AND_ASSIGN(PThreadMutex); }; class PThreadCondVar { public: PThreadCondVar() { pthread_cond_init(&cv_, NULL); } ~PThreadCondVar() { CHECK_EQ(0, pthread_cond_destroy(&cv_)); } void Signal() { CHECK_EQ(0, pthread_cond_signal(&cv_)); } void SignalAll() { CHECK_EQ(0, pthread_cond_broadcast(&cv_)); } void Wait(PThreadMutex* mu) { CHECK_EQ(0, pthread_cond_wait(&cv_, &mu->mutex_)); } bool WaitWithTimeout(PThreadMutex* mu, int64 millis) { struct timeval tv; struct timespec ts; gettimeofday(&tv, NULL); ts.tv_sec = tv.tv_sec + millis / 1000; ts.tv_nsec = millis % 1000; int result = pthread_cond_timedwait(&cv_, &mu->mutex_, &ts); if (!result) return true; CHECK_EQ(ETIMEDOUT, result); return false; } private: pthread_cond_t cv_; DISALLOW_COPY_AND_ASSIGN(PThreadCondVar); }; typedef PThreadCondVar CondVar; typedef PThreadMutex Mutex;
typedef pthread_once_t GoogleOnceType; #define GOOGLE_ONCE_INIT PTHREAD_ONCE_INIT inline void GoogleOnceInit(GoogleOnceType* once, void (*initializer)()) { CHECK_EQ(0, pthread_once(once, initializer)); }
使用 Linux 自带的 ps
命令查看运行的 thread 情况,ps 的 man 手册.
➜$ ps -Lf UID PID PPID LWP C NLWP STIME TTY TIME CMD shougang 13103 8814 13103 0 1 23:30 pts/17 00:00:00 /bin/zsh shougang 13237 13103 13237 0 6 23:30 pts/17 00:00:00 [thread] shougang 13237 13103 13240 0 6 23:30 pts/17 00:00:00 [thread] shougang 13237 13103 13241 0 6 23:30 pts/17 00:00:00 [thread] shougang 13237 13103 13242 0 6 23:30 pts/17 00:00:00 [thread] shougang 13237 13103 13243 0 6 23:30 pts/17 00:00:00 [thread] shougang 13237 13103 13244 0 6 23:30 pts/17 00:00:00 [thread]
Linux 的 top
命令加上 -H
参数, process 中的 threads 也能看到.
如下是 top -H
的一个例子:
直到 C++11 库才比较好的支持 thread,之前 C++程序使用操作系统支持的 thread 库(如 Pthread).但这样做至少有个主要的问题:(1) 这些库基本是 C 的库,需要很小心的 C++中使用,和(2) 每个操作系统提供自己的一套对 thread 支持的库.以致,编写的代码既不标准又不可移植.
Boost Thread可以解决这两个主要问题. Boost Thread 不是通过继承来使用线程,而是 Boost 的 thread 类使用一个 Callable 的对象创建.
根据使用到的 Boost Thread 中的类型包含不同头文件:
#include <boost/thread/thread.hpp> #include <boost/thread/mutex.hpp> #include <boost/thread/condition.hpp> #include <boost/thread/locks.hpp> #include <boost/thread/once.hpp>
对于 Gcc 编译器,使用选项 -l
,如下:
g++ Program.o -o Program -lboost_thread -lboost_system
#include <boost/thread/thread.hpp> class thread { public: thread() noexcept; thread(const thread&) = delete; thread& operator=(const thread&) = delete; thread(thread&&) noexcept; thread& operator=(thread&&) noexcept; ~thread(); template <class F> explicit thread(F f); template <class F> thread(F &&f); template <class F,class A1,class A2,...> thread(F f,A1 a1,A2 a2,...); template <class F, class ...Args> explicit thread(F&& f, Args&&... args); template <class F> explicit thread(attributes& attrs, F f); // EXTENSION template <class F> thread(attributes& attrs, F &&f); // EXTENSION template <class F, class ...Args> explicit thread(attributes& attrs, F&& f, Args&&... args); class id; id get_id() const noexcept; bool joinable() const noexcept; void join(); ... };
整个 thread 类包含 thread 的所有特性,如 thread id, join, detach 等.
Callable 对象既可以是一个函数又可以是类中的 operator()
实现,如下:
// A Callable handed to boost::thread can be either a free function or
// an object whose type defines operator().
void hello() { cout << "hello world" << endl; }

struct Hello {
 public:
  void operator() () { cout << "hello world" << endl; }
};

Hello h;
// BUG FIX: the original declared both threads with the same name
// `thread_hello`, a redefinition that does not compile; give each
// thread its own variable.
boost::thread thread_functor(h);       // run the function object
boost::thread thread_function(hello);  // run the free function
传递参数给线程
// Arguments listed after the callable are copied and handed to the
// thread function when the new thread starts.
void hello(const string &str) { cout << str << endl; } string str = "hello"; boost::thread thrd(hello, str);
Boost.bind
库接口
// Equivalent form using Boost.Bind: bind the argument into a nullary
// callable and hand that to the thread.
void hello(const string &str) { cout << str << endl; }

// BUG FIX: the original declared `str` twice on the same line, which is
// a redefinition error; declare it once.
string str = "hello";
boost::thread thrd(bind(hello, str));
可以使用 thread group 类管理 thread,通过 add_thread
和 create_thread
添加线程到管理类中, 可以直接 join_all
将所有管理类中的线程 join.
#include <boost/thread/thread.hpp> class thread_group { public: thread_group(const thread_group&) = delete; thread_group& operator=(const thread_group&) = delete; thread_group(); ~thread_group(); template<typename F> thread* create_thread(F threadfunc); void add_thread(thread* thrd); void remove_thread(thread* thrd); bool is_this_thread_in(); bool is_thread_in(thread* thrd); void join_all(); void interrupt_all(); int size() const; };
Boost Thread 中还有 strict_scoped_thread
类和 scoped_thread
类,提供线程结束不是调用 terminate
,而是调用传入的参数来执行特定行为.
#include <iostream> #include <string> #include <boost/thread/thread.hpp> #include <boost/bind.hpp> using namespace std; using namespace boost; void hello(const string &str) { cout << str << endl; } int main(){ string str = "hello"; boost::thread thrd(bind(hello, str)); thrd.join(); return 0; }
#include <boost/thread/mutex.hpp> class mutex: boost::noncopyable { public: mutex(); ~mutex(); void lock(); bool try_lock(); void unlock(); typedef platform-specific-type native_handle_type; native_handle_type native_handle(); typedef unique_lock<mutex> scoped_lock; typedef unspecified-type scoped_try_lock; };
lock()
来获取锁.
unlock()
释放锁.
typedef unique_lock<mutex> scoped_lock;
定义了 scoped_lock
的类型,通过 boost::mutex::scoped_lock
来定义一个 RAII-style 锁,离开定义区域自动释放锁.
lock_guard
boost::lock_guard
非常简单:
它提供了一个简单的 RAII-style 锁对象,使得 exception-safe 锁和解锁更容易.
namespace boost { template<typename Lockable> class lock_guard #if ! defined BOOST_THREAD_NO_MAKE_LOCK_GUARD template <typename Lockable> lock_guard<Lockable> make_lock_guard(Lockable& mtx); // EXTENSION template <typename Lockable> lock_guard<Lockable> make_lock_guard(Lockable& mtx, adopt_lock_t); // EXTENSION #endif }
基本使用,传入可 Lockable
的 mutex 类型:
boost::mutex count_mutex; boost::lock_guard<mutex> lock(count_mutex) ;
#include <iostream> #include <boost/thread/thread.hpp> #include <boost/thread/mutex.hpp> using std::cout; using std::endl; boost::mutex count_mutex; struct count { count(int id) : id_(id) { } void operator() () { for (int i = 0; i < 10; ++i) { boost::mutex::scoped_lock lock(count_mutex); cout << id_ << ": " << i << endl; } } int id_; }; int main(int argc, char *argv[]) { boost::thread thread1(count(1)); boost::thread thread2(count(2)); thread1.join(); thread2.join(); return 0; }
与 Pthread, Boost Condition Variable 功能更全面,如不同条件的
wait_until
, wait_for
等功能.
namespace boost { class condition_variable { public: condition_variable(); ~condition_variable(); void notify_one() noexcept; void notify_all() noexcept; void wait(boost::unique_lock<boost::mutex>& lock); template<typename predicate_type> void wait(boost::unique_lock<boost::mutex>& lock,predicate_type predicate); template <class Clock, class Duration> typename cv_status::type wait_until( unique_lock<mutex>& lock, const chrono::time_point<Clock, Duration>& t); ... };
利用 Condition Variables 实现一个简单的 read/writer Buffer.
#include <boost/thread/thread.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/thread/condition.hpp>
#include <iostream>

using std::cout;
using std::endl;

const int kBufSize = 10;  // ring-buffer capacity
const int kIters = 100;   // items produced/consumed by each side

boost::mutex io_mutex;  // serializes console output only

// Bounded ring buffer shared by one writer and one reader. A single
// condition variable serves both "not full" (Put waits on it) and
// "not empty" (Get waits on it).
class Buffer {
 public:
  typedef boost::mutex::scoped_lock scoped_lock;

  Buffer() : p(0), c(0), full(0) { }

  // Blocks while the buffer is full, then stores m and wakes one waiter.
  void Put(int m) {
    scoped_lock lock(mutex);
    if (full == kBufSize) {
      {
        // Inner scope: take the io lock only for the message, while
        // still holding the buffer mutex.
        scoped_lock lock(io_mutex);
        cout << "Buffer is full." << endl;
      }
      // Re-check the predicate in a loop: wait() can wake spuriously,
      // and another Put may refill the buffer before we run.
      while (full == kBufSize) {
        cond.wait(lock);
      }
    }
    buf[p] = m;
    p = (p + 1) % kBufSize;
    ++full;
    cond.notify_one();
  }

  // Blocks while the buffer is empty, then removes and returns one item.
  int Get() {
    scoped_lock lock(mutex);
    if (full == 0) {
      {
        scoped_lock lock(io_mutex);
        cout << "Buffer is empty." << endl;
      }
      while (full == 0) {
        cond.wait(lock);
      }
    }
    int i = buf[c];
    c = (c + 1) % kBufSize;
    --full;
    cond.notify_one();
    return i;
  }

 private:
  boost::mutex mutex;       // guards p, c, full and buf
  boost::condition cond;    // signaled after every successful Put/Get
  unsigned int p, c, full;  // producer index, consumer index, item count
  int buf[kBufSize];
};

Buffer buf;

// Producer thread: pushes 0..kIters-1 into the shared buffer.
void writer() {
  for (int i = 0; i < kIters; ++i) {
    {
      boost::mutex::scoped_lock lock(io_mutex);
      cout << "sending: " << i << endl;
    }
    buf.Put(i);
  }
}

// Consumer thread: pops kIters items from the shared buffer.
void reader() {
  for (int i = 0; i < kIters; ++i) {
    int n = buf.Get();
    {
      boost::mutex::scoped_lock lock(io_mutex);
      cout << "received: " << n << endl;
    }
  }
}

int main(int argc, char *argv[]) {
  boost::thread thread_reader(&reader);
  boost::thread thread_writer(&writer);
  thread_reader.join();
  thread_writer.join();
  return 0;
}
基本使用:
barrier b(num_threads)
.
b.wait()
.
class barrier { public: barrier(barrier const&) = delete; barrier& operator=(barrier const&) = delete; barrier(unsigned int count); template <typename F> barrier(unsigned int count, F&&); ~barrier(); bool wait(); void count_down_and_wait(); };
#include <boost/thread/once.hpp> namespace boost { struct once_flag; template<typename Function, class ...ArgTypes> inline void call_once(once_flag& flag, Function&& f, ArgTypes&&... args); #if defined BOOST_THREAD_PROVIDES_DEPRECATED_FEATURES_SINCE_V3_0_0 void call_once(void (*func)(),once_flag& flag); #endif }
#include <boost/thread/thread.hpp>
#include <boost/thread/once.hpp>
#include <iostream>

int i = 0;
boost::once_flag flag = BOOST_ONCE_INIT;  // guards the one-time init

// Runs at most once across all threads, no matter how many call it.
void init() { ++i; }

void thread() {
  // FIX: use the current call_once(flag, f) argument order; the
  // (func, flag) order used originally is the deprecated legacy
  // overload (kept only behind BOOST_THREAD_PROVIDES_DEPRECATED_
  // FEATURES_SINCE_V3_0_0, as shown in the declaration above).
  boost::call_once(flag, &init);
}

int main(int argc, char* argv[]) {
  boost::thread thrd1(&thread);
  boost::thread thrd2(&thread);
  thrd1.join();
  thrd2.join();
  std::cout << i << std::endl;  // always prints 1
  return 0;
}
pthread_atfork
pthread_attr_destroy
pthread_attr_getdetachstate
pthread_attr_getguardsize
pthread_attr_getinheritsched
pthread_attr_getschedparam
pthread_attr_getschedpolicy
pthread_attr_getscope
pthread_attr_getstack
pthread_attr_getstackaddr
pthread_attr_getstacksize
pthread_attr_init
pthread_attr_setdetachstate
pthread_attr_setguardsize
pthread_attr_setinheritsched
pthread_attr_setschedparam
pthread_attr_setschedpolicy
pthread_attr_setscope
pthread_attr_setstack
pthread_attr_setstackaddr
pthread_attr_setstacksize
pthread_barrier_destroy
pthread_barrier_init
pthread_barrier_wait
pthread_barrierattr_destroy
pthread_barrierattr_getpshared
pthread_barrierattr_init
pthread_barrierattr_setpshared
pthread_cancel
pthread_cleanup_pop
pthread_cleanup_push
pthread_cond_broadcast
pthread_cond_destroy
pthread_cond_init
pthread_cond_signal
pthread_cond_timedwait
pthread_cond_wait
pthread_condattr_destroy
pthread_condattr_getclock
pthread_condattr_getpshared
pthread_condattr_init
pthread_condattr_setclock
pthread_condattr_setpshared
pthread_create
pthread_detach
pthread_equal
pthread_exit
pthread_getconcurrency
pthread_getcpuclockid
pthread_getschedparam
pthread_getspecific
pthread_join
pthread_key_create
pthread_key_delete
pthread_kill
pthread_mutex_destroy
pthread_mutex_getprioceiling
pthread_mutex_init
pthread_mutex_lock
pthread_mutex_setprioceiling
pthread_mutex_timedlock
pthread_mutex_trylock
pthread_mutex_unlock
pthread_mutexattr_destroy
pthread_mutexattr_getprioceiling
pthread_mutexattr_getprotocol
pthread_mutexattr_getpshared
pthread_mutexattr_gettype
pthread_mutexattr_init
pthread_mutexattr_setprioceiling
pthread_mutexattr_setprotocol
pthread_mutexattr_setpshared
pthread_mutexattr_settype
pthread_once
pthread_rwlock_destroy
pthread_rwlock_init
pthread_rwlock_rdlock
pthread_rwlock_timedrdlock
pthread_rwlock_timedwrlock
pthread_rwlock_tryrdlock
pthread_rwlock_trywrlock
pthread_rwlock_unlock
pthread_rwlock_wrlock
pthread_rwlockattr_destroy
pthread_rwlockattr_getpshared
pthread_rwlockattr_init
pthread_rwlockattr_setpshared
pthread_self
pthread_setcancelstate
pthread_setcanceltype
pthread_setconcurrency
pthread_setschedparam
pthread_setschedprio
pthread_setspecific
pthread_sigmask
pthread_spin_destroy
pthread_spin_init
pthread_spin_lock
pthread_spin_trylock
pthread_spin_unlock
pthread_testcancel
在之前 浅谈 Memory Reordering 中谈到编译器 reordering 和在多核下的处理器的 reordering,在 lock-free programming 中,如果不控制好这两者的 reordering 就会引起上文中所不想的结果.
你可以通过指令强制 CPU 和编译器在内存处理上的顺序,这些指令就被成为 Memory Barrier.
有很多指令作为 memory barriers,所以需要知道很多不同类型的 memory barriers. Doug Lea 指出如下的四大类可以很好的归纳在 CPU 上的特殊指令.尽管不是完全,大多数时候,一个正真的 CPU 指令执行包含上面 barrier 类型的各种组合,或附带其他效果.无论如何, 一旦你理解了这四种类型的 memory barriers,你就很好的理解了大部分真正 CPU 的关于内存约束的指令. Memory Barriers Are Like Source Control Operations 这篇把 Memory Barriers 与 Source Control 作类比,熟悉 Source Control 机制的可以很形象的理解各类 Memory Barriers 机制.
顺序: Load1; LoadLoad; Load2
保证 Load1 的数据加载在被 load2 和之后的 load 指令读取加载之前.是一个比较好的方法防止看到旧的数据.以这个经典的例子,CPU1 检查一个共享的标识变量 flag 来确认一些数据是否已被另一个 CPU(CPU2)更新.如果标识变量 flag 是 true 的话,把LoadLoad
barrier
放在读取更新数据之前:
1 2 3 4 |
|
只要is_updated
被 CPU1 看到为 true, LoadLoad
fence 防止 CPU1 读到比标识变量 flag 本身旧的value
.
顺序: Store1; StoreStore; Store2
保证 Store1 的数据被其他 CPU 看到在与这数据相关的 Store2 和之后的 store 指令之前.同样,它足够的防止其他 CPU 看到自己的旧数据.同上一样的例子,CPU1 需要更新一些数据到共享的内存中,把StoreStore
barrier 放在标识变量 flag 是 true
之前:
1 2 3 |
|
一旦其他 CPU 看到is_updated
为 true,它能自信它看到正确的value
值.而且
value
不需要原子类型,它可以是一个包含很多元素的大数据结构.
顺序: Load1; LoadStore; Store2
保证 Load1 的数据被加载在与这数据相关的 Store2 和之后的 store 指令之前.
顺序: Store1; StoreLoad; Load2
保证 Store1 的数据被其他 CPU 看到在数据被 Load2 和之后的 load 指令加载之前.也就是说,它有效的防止所有 barrier 之前的 stores 与所有 barrier 之后的 load 乱序.
StoreLoad
是唯一的.它是唯一的 memory barrier 类型来防止r1=r2=0
在之前
Memory ordering at processor time
中给出的例子.
StoreLoad
有什么区别与StoreStore
之后跟LoadLoad
?虽然,StoreStore
按序把存储改变推送到主内存中,LoadLoad
按序把改变加载过来,但是这两种类型的 barrier 是不够的.Store 可以延迟任意的指令,以致在 Load
之后,Load 也可以不是加载最新 Store 之后的内容.这就是为啥 PowerPC 的指令
lwsync
,包含这三种 memory barriers,LoadLoad
,LoadStore
和
StoreStore
,但不包含StoreLoad
,是不足以防止r1=r2=0
在那个实例中.
除了上面 4 大类,还有Loadload
的弱化模式的Data dependency barrier
.如
LoadLoad
类似,在两个 load 顺序执行,load2 依赖于 load1 的结果,Data
dependency barrier
需要插入保证两者的顺序.
但与LoadLoad
不同,Data dependency barrier
只是部分顺序约束在内在依赖的 load,就是 load1 必须与 load2 是 data dependency 而不是仅仅是
control dependency.
r1 与 r2 之间是 data dependency.
1 2 |
|
r1 与 r2 之间是 control dependency.
1 2 3 4 5 6 |
|
在 lock-free programming 中,共享内存被多个线程通过合作传递信息来处理,在这种处理下,acquire 和 release semantics 是关键技术保证可靠的传递信息在线程之间.
acqure 和 release semantics 并没有好的被定义,这里借用 Jeff Preshing 在 这里给予的定义:
Acquire semantics 是一种只能应用于如下操作的性质: 从共享内存读取,无论是 read-modify-write 操作还是普通的加载.这一操作被认为是一个 read acquire. Acquire semantics 防止 read acquire 程序上之后的任何读或写操作与它的内存乱序.
Release semantics 是一种只能应用于如下操作的性质: 写入到共享内存, 无论是 read-modify-write 操作还是普通的存储.这一操作被认为是一个 write release. Release semantics 防止 write release 程序上之前的任何读或写操作与它的乱序.
Acqure 和 release semantics 能通过之前四种 memory barrier 的简单组合来达到.
Acqure 和 release semantics 可以基本划分为如下结构:
在 X86/64 使用mfence
指令,mfence 是一个满足全部 memory barrier,防止任何类型的内存乱序.
C++11 的 atomic 库定义了一个可移植的函数atomic_thread_fence()
,输入一个变量来指定什么类型的 fence.
在 C++11 中,可以直接对 atomic 变量直接约束 fence,而不是显示的明确 fence.与上面明确 fence 相比,这实际是更优的方法来表达 acquire and release semantics 在 C++11 中.
Happens-before 是一个术语来描述 C++11,Java,LLVM 之类背后的软件内存模型.
在之上每个语言里都能找到* happends-before *的定义,尽管每个都有不同的说法,但内在意思基本一致.粗略地讲,基本定义如下:
A 和 B 表示一个多线程进行的操作.若 A happens-before B,那么,在 B 进行前,A 对 B 的内存影响有效的被 B 看到.
无论使用任何编程语言,它们都有一个共同处:如果操作 A 和 B 被同一个进程进行,A 的语句在 B 的语句之前在程序顺序上,那么 A 优先发生(happens-before)B.这也是在之前 Memory ordering 中谈到中心原则.
这里再次提一下指令重排序问题,有人有如下疑问: 指令重排序会破坏 happens-before 原则吗?happens-before 的程序次序原则说:在一个线程内,按照程序代码顺序,书写在前面的操作会先行发生于书写在后面的操作。如果线程内出现指令重排序,那不是破坏了程序次序原则了吗?
是会破坏程序次序的执行,但是并不破坏 happens-before 原则,并不造成内存对单线程有效性的破坏.这里主要的困惑是时间上顺序的发生之前(happening before)与先行发生(happens-before)两者关系.
时间上顺序的发生在前于(happening before)与先行发生(happens-before)两者是不一样的,基本没太大关系.特别:
谨记 happens-before 是由一系列编程语言特定定义的操作间的关系,它的存在独立于时间的概念.
如下例子有 happens-before 关系但并不是顺序执行,没有 happening before.如下代码:(1) 存储到 A,之后(2)存储到 B.根据程序顺序原则,(1) happens-before (2).
1 2 3 4 5 |
|
用 O2 打开优化编译的如下:
1 2 3 4 5 6 |
|
从汇编指令看出,第二句mov DWORD PTR B, 0
就已经完成对B
的存储,但是对A
的存储还没进行.(1)顺序上并没有在(2)之前执行!
但是 happens-before 原则有被违背吗?根据定义,(1)的内存效用必须有效被看到在进行(2)之前.也就是存储 A 必须影响存储 B.
在这里,存储 A 实际并没有影响存储 B.(2)被提前执行与之后执行仍然一样,相当与 (1)的内存有效性是一样的.因此,这并不算违背 happens-before 原则.
这是个时间上发生于前但并含有 happens-before 关系的例子.如下的代码,想象一个线程调用UpdateValue
,而另一个线程调用ConsumeValue
.因为处理共享的数据并行的,为了简单,认为普通的读取和存储int
是 atomic 的.因为程序顺序原则,在(1)和(2)之间 happens-before 关系,(3)和(4)之间 happens-before 关系.
1 2 3 4 5 6 7 8 9 10 11 12 |
|
进一步假设在运行开始的时候,(3)读取update
到为 1,这个值是有(2)在另外个线程中存储的.这里,我们可以得出时间顺序上(2)必须发生前于(3).但是这里并没有规则意味着在(2)和(3)之间有 happens-before 关系.(2)和(3)之间没有
happens-before 关系,(1)和(4)之间也没有 happens-before 关系.因此,(1)和(4)
的内存可以重排序,因为编译器重排序或在 CPU 上内存重排序,以致(4)可以打印
“0”,即使(3)读到 1.
那么接下来做 3 个关于 Mutex 的 Benchmark,具体分析一下 Mutex 的开销如何,最后并利用原子操作和 semaphore 实现一个 lightweight Mutex.
一个 Mutex 仅仅从 Lock 到 Unlock 具体开销是多少,是不是占用很多时间,从 Always Use a Lightweight Mutex 从可以看到在 windows 中有两种 Mutex:Muetx 和 Critical Section, 重量级和轻量级的区别,两者的时间开销相差 25 倍多,所以一直使用轻量级的 Mutex.
这篇文章在高强度下 lock 的性能:每个线程做任何事情都占用 lock(高冲突),lock 占用极短的时间 (高频率).值得一读,但是在实际应用中,基本避免如此使用 locks.这里对 Mutex Contention 和 Mutex Frequency 都做最好和最坏场景的使用测试.
Mutex 被建议避免使用也因为其他原因.现在有很多大家熟知的 lock-free programming 技术.Lock-free 编程非常具有挑战性,但在实际场景中获得巨大的性能.既然有 lock-free 的技术吸引我们使用它们,那么 locks 就显得索然无味了.
但也不能因此忽略 lock.因为在实际很多场景,它仍然是利器.
Linux 下的 POSIX thread 是轻量级的 Mutex.基于 Linux 特有的 futex 技术,当没有其他线程竞争锁时它被优化过.使用如下简单的例子,测试一个单线程 lock 和 unlock,所有代码在 Github 上.
1 2 3 4 5 6 7 |
|
插入相应的时间代码,算出 10 万次的单线程 lock/unlock 平均时间.在不同的处理器下,结果如下:
如果假设一个线程每分钟获取 1e5 次 mutex,并且没有其他线程与它竞争.基于如下的图,可预计 0.2%到 0.4%的开销.不算差.在比较低频率下,开销基本忽略不计.之后 Build own lightweight mutex,会利用 semaphore 和一个原子操作,实现一个 lightweight mutex.
POSIX thread 与 Windows Critical Section 不同,它不仅支持线程间的同步, 还支持进程间的同步.实例代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 |
|
在测试中,产生一个不断生成随机数的线程,使用自己编制的线程安全的 Mersenne Twister 实现代码.每过一段时间,它获取和释放一个锁,获取和释放锁之间的时间每次是随机的,但是总的平均时间是提前设计好的.这个随机的过程就是个泊松分布过程,计算出产生一个随机数的平均时间 6.25 ns 在 2.93 GHz i7 上,把它作为运行单位.利用 Poisson Process 的算法决定运行多少个运行单位在获取和释放锁之间.并利用 High Resolution TimeAPI 计算时间.这个线程的代码如下,所有代码在 Github 上:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
|
这里模拟获取和释放 15000 次锁每秒,从 1 个线程运行到 2 个线程,最后到 4 个线程.并且验证占用锁的时间,从 0%到 100%的每次运行时间占用锁.把 1 个线程的完成的工作量作为基准数据,其他的去除以它,计算相对增益.基本测试方案如下:
1 2 3 4 5 |
|
从图中看出,随着锁占用的时间增加,并行性越来越差,直到最后占用 60%以后,单线程运行的更好.可以说,短时间的占用锁的时间,以 10%以内,系统达到很高的并行性.虽然并不是完美的,但是也接近.锁总体很快.
把这个结果放到实际中,Jeff Preshing 在 这篇 提到,实际的游戏程序中,15000 的锁每秒来自 3 个线程,占用锁的时间相对 2%.在图中很适中的区域.
尽管一个 lightweight mutex 有开销,但如上测试在 2.40GHz i5 上,lock/unlock 锁开销约 34.2ns ,因此 15000 锁每秒开销很低以致不是严重影响结果.那么把锁的每秒频率提高呢?
只创建 2 个线程,进行一系列的锁的每秒频率测试在 2.40GHz i5 上,从占用锁时间 10 ns(1e8/s)到 100 us(1e4/s),用单线程的占用锁时间 10 ms 作为基准工作量,其他与它比较,测试方案如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 |
|
如预想一样,对于非常高频率的锁,锁的开销开始减少实际工作量.在网络上,可以找到很多同样的测试.图中下边的线条,对于这样高的频率,也就是占用锁的时间很短,就一些 CPU 的指令,这样的情况下,当锁之间的工作如此简单,那么一个 lock-free 的实现更适合.
我们获得了一大块关于锁的性能:从它进行很好的情况,到缓慢应用的情况.在考虑实际锁的使用情况,不能说所有锁都是慢的.必须承认,很容易乱用锁,但不用太担心,任何的瓶颈问题都会在细心的 profiling 中发现.当你考虑锁是如何的稳定, 相对容易的理解它们(与 lock-free 技术相比),锁有时候其实很好用.
我们也可以实现自己的简单轻量级的 mutex,但仅仅作为教育手段,理解 mutex 一些内在实现细节,实际现在操作系统都提供轻量级的 mutex,千万不要自己实现一个并实际使用,直接只用操作系统提供的即可.
网络上有很多种方法在用户层写自己的 mutex:
这里利用 Benaphore 技术,在 Linux 平台上利用 semaphore 和 atomic 操作实现自己的 C++版本的 lightweight mutex.这里并没有用 C++11 的原子库.所有代码在 Github 上.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
|
__sync_add_and_fetch
是一个由 GCC 内部提供的 atomic read-modify-write (RMW) 操作,它把 1 加到某个数并且返回新的数,在同一时间所有操作由一个线程原子操作完成,其他线程不能干涉,只能在后等待.这里counter_
初始化为 0,第一个线程调用Lock
将得到 1 从__sync_add_and_fetch
,然后跳过sem_wait
,一旦这个线程占用这个锁,
之后线程都将递增counter_
,获得大于 1 的数,从而调用sem_wait
等待.
之后,第一个线程完成自己的操作,调用Unlock
,__sync_sub_and_fetch
的返回值大于 1 说明有其他线程在等待这个 mutex,调用sem_post
唤醒其他线程.
上面使用了__sync_add_and_fetch
,它编译成lock xadd
指令如下.在没有竞争下的 lock/unlock 操作性能与 pthread mutex 相当.但是在 mutex 多线程竞争情况下,这个 mutex 性能没有 pthread mutex 好.
上面简单的 lightweight mutex 的局限性是它不能递归.也就是同一个线程试图获取同样的锁两次以上,将造成死锁(deadlock).递归锁在函数调用自己时很有用.比如在内存管理代码中,可能会遇到如下代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
|
Lock
是个封装好的 C++宏,用来获取锁并在退出函数时自动释放它.
可以看到,当传递NULL
给Realloc
,锁被Realloc
函数获取,然后第二次被获取当Alloc
被调用.
把它扩展成可递归的锁如下,加入 2 个新成员变量,owner_
,存储当前占有线程的
ID(TID),和recursion_
,存储递归的层数.基本代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
|
如之前一样,第一个线程调用Lock
,设置owner_
为自己的 TID,增加
recursion_
到 1.如果同一个线程再次调用Lock
,它将同时增加
recursion_
和counter_
.
之后,第一个线程完成自己的操作,调用Unlock
,同时减少recursion_
和counter_
,
仅仅调用sem_post
唤醒其他线程当recursion_
减少到0
.如果
recursion_
仍然大于 0,意味着当前的线程仍然占有此锁在外层程序.
最后进行压力测试,建立一些线程,每个随机获取锁,随机的递归层次.代码在 Github 上.
一些细节问题:
* 在Unlock
中,设置owner_
为 0 在调用__sync_sub_and_fetch
之前,否则可能发生死锁(deadlock).比如,有两个线程 TID 是 111 和 222.
1. 线程 111 完成操作调用Unlock
,先调用__sync_sub_and_fetch
把counter_
减到 0
2. 在设置owner_
为 0 被中断,线程 222 得到运行,它调用Lock
,发现counter_
为 0,跳过sem_wait
,设置owner_=222
,完成Lock
操作.
3. 线程 222 被中断调出,线程 111 重新得到运行,设置owner_
为 0,然后完成Unlock
操作.
4. 因为此时owner_
为 0,线程 222 不能再递归占用锁,一旦它再次获取锁,形成死锁.
在Unlock
中,recursion_
被拷贝到本地变量一次,之后只使用这个本地变量,比如没有在__sync_sub_and_fetch
之后重新读取它.因为在那之后它可能已经被其他线程改变.
recursion_
和owner_
没有原子操作.因为它们在调用Lock
的
__sync_add_and_fetch
和调用Unlock
的__sync_sub_and_fetch
之间,线程占有锁,独占recursion_
和owner_
的读写操作,并拥有所有的 acquire
and release semantics.对recursion_
和owner_
使用原子操作没必要.因为在 X86/64 的平台上,__sync_add_and_fetch
生成lock xadd
的指令,保证全部的 memory barrier,也就保证 acquire and release semantics.
提到 Mutex,往往会提到 Spinlock,因为在使用 Lock 时,会遇到如何在 Mutex 与 Spinlock 之间选择.那么接下来对比一下两者.
Mutex: 如果一个线程试图获取一个 mutex,但是没有成功,因为 mutex 已经被占用, 它将进入睡眠,让其他进程运行,直到 mutex 被其他进程释放.
Spinlock: 如果一个线程试图获取一个 Spinlock, 但是没有成功,它将持续试着去获取它,直到它最终成功获取,因为它将不允许其他线程运行(然而,操作系统将强制调度其他线程).
Mutex: Mutex 将使得线程睡眠,然后再唤醒它们,两者都是开销比较大的操作,也就是 context switch 的开销.如果锁只是被其他线程占用非常短的时间,那么时间花在使的线程睡眠并唤醒它可能超过它使用 spinlock 持续获取锁的时间.
Spinlock: Spinlock 持续获取锁,浪费很多 CPU 时间,如果锁被其他线程占用很长时间,那么它将浪费很多时间,不如使得线程进入睡眠,让出 CPU.Spinlock 的确能优化 context switches 但会在没有 threads priority inversion 的平台上产生副作用.(但一个高优先级的线程自旋一个锁来等待一个低优先级的线程释放这个锁,就会造成死锁).在没有 Preemption 的 Uniprocessor,使用 spinlock 是没有意义的,当前只有一个线程运行,没有必要保护关键区域,也没有其他线程同时运行,释放锁给它.
所以在 Linux 下,Spinlock 在 kernel 这样实现:
CONFIG_SMP
和CONFIG_PREEMPT
,spinlock 实现代码是空的.CONFIG_SMP
,打开CONFIG_PREEMPT
,spinlock 仅仅是简单的关闭
preemption,足够来防止任何的
races. CONFIG_SMP
,打开CONFIG_PREEMPT
,spinlock 实现如下代码,不断检查
lock 是否被其他线程释放: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
|
Criteria | Mutex | Spinlock |
---|---|---|
机制 | 尝试获取锁.若可得到就占有.若不能,就进入睡眠等待. | 尝试获取锁.若可得到就占有.若不能,持续尝试直到获取. |
什么时候使用 | 当线程进入睡眠没有伤害.或需要等待一段足够长的时间才能获取锁. | 当线程不应该进入睡眠如中断处理等.当只需等待非常短的时间就能获取锁. |
缺点 | 引起 context switch 和 scheduling 开销. | 线程不做任何事情在获取到锁前.浪费 CPU 运行. |
大多数操作系统(包括 Solaris,Mac OS X 和 FreeBSD)使用混合的机制叫”adaptive mutex”或”hybrid mutex”.一个 hybrid mutex 首先行为和 spinlock 一样,如果不能获取锁,持续尝试获取,但过了一定的时间,它就和 mutex 一样,让线程进入睡眠.1.
http://stackoverflow.com/questions/5869825/when-should-one-use-a-spinlock-instead-of-mutex↩
在我们编写的 C/C++代码和它被在 CPU 上运行,按照一些规则,代码的内存交互会被乱序.内存乱序同时由编译器(编译时候)和处理器(运行时)造成,都为了使代码运行的更快.
被编译开发者和处理器制造商遵循的中心内存排序准则是:
不能改变单线程程序的行为.
因为这条规则,在写单线程代码时内存乱序被普遍忽略.即使在多线程程序中,它也被时常忽略,因为有 mutexes,semaphores 等来防止它们调用中的内存乱序.仅当 lock-free 技术被使用时,内存在不受任何互斥保护下被多个线程共享,内存乱序的影响能被看到.
下面先比较 Weak 和 Strong 的内存模型,然后分两部分,实际内存乱序如何在编译和运行时发生,并如何防止它们.
Jeff Preshing 在 Weak vs. Strong Memory Models 中很好的总结了从 Weak 到 Strong 的类型:
非常弱 | 数据依赖性的弱 | 强制 | 顺序一致 |
---|---|---|---|
DEC Alpha | ARM | X86/64 | dual 386 |
C/C++11 low-level atomics | PowerPC | SPARC TSO | Java volatile/C/C++11 atomics |
在最弱的内存模型中,可能经历所有四种内存乱序 (LoadLoad, StoreStore, LoadStore and StoreLoad).任何 load 或 store 的操作能与任何的其他的 load 或 store 操作乱序,只要它不改变一个独立进程的行为.实际中,这样的乱序由于编译器引起的指令乱序或处理器本身处理指令的乱序.
当处理器是弱硬件内存模式,通常称它为 weakly-ordered 或 weak ordering.或说它有 relaxed memory model. DEC Alpha 是 最具代表 的弱排序的处理器.
C/C++的底层原子操作也呈现弱内存模型,无论代码的平台是如 x86/64 的强序处理器.下面章节 Memory ordering at compile time 会演示其弱内存模型,并说明如何强制内存顺序来保护编译器乱序.
ARM 和 PowerPC 系列的处理器内存模型和 Alpha 同样弱,除了它们保持
data dependency ordering.它意味两个相依赖的load
(load A, load B<-A)被保证顺序load B<-A
总能在
load A
之后.(A data dependency barrier is a partial ordering on interdependent loads only; it is not required to have any effect on stores, independent loads or overlapping loads.)
弱和强内存模型区别存在分歧.Preshing 总结的定义是:
一个强硬件内存模型是在这样的硬件上每条机器指令隐性的保证 acquire and release
semantics 的执行.因此,当一个 CPU 核进行了一串写操作,每个其他的 CPU 核看到这些值的改变顺序与其顺序一致.
所以也就是保证了四种内存乱序 (LoadLoad, StoreStore, LoadStore and StoreLoad) 中的 3 种,除了不保证 StoreLoad 的顺序.基于以上的定义,x86/64 系列处理器基本就是强顺序的.之后 Memory ordering at processor time 可以看到 StoreLoad 在 X86/64 的乱序实验.
在顺序一致 (Sequential consistency) 的内存模型中,没有内存乱序存在.
如今,很难找到一个现代多核设备保证在硬件层 Sequential consistency.也就早期的 386 没有强大到能在运行时进行任何内存的乱序.
当用上层语言编程时,Sequential consistency 成为一个重要的软件内存模型.Java5 和之后版本,用volatile
声明共享变量.在 C++11 中,可以使用默认的顺序约束memory_order_seq_cst
在做原子操作时.当使用这些术语后,编译器会限制编译乱序和插入特定 CPU 的指令来指定合适的 memory barrier 类型.
看如下代码:
1 2 3 4 5 |
|
不打开编译器的优化,把它编译成汇编,我们可以看到,B
的赋值在A
的后面,和原程序的顺序一样.
1 2 3 4 5 6 |
|
用O2
打开优化:
1 2 3 4 5 6 |
|
这次编译器把B
的赋值提到A
的前面.为什么它可以这么做呢?内存顺序的中心没有破坏.这样的改变并不影响单线程程序,单线程程序不能知道这样的区别.
但是当编写 lock-free 代码时,这样的编译器乱序就会引起问题.看如下例子,一个共享的标识来表明其他共享数据是否更新:
1 2 3 4 5 6 |
|
如果编译器把update
的赋值提到value
赋值的前面.即使在单核处理器系统中,会有问题:在两个参数赋值的中间这个线程被中断,使得另外的程序通过update
判断以为value
的值已经得到更新,实际上却没有.
一种方法是用一个特殊的被称为 Compiler Barrier 的指令来防止编译器优化的乱序.以下
asm volative
是 GCC 中的方法.
1 2 3 4 5 6 |
|
经过这样的修改,打开优化,B
的存储将保持在要求的顺序上.
1 2 3 4 5 6 |
|
在 C++11 中原子库中,每个不是 relaxed 的原子操作同时是一个 compiler barrier.
1 2 3 4 5 6 7 |
|
每一个拥有 compiler barrier 的函数本身也是一个 compiler barrier,即使它是 inline 的.
1 2 3 4 5 6 7 |
|
进一步推知,大多数被调用的函数是一个 compiler barrier.无论它们是否包含
memory barrier.排除 inline 函数,被声明为pure attribution
或当
link-time code generation
使用时.因为编译器在编译时,并不知道UpdateValue
的运行是否依赖于a
或会改变a
的值从而影响b
,所以编译器不会乱序它们之间的顺序.
可以看到,有许多隐藏的规则禁止编译指令的乱序,也防止了编译器多进一步的代码优化,所以在某些场景 Why the “volatile” type class should not be used, 来让编译器进一步优化.
有隐形的 Compiler Barriers,同样 GCC 编译器也有无缘由的存储.来自这里的实例:
1 2 3 4 5 6 7 8 |
|
在 i686,GCC 3.3.4–4.3.0 用O1
编译得到:
1 2 3 4 5 6 7 8 |
|
在单线程中,没有问题,但多线程中调用f(0)
仅仅只是读取 v 的值,但中断后回去覆盖其他线程修改的值.引起
data race.在新的 C++11 标准中明确禁止了这样的行为,参见 C++11 标准 draft 的 §1.10.22 节:
Compiler transformations that introduce assignments to a potentially shared memory location that would not be modified by the abstract machine are generally precluded by this standard.
看一个简单的 CPU 乱序的简单例子,即使在强内存模型的 X86/64 也能看到.有两个整数X
和Y
初始是 0,另外两个变量 r1 和 r2 读取它们的值,两个线程并行运行,执行如下的机器代码:
每个线程存储 1 到一个共享变量,然后把对方变量读取到一个变量或一个寄存器中.无论哪个线程先写 1 到内存,另外个线程读回那个值,意味着最后 r1=1 或 r2=1 或两者都是.但是 X86/64 是强内存模型,它还是允许乱序机器指令.特别,每个线程允许延迟存储到读回之后.以致最后 r1 和 r2 能同时等于 0–违反直觉的一个结果.因为指令可能如下顺序执行:
写一个实例程序,实际看一下 CPU 的确乱序了指令.源码可以 Github 下载.两个读写的线程代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
|
随机的延迟被插入在存储的开始处,为了交错线程的开始时间,以来达到重叠两个线程的指令的目的.随机延迟使用线程安全的MersenneTwister
类.汇编代码asm
volatile("" ::: "memory");
如上节所述只是用来
防止编译器的乱序,
因为这里是要看 CPU 的乱序,排除编译器的乱序影响.
主线程如下,利用
POSIX 的 semaphore
同步它与两个子线程的同步.先让两个子线程等待,直到主线程初始化X=0
和
Y=0
.然后主线程等待,直到两个子线程完成操作,然后主线程检查r1
和r2
的值.所以 semaphore 防止线程见的不同步引起的内存乱序,主线程代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
|
在 Intel i5-2435M X64 的 ubuntu 下运行一下程序:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
|
差不多每 4000 次的迭代才发现一次 CPU 内存乱序.所以多线程的 bug 是多么难发现.那么如何消除这些乱序.至少有如下两种方法:
让两个子线程在同一个 CPU 核下运行,代码如下:
1 2 3 4 5 |
|
防止一个 Store 在 Load 之后的乱序,需要一个 StoreLoad 的 barrier.这里使用
mfence
的一个全部 memory barrier,防止任何类型的内存乱序.代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 |
|
先理解一些时间的概念。明白不同时间 API 测量的是什么时间。
Wall-clock time,顾名思义,墙上的钟,代表一个任务从开始到完成所经历的时间。它包含 3 部分:CPU 的时间,I/O 的时间和通信延迟的时间。但 wall-clock 很少是正确的时钟来使用,因为它随着时区,和 daylightsaving 改变,或与 NTP 同步。而这些特性没有一个是有益的,如果你用它来调度任务或做 performance benchmarking。它仅仅如名字所言,墙上的一个时钟。
CPU time 仅仅统计一个任务从开始到完成在 CPU 上所花的时间。CPU time 主要包括 User time(在 user space 所花时间)和 System time(在 kernel space 所花时间)。
以并行程序为例,CPU time 就是所有 CPU 在这个程序所花的时间总和, Wall-clock time 在这种情况可能时间相对短,它只统计任务开始到结束所花时间。
对于不同的时钟 API,主要分析如下特性:
Linux 和 OS X 的主要时钟 API:
CLOCKS_PER_SEC
是1000000
,使精度最多达到
1µs.clock_t
类型平台相关(The range and precision of times
representable in clock_t and time_t are implementation-defined.) 它
wrap around 一旦达到最大值.(通常是 32 位的类型,那么~2^32 ticks 后,还是比较长的时间.)Window 的高精度时钟:
QueryPerformanceFrequency() 和 QueryPerformanceCounter(). QueryPerformanceFrequency() 返回计数的频率,QueryPerformanceCounter()返回当前计数值.和 Linux 中 CLOCK_MONOTONIC 一样,它是一个稳定并单调递增计数器,精准达到纳秒级,并且不会 wrap around.
更多参考:
使用
clock_gettime(CLOCK_MONOTONIC,..)
作为 High Resolution Time,编译需加上参数-lrt
,实例代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
|
除了clock_gettime()
高精度时钟外,还有相对应的高精度的睡眠函数
clock_nanosleep,
实例代码如下:
1 2 3 4 5 6 7 8 9 10 |
|
clock_get_time
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
|
mach_absolute_time
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 |
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
|
http://stackoverflow.com/questions/12392278/measure-time-in-linux-getrusage-vs-clock-gettime-vs-clock-vs-gettimeofday↩
C++11 引入了新的内存模型和线程库,使得能在 C++中实现可移植的 DCLP.本文说明如何实现它.
在 单例模式(Singleton) 很好的介绍什么是 DCLP,这里稍作回顾.
线程安全的方式实现 Signleton 模式如下:
1 2 3 4 5 6 7 |
|
每次获取 Singleton 都要获取一个锁,但是实际上,我们只有当初始化 pInstance 时才需要一个锁。也就是只发生在第一次调用 instance 时。如果在一个程序运行时, instance 被调用了 n 次,我们只需要锁在第一次调用时。当我们知道那 n-1 次锁是没必要的.
DCLP 的关键点是发现,大多数 instance 的调用将看到 pInstance 是非空的,因此根本没必要去尝试初始化它。因此,DCLP 判断 pInstance 是否为空在尝试获取锁前。只有当判断成功( pInstance 还没有被初始化)才去获取锁,然后之后这个判断在此进行一次确保 pInstance 是仍然空的。(所以名字叫双重检查锁)。第二个检查是有必要的,因为从上可以看到,另外的线程可能碰巧初始化了 pInstance 在 pInstance 被第一次判断和获取锁之间。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 |
|
单例模式(Singleton) 说明了各种不安全实现的缺陷,主要原因是 1) 编译器的乱序编译 和 2) CPU 的乱序执行指令.所以安全的实现依靠 memory barrier,防止它们的乱序,使得在多线程中得到同步,C++11 之前没有可移植的 C/C++函数,但现在,C++11 有了.
使用 Acqure 和 Release Fence 来实现它,并且保证对实例pInstance
进行原子操作,把它定义为atomic
类型,并用memory_order_relaxed
操作.(Relaxed
ordering: there are no synchronization or ordering constraints, only
atomicity is required of this operation.)如下实现代码.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
|
在多核系统中,这整个代码也是稳健的,因为 memory fences 在多个线程间建立了同步的关系.Singleton::m_pInstance
作为 guard variable,singleton 变量自身成为 payload.
如果没有这层同步关系的话,就不能保证第一个线程的所有写操作(这里就是
singleton 实力的创建)被第二个线程读取到,即使m_pInstance
已经被第二个线程能看到.
write-release 能同步于一个 read-acquire.
memory_order_acquire
: A load operation with this memory order performs the acquire operation on the affected memory location: prior writes made to other memory locations by the thread that did the release become visible in this thread.
memory_order_release
: A store operation with this memory order performs the release operation: prior writes to other memory locations become visible to the threads that do a consume or an acquire on the same location.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
|
从深层分析来看,这种形式的免锁机制的同步比上面单独 memory fences 来的约束更小.这种形式的操作只意味在这个操作周围防止内存乱序,而 memory fences 意味着在一块区域内防止内存乱序.更多细节参考 preshing 的
Acquire and Release Fences Don’t Work the Way You’d Expect
的分析.
## 使用 C++11 的 Sequentially-consistent ordering
C++11 还提供了其他的方法来写 lock-free 的代码.当在 atomic 操作函数中忽略
std::memory_order
参数项,那么默认值是std::memory_order_seq_cst
,使得所有原子参数成为
sequentially consistent (SC)
原子.通过 SC 原子性,只要没有 data races,整个算法保证 sequentially consistent.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
|
SC 的原子性可能更容易理解.权衡点就是它产生的机器代码没有之前做法的高效.比如如下是 GCC 4.8.2 Intel x64 对上面代码产生的机器代码,通过g++ -O2 -std=c++11 -S
.
因为使用了 SC 原子性,对m_pInstance
的存储实现使用了mfence
指令,起到一个在 X64 上的 full memory fence.这是个更严格的指令,相对于 DCLP 在 X64 上的实际需求.一个普通的mov
足以胜任.但也无关紧要,因为mfence
指令也仅仅执行一次而已,就在创建 singleton 的实例的代码路径上.
使用 Preshing 的小型可移植的 lock-free 库,在没有 C++11 的支持下,使用它的 Mintomic Fences 实现 DCLP.
更多关于 C++11 的 multithreading 库的详解见之后的文章.
]]>A greedy algorithm suffices for correctness: we always add the lowest-weight edge linking a vertex in the tree to a vertex on the outside. (选取相邻最近的不在树内的点。)
/* Prim's minimum-spanning-tree algorithm, O(n^2) implementation.
 * Grows the tree outward from `start`, repeatedly adding the cheapest
 * edge linking a tree vertex to a non-tree vertex.  The tree is
 * reported through the global parent[] array (parent[w] = w's tree
 * predecessor).
 * NOTE(review): relies on file-level names declared elsewhere in this
 * file (parent[], MAXV, MAXINT, TRUE/FALSE, graph, edgenode). */
prim(graph *g, int start)
{
    int i;                    /* counter */
    edgenode *p;              /* temporary pointer */
    bool intree[MAXV+1];      /* is the vertex in the tree yet? */
    int distance[MAXV+1];     /* cost of adding to tree */
    int v;                    /* current vertex to process */
    int w;                    /* candidate next vertex */
    int weight;               /* edge weight */
    int dist;                 /* best current distance from start */

    for (i=1; i<=g->nvertices; i++) {
        intree[i] = FALSE;
        distance[i] = MAXINT;
        parent[i] = -1;
    }

    distance[start] = 0;
    v = start;

    /* Each pass moves exactly one vertex v into the tree. */
    while (intree[v] == FALSE) {
        intree[v] = TRUE;
        p = g->edges[v];
        /* Relax every edge out of v: a non-tree neighbor w can now be
         * attached for p->weight, which may beat its previous best. */
        while (p != NULL) {
            w = p->y;
            weight = p->weight;
            if ((distance[w] > weight) && (intree[w] == FALSE)) {
                distance[w] = weight;
                parent[w] = v;
            }
            p = p->next;
        }
        /* Linear scan for the cheapest non-tree vertex — this scan is
         * what makes the implementation O(n^2). */
        v = 1;
        dist = MAXINT;
        for (i=1; i<=g->nvertices; i++)
            if ((intree[i] == FALSE) && (dist > distance[i])) {
                dist = distance[i];
                v=i;
            }
    }
}
The algorithm repeatedly considers the lightest remaining edge and tests whether its two endpoints lie within the same connected component. (最短边)
a clever data structure called union-find, can support such queries in O(lg n) time. With this data structure, Kruskal’s algorithm runs in O(m lg m) time.
Implementation
/* Kruskal's MST algorithm: sort all edges by weight, then accept each
 * edge whose endpoints still lie in different components, merging the
 * components with union-find.  O(m log m) overall.
 * NOTE(review): depends on edge_pair, to_edge_array() and
 * weight_compare() defined elsewhere in this file.  Also note
 * same_component is called with `s` by value here while union_sets
 * takes &s — confirm against the surrounding declarations. */
kruskal(graph *g)
{
    int i;                   /* counter */
    set_union s;             /* set union data structure */
    edge_pair e[MAXV+1];     /* array of edges data structure */
    bool weight_compare();

    set_union_init(&s, g->nvertices);

    to_edge_array(g, e);

    /* sort edges by increasing cost */
    qsort(&e,g->nedges,sizeof(edge_pair),weight_compare);

    for (i=0; i<(g->nedges); i++) {
        /* Endpoints in different components => edge is in the MST. */
        if (!same_component(s,e[i].x,e[i].y)) {
            printf("edge (%d,%d) in MST\n",e[i].x,e[i].y);
            union_sets(&s,e[i].x,e[i].y);
        }
    }
}
find(i)
now equals find(j)
.
We must double the number of nodes in the tree to get an extra unit of height. How many doublings can we do before we use up all n nodes? At most, lg n doublings can be performed. Thus, we can do both unions and finds in O(log n), good enough for Kruskal’s algorithm. In fact, union-find can be done even faster, as discussed in Section 12.5.
Implementation
/* Union-find (disjoint set) structure with union by size: find() is
 * O(log n) because the smaller tree is always attached beneath the
 * larger root, keeping tree height logarithmic. */
#ifndef SET_SIZE
#define SET_SIZE 1000          /* default capacity if not defined earlier */
#endif

typedef struct {
    int p[SET_SIZE+1];         /* parent element */
    int size[SET_SIZE+1];      /* number of elements in subtree i */
    int n;                     /* number of elements in set */
} set_union;

/* Initialize n singleton sets: every element is its own root. */
void set_union_init(set_union *s, int n)
{
    int i;                     /* counter */

    for (i=1; i<=n; i++) {
        s->p[i] = i;
        s->size[i] = 1;
    }
    s->n = n;
}

/* Return the root (canonical representative) of x's set. */
int find(set_union *s, int x)
{
    if (s->p[x] == x)
        return(x);
    return( find(s, s->p[x]) );
}

/* Merge the sets containing s1 and s2, hanging the smaller tree under
 * the larger root.  (Fix: this was declared `int` yet returned no
 * value on every path — now correctly void.) */
void union_sets(set_union *s, int s1, int s2)
{
    int r1, r2;                /* roots of sets */

    r1 = find(s, s1);
    r2 = find(s, s2);

    if (r1 == r2) return;      /* already in same set */

    if (s->size[r1] >= s->size[r2]) {
        s->size[r1] = s->size[r1] + s->size[r2];
        s->p[r2] = r1;
    } else {
        s->size[r2] = s->size[r1] + s->size[r2];
        s->p[r1] = r2;
    }
}

/* Do s1 and s2 currently belong to the same set? */
bool same_component(set_union *s, int s1, int s2)
{
    return ( find(s, s1) == find(s, s2) );
}
Given a particular start vertexs, it finds the shortest path from s to every other vertex in the graph, including your desired destination t.
Implementation
/* Dijkstra's single-source shortest paths, O(n^2) implementation.
 * Same skeleton as prim(); the lines marked CHANGED relax on total
 * distance from start (distance[v]+weight) rather than raw edge
 * weight.  Correct only when no edge weight is negative; parent[] is
 * the global shortest-path-tree array. */
dijkstra(graph *g, int start)                  /* WAS prim(g,start) */
{
    int i;                    /* counter */
    edgenode *p;              /* temporary pointer */
    bool intree[MAXV+1];      /* is the vertex in the tree yet? */
    int distance[MAXV+1];     /* distance vertex is from start */
    int v;                    /* current vertex to process */
    int w;                    /* candidate next vertex */
    int weight;               /* edge weight */
    int dist;                 /* best current distance from start */

    for (i=1; i<=g->nvertices; i++) {
        intree[i] = FALSE;
        distance[i] = MAXINT;
        parent[i] = -1;
    }

    distance[start] = 0;
    v = start;

    while (intree[v] == FALSE) {
        intree[v] = TRUE;
        p = g->edges[v];
        while (p != NULL) {
            w = p->y;
            weight = p->weight;
            /* CHANGED */
            if (distance[w] > (distance[v]+weight)) {  /* CHANGED */
                distance[w] = distance[v]+weight;      /* CHANGED */
                parent[w] = v;
            }
            p = p->next;
        }

        /* Linear scan for the closest non-tree vertex. */
        v=1;
        dist = MAXINT;
        for (i=1; i<=g->nvertices; i++)
            if ((intree[i] == FALSE) && (dist > distance[i])) {
                dist = distance[i];
                v=i;
            }
    }
}
As implemented here, the complexity is O(n2).
Dijkstra works correctly only on graphs without negative-cost edges. The reason is that midway through the execution we may encounter an edge with weight so negative that it changes the cheapest way to get from s to some other vertex already in the tree.
/* Weighted graph stored as an adjacency matrix.  weight[i][j] is the
 * weight of edge (i,j); per the accompanying text, non-edges must be
 * initialized to MAXINT (not 0) so they never read as free rides. */
typedef struct {
    int weight[MAXV+1][MAXV+1];   /* adjacency/weight info */
    int nvertices;                /* number of vertices in graph */
} adjacency_matrix;
The critical issue in an adjacency matrix implementation is how we denote the edges absent from the graph. A common convention for unweighted graphs denotes graph edges by 1 and non-edges by 0. This gives exactly the wrong interpretation if the numbers denote edge weights, for the non-edges get interpreted as a free ride between vertices. Instead, we should initialize each non-edge to MAXINT.
/* Floyd-Warshall all-pairs shortest paths, O(n^3).
 * After pass k, weight[i][j] is the shortest i->j distance using only
 * intermediate vertices drawn from {1..k}.  Non-edges are expected to
 * be pre-initialized to MAXINT by the caller (see surrounding text).
 * Fix: the two legs are now summed in 64-bit, so adding two non-edge
 * sentinels no longer overflows signed int (undefined behavior). */
void floyd(adjacency_matrix *g)
{
    int i, j;               /* dimension counters */
    int k;                  /* intermediate vertex counter */
    long long through_k;    /* distance through vertex k (wide: no overflow) */

    for (k=1; k<=g->nvertices; k++)
        for (i=1; i<=g->nvertices; i++)
            for (j=1; j<=g->nvertices; j++) {
                through_k = (long long) g->weight[i][k] + g->weight[k][j];
                if (through_k < g->weight[i][j])
                    g->weight[i][j] = (int) through_k;
            }
}
The Floyd-Warshall all-pairs shortest path runs in O(n3) time, which is asymptotically no better thanncalls to Dijkstra’s algorithm. However, the loops are so tight and the program so short that it runs better in practice.
“We can get good word-use frequencies and grammatical information from a big text database called the Brown Corpus. It contains thousands of typical English sentences, each parsed according to parts of speech. But how do we factor it all in?” Harald asked.
Each possible sentence interpretation can be thought of as a path in a graph. The vertices of this graph are the complete set of possible word choices. There will be an edge from each possible choice for the ith word to each possible choice for the (i + 1)st word. The cheapest path across this graph defines the best interpretation of the sentence.
Perhaps we can count how often that pair of words occurred together in previous texts. Or we can weigh them by the part of speech of each word. Maybe nouns don’t like to be next to nouns as much as they like being next to verbs.
We can pay a cost for walking through a particular vertex that depends upon the frequency of the word. Our best sentence will be given by the shortest path across the graph.
The constraints for many pattern recognition problems can be naturally formulated as shortest path problems in graphs. In fact, there is a particularly convenient dynamic programming solution for these problems (the Viterbi algorithm). Despite the fancy name, the Viterbi algorithm is basically solving a shortest path problem on a DAG.
The network flow problem asks for the maximum amount of flow which can be sent from vertices s to t in a given weighted graph G while respecting the maximum capacities of each pipe.
The largest bipartite matching can be readily found using network flow. Create a source node s that is connected to every vertex in L by an edge of weight 1. Create a sink node t and connect it to every vertex in R by an edge of weight 1. Finally, assign each edge in the bipartite graph G a weight of 1. Now, the maximum possible flow from s to t defines the largest matching in G.
The key structure is the residual flow graph, denoted as R(G, f), where Gis the input graph andfis the current flow through G.
The maximum flow from s to t always equals the weight of the minimum s-t cut. Thus, flow algorithms can be used to solve general edge and vertex connectivity problems in graphs.
Implementation
typedef struct { int v; /* neighboring vertex */ int capacity; /* capacity of edge */ int flow; /* flow through edge */ int residual; /* residual capacity of edge */ struct edgenode *next; /* next edge in list */ } edgenode; netflow(flow_graph *g, int source, int sink) { int volume; /* weight of the augmenting path */ add_residual_edges(g); initialize_search(g); bfs(g,source); volume = path_volume(g, source, sink, parent); while (volume > 0) { augment_path(g,source,sink,parent,volume); initialize_search(g); bfs(g,source); volume = path_volume(g, source, sink, parent); } } bool valid_edge(edgenode *e) { if (e->residual > 0) return (TRUE); else return(FALSE); } int path_volume(flow_graph *g, int start, int end, int parents[]) { edgenode *e; /* edge in question */ edgenode *find_edge(); if (parents[end] == -1) return(0); e = find_edge(g,parents[end],end); if (start == parents[end]) return(e->residual); else return( min(path_volume(g,start,parents[end],parents), e->residual) ); } edgenode *find_edge(flow_graph *g, int x, int y) { edgenode *p; /* temporary pointer */ p = g->edges[x]; while (p != NULL) { if (p->v == y) return(p); p = p->next; } return(NULL); } augment_path(flow_graph*g,intstart,intend,intparents[],intvolume) { edgenode *e; /* edge in question */ edgenode *find_edge(); if (start == end) return; e = find_edge(g,parents[end],end); e->flow += volume; e->residual -= volume; e = find_edge(g,end,parents[end]); e->residual += volume; augment_path(g,start,parents[end],parents,volume); }
Edmonds and Karp [EK72] proved that always selecting ashortest unweighted augmenting path guarantees that O(n3) augmentations suffice for optimization.
The secret is learning to design graphs, not algorithms. We have already seen a few instances of this idea:
Problem: “In my graphics work I need to solve the following problem. Given an arbitrary set of rectangles in the plane, how can I distribute them into a minimum number of buckets such that no subset of rectangles in any given bucket intersects another? In other words, there can not be any overlapping area between two rectangles in the same bucket.”
Solution: We formulate a graph where each vertex is a rectangle, and there is an edge if two rectangles intersect. Each bucket corresponds to anindependent set of rectangles, so there is no overlap between any two. Avertex coloringof a graph is a partition of the vertices into independent sets, so minimizing the number of colors is exactly what you want.
Problem:“In porting code from UNIX to DOS, I have to shorten several hundred file names down to at most 8 characters each. I can’t just use the first eight characters from each name, because “filename1” and “filename2” would be assigned the exact same name. How can I meaningfully shorten the names while ensuring that they do not collide?”
Solution: Construct a bipartite graph with vertices corresponding to each original file namefi for 1≤i≤n, as well as a collection of acceptable shortenings for each name fi1,…,fik. Add an edge between each original and shortened name. We now seek a set of n edges that have no vertices in common, so each file name is mapped to a distinct acceptable substitute. Bipartite matching, discussed in Section 15.6 (page 498), is exactly this problem of finding an independent set of edges in a graph.
Problem: “We need a way to separate the lines of text in the optical characterrecognition system that we are building. Although there is some white space between the lines, problems like noise and the tilt of the page makes it hard to find. How can we do line segmentation?
Solution: Consider the following graph formulation. Treat each pixel in the image as a vertex in the graph, with an edge between two neighboring pixels. The weight of this edge should be proportional to how dark the pixels are. A segmentation between two lines is a path in this graph from the left to right side of the page. We seek a relatively straight path that avoids as much blackness as possible. This suggests that theshortest pathin the pixel graph will likely find a good line segmentation.
Is the path between two vertices in a minimum spanning tree necessarily a shortest path between the two vertices in the full graph? Give a proof or a counterexample.
Assume that all edges in the graph have distinct edge weights (i.e. , no pair of edges have the same weight). Is the path between a pair of vertices in a minimum spanning tree necessarily a shortest path between the two vertices in the full graph? Give a proof or a counterexample.
不必要. 如下图,若 a 是 6 的话,minimum spanning tree 不会选择 a,但 A 和 C 间的最短路径会选择 a.
Can Prim’s and Kruskal’s algorithm yield different minimum spanning trees? Explain why or why not.
能.当有相同 weight 的边.
当所有边的 weight 不同时,图存在唯一的 minimum spanning trees,两者生成同样的树.
Does either Prim’s and Kruskal’s algorithm work if there are negative edge weights? Explain why or why not.
可以.Prim 每次选相邻最近的不在树内的点,有负 weight 的边并不影响它. 而 Kruskal 每次选最短的边,同样不受影响.
Suppose we are given the minimum spanning tree T of a given graph G (with n vertices and m edges) and a new edge e = (u,v) of weight w that we will add to G. Give an efficient algorithm to find the minimum spanning tree of the graph G + e. Your algorithm should run in O(n) time to receive full credit.
新添加的 e 在顶点 u 和 v 中间,原本的 MST 中 u 和 v 通过 u->a1->ai->v,把此路径的边与 e 比较,用 Prim 算法选最临近点.
(a) Let T be a minimum spanning tree of a weighted graph G. Construct a new graph G′ by adding a weight of k to every edge of G. Do the edges of T form a minimum spanning tree of G′? Prove the statement or give a counterexample.
(b) Let P = {s, … , t} describe a shortest weighted path between vertices s and t of a weighted graph G. Construct a new graph G′ by adding a weight of k to every edge of G. Does P describe a shortest path from s to t in G′? Prove the statement or give a counterexample.
(a)和(b)都对,并没有改变边之间的比较关系.
Devise and analyze an algorithm that takes a weighted graph G and finds the smallest change in the cost to a non-MST edge that would cause a change in the minimum spanning tree of G. Your algorithm must be correct and run in polynomial time.
总共边数 m,算法复杂度 O(m2).
Consider the problem of finding a minimum weight connected subset T of edges from a weighted connected graph G. The weight of T is the sum of all the edge weights in T.
MST 不能有环路,minimum weight connected subset T 可以有环路,所以如果一条负数 weight 的边,不在 MST 中,但却包含在 T 中,因为它能使 T 的总权值减小.
sort(edges); c := n; for edge in edges: if edge.weight < 0: if find(edge.firstEnd) != find(edge.secondEnd): --c; unite(edge.firstEnd, edge.secondEnd); else: if c == 1: break; if find(edge.firstEnd) != find(edge.secondEnd): unite(edge.firstEnd, edge.secondEnd); --c;
Let G=(V,E) be an undirected graph. A set F⊆E of edges is called a feedback-edge set if every cycle of G has at least one edge in F.
Modify Prim’s algorithm so that it runs in time O(nlogk) on a graph that has only k different edges costs.
Devise an efficient data structure to handle the following operations on a weighted directed graph:
使用 Union-Find 并添加 minimum edge.
/* Union-find augmented for the exercise above: alongside parent and
 * subtree size, each set also records a minedge[] entry (per the note,
 * the minimum edge tracked for that component — maintained by the
 * caller on union). */
typedef struct {
    int p[SET_SIZE+1];         /* parent element */
    int size[SET_SIZE+1];      /* number of elements in subtree i */
    int minedge[SET_SIZE+1];   /* minimum edge recorded for each set */
    int n;                     /* number of elements in set */
} set_union;
The single-destination shortest path problem for a directed graph seeks the shortest path from every vertex to a specified vertex v. Give an efficient algorithm to solve the single-destination shortest paths problem.
用 Floyd-Warshall 对于顶点 v 反向更新距离值.得到最终 shortest paths.
Let G be a weighted directed graph with n vertices and m edges, where all edges have positive weight. A directed cycle is a directed path that starts and ends at the same vertex and contains at least one edge. Give an O(n3) algorithm to find a directed cycle in G of minimum total weight. Partial credit will be given for an O(n2m) algorithm.
run Floyd Warshall on the graph min <- MAX_INT vertex <- None for each pair of vertices u,v if (dist(u,v) + dist(v,u) < min): min <- dist(u,v) + dist(v,u) pair <- (u,v) return path(u,v) + path(v,u)
Can we modify Dijkstra’s algorithm to solve the single-source longest path problem by changing minimum to maximum? If so, then prove your algorithm correct. If not, then provide a counterexample.
不可以.即使所有边权为正,把 Dijkstra 的取最小改为取最大也不正确:顶点一旦加入树中,其"最长距离"就被固定,但之后可能存在经过更多顶点的更长路径(最长路径问题不具备这种贪心最优子结构,一般图上它是 NP-hard 的).反例:边 s->t 权 2, s->a 权 1, a->t 权 3,算法先固定 t 的距离为 2,而实际最长路径 s->a->t 长度为 4.
LetG=(V,E) be a weighted acyclic directed graph with possibly negative edge weights. Design a linear-time algorithm to solve the single-source shortest-path problem from a given source v.
for each vertex y in a topological ordering of G choose edge (x,y) minimizing d(s,x)+length(x,y) path(s,y) = path(s,x) + edge (x,y) d(s,y) = d(s,x) + length(x,y)
Let G=(V,E) be a directed weighted graph such that all the weights are positive. Let v and w be two vertices in G and k≤|V| be an integer. Design an algorithm to find the shortest path from v to w that contains exactly k edges. Note that the path need not be simple.
create the table D[V,k]; D[v,1] = 0; for i in other vertex except v: D[i,1] = MAX_INT; for m=2 to k: for every edge(i,j): D[j,m] = D[i,m-1] + D[i,j] P[i,m] = i Path = emtpy list i = w for m=k down to 1: Path.append(m); i = P[m,k] Path.append(V); Path.reverse();
Arbitrage is the use of discrepancies in currency-exchange rates to make a profit. For example, there may be a small window of time during which 1 U.S. dollar buys 0.75 British pounds, 1 British pound buys 2 Australian dollars, and 1 Australian dollar buys 0.70 U.S. dollars. At such a time, a smart trader can trade one U.S. dollar and end up with 0.75 × 2 × 0.7 = 1.05 U.S. dollars—a profit of 5%. Suppose that there are n currencies c1 , …, cn and an n × n table R of exchange rates, such that one unit of currency ci buys R[i,j] units of currency cj. Devise and analyze an algorithm to determine the maximum value of R[c1, ci1] · R[ci1, ci2] · · · R[cik−1, cik] · R[cik, c1]
log(a*b*c) = log a + log b + log c.所以对汇率取对数后,求乘积最大等价于求对数和最大,即求最长路径.
Adjacency lists are the right data structure for most applications of graphs.
Adjacency Lists
#define MAXV 1000    // maximum number of vertices

// Edge in an adjacency list.  (Fix: the struct now carries a tag so
// the self-referential `next` pointer names this same type.)
typedef struct edgenode {
    int y;                     // adjacency info
    int weight;                // edge weight, if any
    struct edgenode *next;     // next edge in list
} edgenode;

// Graph as an array of adjacency lists; vertices numbered 1..nvertices.
typedef struct {
    edgenode *edges[MAXV + 1]; // adjacency info
    int degree[MAXV + 1];      // outdegree of each vertex
    int nvertices;             // number of vertices in graph
    int nedges;                // number of edges in graph
    bool directed;             // is the graph directed
} graph;

// Reset the graph to empty.  (Fix: the loop bound was the undefined
// name NMAX; it must clear all MAXV adjacency slots.)
void initialize_graph(graph *g, bool directed) {
    int i;

    g->nvertices = 0;
    g->nedges = 0;
    g->directed = directed;

    for (i = 1; i <= MAXV; ++i) {
        g->degree[i] = 0;
        g->edges[i] = NULL;
    }
}

// Prepend edge (x,y) to x's adjacency list.  An undirected edge is
// inserted in both directions but counted only once in nedges.
void insert_edge(graph *g, int x, int y, bool directed) {
    edgenode *p;

    p = new edgenode;
    p->weight = 0;
    p->y = y;
    p->next = g->edges[x];

    g->edges[x] = p;
    g->degree[x]++;

    if (directed == false) {
        insert_edge(g, y, x, true);
    } else {
        g->nedges++;
    }
}

// Read "nvertices nedges" and then one "x y" pair per edge from stdin.
// (Fix: the per-edge format string said "$d %d", so x was never read.)
// NOTE(review): scanf return values are still unchecked — confirm
// whether input validation is wanted here.
void read_graph(graph *g, bool directed) {
    int i;
    int m;        // number of edges
    int x, y;     // edge endpoints

    initialize_graph(g, directed);

    scanf("%d %d", &(g->nvertices), &m);

    for (i = 1; i <= m; ++i) {
        scanf("%d %d", &x, &y);
        insert_edge(g, x, y, directed);
    }
}

// Dump each vertex's adjacency list, one line per vertex.
void print_graph(graph *g) {
    int i;
    edgenode *p;

    for (i = 1; i <= g->nvertices; ++i) {
        printf("%d: ", i);
        p = g->edges[i];
        while (p != NULL) {
            printf("%d ", p->y);
            p = p->next;
        }
        printf("\n");
    }
}
The key idea behind graph traversal is to mark each vertex when we first visit it and keep track of what we have not yet completely explored. Although bread crumbs or unraveled threads have been used to mark visited places in fairy-tale mazes, we will rely on Boolean flags or enumerated types.
Each vertex will exist in one of three states:
先遍历完一个点的所有相邻点。
/* --- Breadth-first search scaffolding --- */

bool processed[MAXV+1];    /* which vertices have been processed */
bool discovered[MAXV+1];   /* which vertices have been found */
int parent[MAXV+1];        /* discovery relation */

/* Reset all traversal state before starting a new search. */
initialize_search(graph *g)
{
    int i;                 /* counter */

    for (i=1; i<=g->nvertices; i++) {
        processed[i] = discovered[i] = FALSE;
        parent[i] = -1;
    }
}

/* Breadth-first search from `start`.  Vertices are discovered in order
 * of increasing distance from the root, so on an unweighted graph
 * parent[] encodes a fewest-edges shortest-path tree.
 * process_vertex_early/late and process_edge are user hooks supplied
 * elsewhere in this file. */
bfs(graph *g, int start)
{
    queue q;               /* queue of vertices to visit */
    int v;                 /* current vertex */
    int y;                 /* successor vertex */
    edgenode *p;           /* temporary pointer */

    init_queue(&q);
    enqueue(&q,start);
    discovered[start] = TRUE;

    while (empty_queue(&q) == FALSE) {
        v = dequeue(&q);
        process_vertex_early(v);
        processed[v] = TRUE;
        p = g->edges[v];
        while (p != NULL) {
            y = p->y;
            /* In an undirected graph each edge appears twice; process
             * it only the first time (always, if directed). */
            if ((processed[y] == FALSE) || g->directed)
                process_edge(v,y);
            if (discovered[y] == FALSE) {
                enqueue(&q,y);
                discovered[y] = TRUE;
                parent[y] = v;
            }
            p = p->next;
        }
        process_vertex_late(v);
    }
}

/* Print the start..end path by walking parents[] back from end;
 * prints just `start` when end is unreachable (parent == -1). */
find_path(int start, int end, int parents[])
{
    if ((start == end) || (end == -1))
        printf("\n%d",start);
    else {
        find_path(start,parents[end],parents);
        printf(" %d",end);
    }
}
Because vertices are discovered in order of increasing distance from the root, this tree has a very important property. The unique tree path from the root to each node x∈V uses the smallest number of edges (or equivalently, intermediate nodes) possible on any root-to-xpath in the graph.
There are two points to remember when using breadth-first search to find a shortest path fromxtoy: First, the shortest path tree is only useful if BFS was performed with x as the root of the search. Second, BFS gives the shortest path only if the graph is unweighted.
Properly implemented using adjacency lists, any such algorithm is destined to be linear, since BFS runs in O(n+m) time on both directed and undirected graphs. This is optimal, since it is as fast as one can hope to read any n-vertex, m-edge graph.
The difference between BFS and DFS results is in the order in which they explore vertices. This order depends completely upon the container data structure used to store the discovered but not processed vertices.
DFS organizes vertices by entry/exit times, and edges into tree and back edges. This organization is what gives DFS its real power.
Implementation
The beauty of implementingdfsrecursively is that recursion eliminates the need to keep an explicit stack:
/* Recursive depth-first search from v — recursion replaces the
 * explicit stack.  Stamps global entry_time/exit_time, records
 * parent[], and dispatches to the process_vertex_early/late and
 * process_edge user hooks.  The global `finished` flag allows the
 * hooks to terminate the search early. */
dfs(graph *g, int v)
{
    edgenode *p;    /* temporary pointer */
    int y;          /* successor vertex */

    if (finished) return;   /* allow for search termination */

    discovered[v] = TRUE;
    time = time + 1;
    entry_time[v] = time;

    process_vertex_early(v);

    p = g->edges[v];
    while (p != NULL) {
        y = p->y;
        if (discovered[y] == FALSE) {
            /* Tree edge: first discovery of y — recurse. */
            parent[y] = v;
            process_edge(v,y);
            dfs(g,y);
        } else if ((!processed[y]) || (g->directed))
            /* Non-tree edge: back edge in an undirected graph, any
             * non-tree class in a directed one. */
            process_edge(v,y);

        if (finished) return;

        p = p->next;
    }

    process_vertex_late(v);

    time = time + 1;
    exit_time[v] = time;

    processed[v] = TRUE;
}
But any back edge going from x to an ancestorycreates a cycle with the tree path fromytox. Such a cycle is easy to find using dfs:
/* DFS edge hook for finding a cycle in an undirected graph: any edge
 * from x to an already-discovered vertex other than x's DFS parent is
 * a back edge, closing a cycle with the tree path from y down to x.
 * NOTE(review): the parent[x] != y test also suppresses a genuine
 * two-edge cycle formed by parallel edges — assumes a simple graph;
 * confirm. */
process_edge(int x, int y)
{
    if (parent[x] != y) {   /* found back edge! */
        printf("Cycle from %d to %d:",y,x);
        find_path(y,x,parent);
        printf("\n\n");
        finished = TRUE;    /* stop the search after the first cycle */
    }
}
Observe that there is a single point of failure—a single vertex whose deletion disconnects a connected component of the graph. Such a vertex is called an articulation vertex or cut-node.
More robust graphs without such a vertex are said to be biconnected.
Temporarily delete each vertex v, and then do a BFS or DFS traversal of the remaining graph to establish whether it is still connected. The total time fornsuch traversals is O(n(m+n)). There is a clever linear-time algorithm, however, that tests all the vertices of a connected graph using a single depth-first search.
Let reachable_ancestor[v]
denote the earliest reachable ancestor of
vertex v, meaning the oldest ancestor ofvthat we can reach by a
combination of tree edges and back edges. Initially,
reachable_ancestor[v] = v
:
int reachable_ancestor[MAXV+1];   /* earliest reachable ancestor of v */
int tree_out_degree[MAXV+1];      /* DFS tree outdegree of v */

/* On discovery, each vertex starts as its own earliest reachable
 * ancestor; back edges found later may improve this. */
process_vertex_early(int v)
{
    reachable_ancestor[v] = v;
}
We update reachable_ancestor[v]
whenever we encounter a back edge
that takes us to an earlier ancestor than we have previously seen. The
relative age/rank of our ancestors can be determined from
their entry_time’s
:
/* DFS edge hook for articulation-vertex detection: count tree edges
 * per vertex, and on a back edge check whether it reaches an older
 * ancestor (smaller entry_time) than anything previously seen from x. */
process_edge(int x, int y)
{
    int class;    /* edge class */

    class = edge_classification(x,y);

    if (class == TREE)
        tree_out_degree[x] = tree_out_degree[x] + 1;

    /* Ignore the trivial "back edge" to x's own DFS parent. */
    if ((class == BACK) && (parent[x] != y)) {
        if (entry_time[y] < entry_time[ reachable_ancestor[x] ] )
            reachable_ancestor[x] = y;
    }
}
The key issue is determining how the reachability relation impacts whether vertexv is an articulation vertex. There are three cases:
The routine below systematically evaluates each of the three
conditions as we back up from the vertex after traversing all outgoing
edges. We use entry_time[v]
to represent the age of vertex v. The
reachability time time_v
calculated below denotes the oldest vertex that
can be reached using back edges.
/* After all of v's edges are explored, evaluate the three articulation
 * cases (root, parent, bridge) and propagate v's earliest reachable
 * ancestor up to its parent. */
process_vertex_late(int v)
{
    bool root;          /* is the vertex the root of the DFS tree? */
    int time_v;         /* earliest reachable time for v */
    int time_parent;    /* earliest reachable time for parent[v] */

    if (parent[v] < 1) {   /* test if v is the root */
        /* Root cut-node: the root is an articulation vertex iff it has
         * two or more DFS subtrees. */
        if (tree_out_degree[v] > 1)
            printf("root articulation vertex: %d \n",v);
        return;
    }

    root = (parent[parent[v]] < 1);   /* is parent[v] the root? */

    /* Parent cut-node: v's subtree reaches no higher than its parent.
     * The root is excluded — it was handled by the case above. */
    if ((reachable_ancestor[v] == parent[v]) && (!root))
        printf("parent articulation vertex: %d \n",parent[v]);

    /* Bridge cut-node: no back edge escapes v's subtree at all, so
     * edge (parent[v],v) is a bridge; v itself is also a cut vertex
     * unless it is a leaf. */
    if (reachable_ancestor[v] == v) {
        printf("bridge articulation vertex: %d \n",parent[v]);

        if (tree_out_degree[v] > 0)   /* test if v is not a leaf */
            printf("bridge articulation vertex: %d \n",v);
    }

    time_v = entry_time[reachable_ancestor[v]];
    time_parent = entry_time[ reachable_ancestor[parent[v]] ];

    /* Hand the best (oldest) reachable ancestor up the DFS tree. */
    if (time_v < time_parent)
        reachable_ancestor[parent[v]] = reachable_ancestor[v];
}
We can alternately talk about reliability in terms of edge failures instead of vertex failures.
In fact
all bridges can be identified in the same O(n+m) time. Edge (x, y) is a
bridge if (1) it is a tree edge, and (2) no back edge connects from
yor below toxor above. This can be computed with a minor modification
of the reachable_ancestor
function.
For directed graphs, depth-first search labelings can take on a wider range of possibilities. Indeed, all four of the edge cases in Figure below can occur in traversing directed graphs.
The correct labeling of each edge can be readily determined from the state, discovery time, and parent of each vertex, as encoded in the following function:
/* Classify directed-DFS edge (x,y) as TREE, BACK, FORWARD, or CROSS
 * from y's traversal state, parent, and entry time (the four cases of
 * directed-graph DFS edge labeling discussed above).
 * Fix: the function previously fell off the end after printing the
 * warning — undefined behavior for an int-returning function — so an
 * explicit sentinel return was added. */
int edge_classification(int x, int y)
{
    if (parent[y] == x) return(TREE);
    if (discovered[y] && !processed[y]) return(BACK);
    if (processed[y] && (entry_time[y]>entry_time[x])) return(FORWARD);
    if (processed[y] && (entry_time[y]<entry_time[x])) return(CROSS);

    printf("Warning: unclassified edge (%d,%d)\n",x,y);
    return(-1);   /* sentinel: matches no valid edge class */
}
A directed graph isstrongly connectedif there is a directed path between any two vertices.
The algorithm is based on the observation that it is easy to find a directed cycle using a depth-first search, since any back edge plus the down path in the DFS tree gives such a cycle. All vertices in this cycle must be in the same strongly connected component. Thus, we can shrink (contract) the vertices on this cycle down to a single vertex representing the component, and then repeat. This process terminates when no directed cycle remains, and each vertex represents a different strongly connected component.
We update our notion of the oldest reachable vertex in response to (1) nontree edges and (2) backing up from a vertex.
/* Driver for strongly connected components: reset low[]/scc[], then
 * DFS from every undiscovered vertex; the hooks below
 * (process_vertex_early/late, process_edge) do the component work.
 * NOTE(review): initialize_search(&g) passes the address of a graph
 * pointer where other call sites pass the pointer itself — confirm
 * against the original source. */
strong_components(graph *g)
{
    int i;    /* counter */

    for (i=1; i<=(g->nvertices); i++) {
        low[i] = i;
        scc[i] = -1;
    }

    components_found = 0;
    init_stack(&active);
    initialize_search(&g);

    for (i=1; i<=(g->nvertices); i++)
        if (discovered[i] == FALSE) {
            dfs(g,i);
        }
}
Define low[v]to be the oldest vertex known to be in the same strongly connected component asv. This vertex is not necessarily an ancestor, but may also be a distant cousin of v because of cross edges. Cross edges that point vertices from previous strongly connected components of the graph cannot help us, because there can be no way back from them tov, but otherwise cross edges are fair game. Forward edges have no impact on reachability over the depth-first tree edges, and hence can be disregarded:
int low[MAXV+1];   /* oldest vertex surely in component of v */
int scc[MAXV+1];   /* strong component number for each vertex */

/* SCC edge hook: a back edge — or a cross edge into a vertex whose
 * component is not yet assigned — may expose an older vertex in x's
 * component.  Cross edges into finished components and forward edges
 * are ignored (per the discussion above, they cannot help). */
process_edge(int x, int y)
{
    int class;    /* edge class */

    class = edge_classification(x,y);

    if (class == BACK) {
        if (entry_time[y] < entry_time[ low[x] ] )
            low[x] = y;
    }

    if (class == CROSS) {
        if (scc[y] == -1)   /* component not yet assigned */
            if (entry_time[y] < entry_time[ low[x] ] )
                low[x] = y;
    }
}
A new strongly connected component is found whenever the lowest reachable vertex fromvis v. If so, we can clear the stack of this component. Otherwise, we give our parent the benefit of the oldest ancestor we can reach and backtrack:
/* Push each vertex onto the active stack as it is discovered. */
process_vertex_early(int v)
{
    push(&active,v);
}

/* On backtrack: if v is the oldest vertex reachable from itself, the
 * stack down to v forms a complete strong component; otherwise hand
 * v's oldest reachable vertex up to its DFS parent. */
process_vertex_late(int v)
{
    if (low[v] == v) {   /* edge (parent[v],v) cuts off scc */
        pop_component(v);
    }

    if (entry_time[low[v]] < entry_time[low[parent[v]]])
        low[parent[v]] = low[v];
}

/* Pop the stack down to (and including) v, labeling every popped
 * vertex with the next component number. */
pop_component(int v)
{
    int t;    /* vertex placeholder */

    components_found = components_found + 1;

    scc[ v ] = components_found;
    while ((t = pop(&active)) != v) {
        scc[ t ] = components_found;
    }
}
Give a linear algorithm to compute the chromatic number of graphs where each vertex has degree at most 2. Must such graphs be bipartite?
这样的图不必要是 bipartite 的.反例是:3 个顶点,两两相连.
因为每个顶点最多 2 度,使用 DFS 遍历,对子顶点着色与父顶点相反的颜色.当遇到一个回归的边,那么对当前定点着色与父顶点不同,并且与回归边上的祖先定点不同.
只有一次遍历,复杂度 O(m+n) (m edges, n vertices).
Given pre-order and in-order traversals of a binary tree, is it possible to reconstruct the tree? If so, sketch an algorithm to do it. If not, give a counterexample. Repeat the problem if you are given the pre-order and post-order traversals.
没有相同元素,给予 pre-order and in-order traversals 能重构 binary search tree.代码如下.若有相同元素,给予:
preorder = {1,1} inorder = {1,1}
可以重构:
1 1
/ or \
1 1
每次 preorder 的数都要去搜索在 inorder 所在位置,若树是平衡的,那么 n 个元素每次搜索后总的算法复杂度 O(nlogn),但不是平衡的,一下就变成 O(n2).
所以利用 hash table,先把 inorder 的元素和位置 hash 起来,那么总的算法时间:O(n).
以下假设元素都小于 255,简单的利用数组映射来模拟 hash table.
// Node of the binary tree being reconstructed.
struct Node {
  int val;
  struct Node* left;
  struct Node* right;
  Node(int val_in) {
    val = val_in;
    left = NULL;
    right = NULL;
  }
};

// Values are assumed to lie in [0, kMax); map_index[v] holds the position
// of value v inside the full inorder sequence, making each lookup O(1).
const int kMax = 256;
int map_index[kMax];

// Build the value -> inorder-position table.  Must be called once before
// BuildInorderPreorder.
void MapToIndex(int inorder[], int n) {
  for (int i = 0; i < n; ++i) {
    map_index[inorder[i]] = i;
  }
}

// Rebuild the tree from inorder (in[]) and preorder (pre[]) sequences of
// length n.  offset is the index within the full inorder sequence where
// in[] begins, used to translate map_index positions into local indices.
// Total O(n) thanks to the precomputed table.
// BUG FIXES vs. original: parameter type was "in in[]", "return NULL:" used
// a colon, and the right-subtree call dropped the size argument (n - i - 1).
Node *BuildInorderPreorder(int in[], int pre[], int n, int offset) {
  if (n == 0) {
    return NULL;
  }
  int root_val = pre[0];                 // preorder visits the root first
  int i = map_index[root_val] - offset;  // number of nodes in the left subtree
  Node *root = new Node(root_val);
  root->left = BuildInorderPreorder(in, pre + 1, i, offset);
  root->right = BuildInorderPreorder(in + i + 1, pre + i + 1,
                                     n - i - 1, offset + i + 1);
  return root;
}
The square of a directed graph G = (V,E) is the graph G2 = (V,E2) such that (u,w)∈E2 iff there exists v∈V, such that (u,v)∈E and (v,w)∈E; i.e., there is a path of exactly two edges from u to w. square of a graph Give efficient algorithms for both adjacency lists and matrices.
adjacency matrices 算法复杂度:O(n3).
MakeSquareGraph(G, n) for i=1 to n for j=1 to n G2[i][j] = 0 for i=1 to n for j=1 to n if (G[i][j] == 1) for k=1 to n if (G[j][k] == 1) G2[i][k] = 1 return G2
Consider a set of movies \(M_1, M_2, \ldots, M_k\). There is a set of customers, each one of which indicates the two movies they would like to see this weekend. Movies are shown on Saturday evening and Sunday evening. Multiple movies may be screened at the same time. You must decide which movies should be televised on Saturday and which on Sunday, so that every customer gets to see the two movies they desire. Is there a schedule where each movie is shown at most once? Design an efficient algorithm to find such a schedule if one exists.
把问题转换成图问题解决。建立无向图,顶点是每部电影,边 E(1,2)表示有个客户想看 M1 和 M2.如下图实例,有电影 M1-M4,3 个客户,1 个客户想看 M1 和 M3,一个客户想看 M1 和 M4,一个客户想看 M2 和 M4.那么把图分成(M1,M2)和(M3,M4),周六日各放一组,满足所有客户要求。
若多一个客户想看 M3 和 M4,如下图,无论怎么分图,都有 2 部电影相连,所以不能满足所有客户的要求。可以得出:若原本的图是 bipartite graph,那么能找到满足客户的放映安排。若不是,就不能满足客户要求。
Your job is to arrange n ill-behaved children in a straight line, facing front. You are given a list of m statements of the form i hates j. If i hates j, then you do not want put i somewhere behind j, because then i is capable of throwing something at j.
Which data structures are used in depth-first and breath-first search?
Write a function to traverse binary search tree and return the ith node in sorted order.
// Node of a binary search tree.
struct Node {
  int val;
  struct Node* left;
  struct Node* right;
  Node(int val_in) {
    val = val_in;
    left = NULL;
    right = NULL;
  }
};

// In-order walk that stops at the ith visited node (0-based).
// *index counts nodes already visited; on success *value receives the ith
// smallest key and true is returned.
// BUG FIX vs. original: removed a leftover debug "cout" line that printed
// on every visit and relied on an iostream include the block never had.
bool FindIthElementCore(struct Node *root, int ith, int *index, int *value) {
  if (root == NULL) {
    return false;
  }
  if (FindIthElementCore(root->left, ith, index, value)) {
    return true;
  }
  if (ith == *index) {
    *value = root->val;
    return true;
  }
  (*index)++;
  return FindIthElementCore(root->right, ith, index, value);
}

// Return (via *value) the ith node of the BST in sorted order, 0-based.
// Returns false if the tree has fewer than ith + 1 nodes.
bool FindIthElement(struct Node *root, int ith, int *value) {
  int start = 0;
  return FindIthElementCore(root, ith, &start, value);
}
Problem: Given an array-based heap on n elements and a real number x, efficiently determine whether the kth smallest element in the heap is greater than or equal to x. Your algorithm should be O(k) in the worst-case, independent of the size of the heap. Hint: you do not have to find the kth smallest element; you need only determine its relationship to x.
Solution: There are at least two different ideas that lead to correct but inefficient algorithms for this problem:
An O(k) solution can look at only k elements smaller than x, plus at most O(k) elements greater than x. Consider the following recursive procedure, called at the root with i= 1 with count=k:
/* Compare the kth smallest element of a min-heap against x without
   locating it.  Called at the root with i = 1 and count = k.  Returns
   how much of the "budget" of k elements smaller than x was left unused:
   0 means at least k heap elements are < x (so the kth smallest is < x);
   a positive result means fewer than k elements are < x.  Children are
   explored only beneath nodes < x, and at most k such nodes exist, so
   at most O(k) nodes are visited regardless of heap size. */
int heap_compare(priority_queue *q, int i, int count, int x)
{
    /* Budget exhausted, or walked off the bottom of the heap. */
    if ((count <= 0) || (i > q->n)) return(count);

    if (q->q[i] < x) {
        /* This node consumes one unit of budget; recurse into both
           children with whatever budget remains. */
        count = heap_compare(q, pq_young_child(i), count-1, x);
        count = heap_compare(q, pq_young_child(i)+1, count, x);
    }

    return(count);
}
If the root of the min-heap is ≥ x, then no elements in the heap can be less than x, as by definition the root must be the smallest element. This procedure searches the children of all nodes of weight smaller than x until either (a) we have found k of them, when it returns 0, or (b) they are exhausted, when it returns a value greater than zero. Thus it will find enough small elements if they exist.
But how long does it take? The only nodes whose children we look at are those < x, and there are at most k of these in total. Each has at most two children visited, so we visit at most 3k nodes, for a total time of O(k).
Mergesort is a great algorithm for sorting linked lists, because it does not rely on random access to elements as does heapsort or quicksort. Its primary disadvantage is the need for an auxilliary buffer when sorting arrays. It is easy to merge two sorted linked lists without using any extra space, by just rearranging the pointers. However, to merge two sorted arrays (or portions of an array), we need use a third array to store the result of the merge to avoid stepping on the component arrays
/* Classic binary search: return the index of key within the sorted
   slice s[low..high], or -1 when the key is absent.  O(lg n) time. */
int binary_search(item_type s[], item_type key, int low, int high)
{
    while (low <= high) {
        int middle = (low+high)/2; /* index of middle element */

        if (s[middle] == key)
            return (middle);

        if (s[middle] > key)
            high = middle - 1; /* key can only lie in the left half */
        else
            low = middle + 1;  /* key can only lie in the right half */
    }

    return (-1); /* key not found */
}
This algorithm runs in O(lgn+s), where s is the number of occurrences of the key. This can be as bad as linear if the entire array consists of identical keys. A faster algorithm results by modifying binary search to search for the boundary of the block containing k, instead of k itself. Suppose we delete the equality test
if (s[middle] == key) return(middle);
from the implementation above and return the index low
instead of
−1
on each unsuccessful search. All searches will now be
unsuccessful, since there is no equality test. The search will proceed
to the right half whenever the key is compared to an identical array
element, eventually terminating at the right boundary. Repeating the
search after reversing the direction of the binary comparison will
lead us to the left boundary. Each search takes O(lgn) time, so we can
count the occurrences in logarithmic time regardless of the size of
the block.
Now suppose we have an array A consisting of a run of 0’s, followed
by an unbounded run of 1’s, and would like to identify the exact
point of transition between them. Binary search on the array would
provide the transition point in lgn tests, if we had a bound n on the
number of elements in the array. In the absence of such
a bound, we can test repeatedly at larger intervals (A[1], A[2],
A[4], A[8], A[16],...
) until we find a first nonzero value. Now we
have a window containing the target and can proceed with binary
search. This one-sided binary search finds the transition point p using at
most 2lgp comparisons, regardless of how large the array actually is.
First, observe that the square root ofn≥1 must be at least 1 and at
most n. Let l = 1
and r = n
. Consider the midpoint of this
interval, m=(l+r)/2
. How does m2 compare to n? If n≥m2 , then the
square root must be greater than m, so the algorithm repeats with
l=m
. If n<m2 , then the square root must be less than m, so the
algorithm repeats with r=m
.
Suppose that we start with values l and r such that f(l)>0 and f(r)<0.
If f is a continuous function, there must exist a root between l and
r. Depending upon the sign of f(m), where m=(l+r)/2
, we can cut this
window containing the root in half with each test and stop soon as our
estimate becomes sufficiently accurate.
divide-and-conquer recurrences of the form T(n)=aT(n/b)+f(n)
1. If $f(n) = O(n^{\log_b a - \epsilon})$ for some constant $\epsilon > 0$, then $T(n) = \Theta(n^{\log_b a})$. The Grinch is given the job of partitioning 2n players into two teams of n players each. Each player has a numerical rating that measures how good he/she is at the game. He seeks to divide the players as unfairly as possible, so as to create the biggest possible talent imbalance between team A and team B. Show how the Grinch can do the job in O(nlogn) time.
用个 O(nlogn)的排序算法对 2n 个队根据实力排序,前 n 个作为一队,后 n 个作为一队。
For each of the following problems, give an algorithm that finds the desired numbers within the given amount of time. To keep your answers brief, feel free to use algorithms from the book as subroutines. For the example,S={6,13,19,3,8}, 19−3 maximizes the difference, while 8−6 minimizes the difference.
(a) Let S be an unsorted array of n integers. Give an algorithm that finds the pair x, y∈S that maximizes|x−y|. Your algorithm must run in O(n) worst-case time.
(b) Let S be a sorted array of n integers. Give an algorithm that finds the pair x, y∈S that maximizes |x−y|. Your algorithm must run in O(1) worst-case time.
(c) Let S be an unsorted array of n integers. Give an algorithm that finds the pair x, y∈S that minimizes |x−y|, for x ≠ y. Your algorithm must run in O(nlogn) worst-case time.
(d) Let S be a sorted array of n integers. Give an algorithm that finds the pair x, y∈S that minimizes |x−y|, for x ≠ y. Your algorithm must run in O(n) worst-case time.
Take a sequence of 2n real numbers as input. Design an O(nlogn) algorithm that partitions the numbers intonpairs, with the property that the partition minimizes the maximum sum of a pair. For example, say we are given the numbers (1,3,5,9). The possible partitions are ((1,3),(5,9)), ((1,5),(3,9)), and ((1,9),(3,5)). The pair sums for these partitions are (4,14), (6,12), and (10,8). Thus the third partition has 10 as its maximum sum, which is the minimum over the three partitions.
start = 0;
end = 2n - 1;
while (start < end) {
pair(S[start], S[end]);
start++;
end--;
Assume that we are given n pairs of items as input, where the first item is a and the second item is one of three colors (red, blue, or yellow). Further assume that the items are sorted by number. Give an O(n) algorithm to sort the items by color (all reds before all blues before all yellows) such that the numbers for identical colors stay sorted. For example: (1,blue), (3,red), (4,blue), (6,yellow), (9,red) should become (3,red), (9,red), (1,blue), (4,blue), (6,yellow).
The mode of a set of numbers is the number that occurs most frequently in the set. The set (4,6,2,4,3,1) has a mode of 4. Give an efficient and correct algorithm to compute the mode of a set of n numbers.
Given two sets S1 and S2 (each of size n), and a number x, describe an O(nlogn) algorithm for finding whether there exists a pair of elements, one from S1 and one from S2, that add up to x. (For partial credit, give a Θ(n2) algorithm for this problem.)
sort S1 in O(nlogn) sort S2 in O(nlogn) begin = 0; end = n - 1; while (begin < n && end >=0) { if ((S1[begin] + S2[end]) < X) { begin++; } else if ((S1[begin] + S2[end]) > X) { end--; } else { return true; } } return false;
Outline a reasonable method of solving each of the following problems. Give the order of the worst-case complexity of your methods.
都使用 Hash Table,O(n)
Given a set of S containing n real numbers, and a real number x. We seek an algorithm to determine whether two elements of S exist whose sum is exactly x.
(1): Binary search
sort S in O(nlogn); for (int i = 0; i < n; ++i) { binarysearch S[i] in S[i+1,n] }
Scan
sort S in O(nlogn); i = 0; j = n - 1; while (i < j) { if (s[i] + s[j] < X) { i++; } else if (s[i] + s[j] > X) { j--; } else { break; } }
(2)
i = 0; j = n - 1; while (i < j) { if (s[i] + s[j] < X) { i++; } else if (s[i] + s[j] > X) { j--; } else { break; } }
Give an efficient algorithm to compute the union of sets A and B, where n = max( | A | , | B | ). The output should be an array of distinct elements that form the union of the sets, such that no element appears more than once in the union.
set U to empty; int i = 0; int j = 0; while (i < na && j < nb) { if (A[i] < B[j]) { add A[i] into U; i++; } else if (A[i] > B[j]) { add B[j] into U; j++; } else { add A[i] into U; i++; j++; } } while (i < na) { add A[i] into U; i++; } while (j < nb) { add B[j] into U; j++; }
Given a set S of n integers and an integer T, give an O(n^{k−1} log n) algorithm to test whether k of the integers in S add up to T.
Design an O(n) algorithm that, given a list of n elements, finds all the elements that appear more than n / 2 times in the list. Then, design an O(n) algorithm that, given a list of n elements, finds all the elements that appear more than n / 4 times.
Hash Table 可以解决。或
数组中最多有一个数超过重复 n/2 次,并且排序后的第 ceiling(n/2)个数必定是这个数。
// Boyer-Moore majority vote: determine whether some element occurs more
// than n/2 times in array[0..n-1].  On success, *res receives it and true
// is returned.  O(n) time.
// IMPROVEMENT vs. original: the std::stack of equal elements only ever
// holds copies of one value, so it is replaced by a (candidate, count)
// pair — same behavior, O(1) extra space instead of O(n).
bool FindMoreThanHalf(int *array, int n, int *res) {
  int candidate = 0;
  int count = 0;  // size of the conceptual stack of equal elements
  for (int i = 0; i < n; ++i) {
    if (count == 0) {
      candidate = array[i];  // stack empty: push a new candidate
      count = 1;
    } else if (array[i] == candidate) {
      ++count;               // same value: push
    } else {
      --count;               // different value: cancel one candidate
    }
  }
  if (count == 0) {
    return false;
  }
  // The surviving candidate is the only possible majority element;
  // verify it actually occurs more than n/2 times.
  int times = 0;
  for (int i = 0; i < n; ++i) {
    if (array[i] == candidate) {
      ++times;
    }
  }
  if (times > n / 2) {
    *res = candidate;
    return true;
  }
  return false;
}
Devise an algorithm for finding the k smallest elements of an unsorted set of n integers in O(n + klogn).
You wish to store a set of n numbers in either a max-heap or a sorted array. For each application below, state which data structure is better, or if it does not matter. Explain your answers.
Give an O(nlogk)-time algorithm that merges k sorted lists with a total of n elements into one sorted list. (Hint: use a heap to speed up the elementary O(kn)-time algorithm).
(a) Give an efficient algorithm to find the second-largest key among n keys. You can do better than 2n − 3 comparisons. (b) Then, give an efficient algorithm to find the third-largest key among n keys. How many key comparisons does your algorithm do in the worst case? Must your algorithm determine which key is largest and second-largest in the process?
Random Selection可以找出任意的第几大值,平均时间复杂度:O(n),比较次数将是 n 的倍数,最坏时间复杂度可以达到:O(nlogn)。
Tournament Algorithm找第二大元素比较次数 O(n+logn);找第 k 个最大元素,比较次数为 O(n+klogn)。
Use the partitioning idea of quicksort to give an algorithm that finds the median element of an array of n integers in expected O(n) time. (Hint: must you look at both sides of the partition?)
// Seed for the reentrant PRNG used by randint (relies on time/rand_r,
// declared elsewhere in the file — presumably via <ctime>/<cstdlib>).
unsigned int seed = time(NULL);

// Random integer drawn (approximately uniformly) from [m, n].
int randint(int m, int n) {
  return m + rand_r(&seed) / (RAND_MAX / (n + 1 - m) + 1);
}

// Randomized quickselect: rearrange array[l..u] so that array[k] holds
// the element that would occupy index k in sorted order.  Expected O(n)
// time; the random pivot guards against adversarial inputs.
void RandomSelectionK(int *array, int l, int u, int k) {
  if (l >= u) {
    return;
  }
  // Move a randomly chosen pivot to the front.
  swap(array[l], array[randint(l, u)]);
  int pivot = array[l];
  int i = l;
  int j = u + 1;
  // Hoare-style partition: i scans right past elements < pivot,
  // j scans left past elements > pivot, swapping stragglers.
  while (true) {
    do {
      ++i;
    } while (i <= u && array[i] < pivot);
    do {
      --j;
    } while (array[j] > pivot);
    if (i > j) {
      break;
    }
    swap(array[i], array[j]);
  }
  // Place the pivot at its final sorted position j, then recurse only
  // into the half that contains index k.
  swap(array[l], array[j]);
  if (j < k) {
    RandomSelectionK(array, j + 1, u, k);
  } else if (j > k) {
    RandomSelectionK(array, l, j - 1, k);
  }
}
f(n) = 2*f(n/2) + n ==> f(n) = 2^k * f(n/2^k) + kn ==> f(n) = O(nlogn) f(n) = f(n/3) + f(2n/3) + n ==> f(n) = O(nlogn)
Suppose an array A consists of n elements, each of which is red, white, or blue. We seek to sort the elements so that all the reds come before all the whites, which come before all the blues The only operation permitted on the keys are
Find a correct and efficient algorithm for red-white-blue sorting. There is a linear-time solution.
2 次扫描。
Stable sorting algorithms leave equal-key items in the same relative order as in the original permutation. Explain what must be done to ensure that mergesort is a stable sorting algorithm.
在合并时元素相等时选 index 小的元素在前。
Show that n positive integers in the range 1 to k can be sorted in O(nlogk) time. The interesting case is when k < < n.
We seek to sort a sequence S of n integers with many duplications, such that the number of distinct integers in S is O(logn). Give an O(nloglogn) worst-case time algorithm to sort such sequences.
balanced binary search tree.
Let A[1..n] be an array such that the first \(n-\sqrt n\) elements are already sorted (though we know nothing about the remaining elements). Give an algorithm that sorts A in substantially better than nlogn steps.
+ $O(\sqrt{n}\log\sqrt{n})$ 排序后面的 $\sqrt{n}$ 个元素。Assume that the array A[1..n] only has numbers from \(\{1,\ldots, n^2\}\) but that at most loglogn of these numbers ever appear. Devise an algorithm that sorts A in substantially less than O(nlogn).
和 23 一样,用 balanced binary search tree,树的高度不超过 loglogn,最后的复杂度 O(n*logloglogn)。
Let P be a simple, but not necessarily convex, polygon and q an arbitrary point not necessarily in P. Design an efficient algorithm to find a line segment originating from q that intersects the maximum number of edges of P. In other words, if standing at point q, in what direction should you aim a gun so the bullet will go through the largest number of walls. A bullet through a vertex of P gets credit for only one wall. An O(nlogn) algorithm is possible.
A company database consists of 10,000 sorted names, 40% of whom are known as good customers and who together account for 60% of the accesses to the database. There are two data structure options to consider for representing the database:
Only if we do not find the query name on a binary search of the first array do we do a binary search of the second array. Demonstrate which option gives better expected performance. Does this change if linear search on an unsorted array is used instead of binary search for both options?
single array is better.
two array is better.
Suppose you are given an array A of n sorted numbers that has been circularly shifted k positions to the right. For example, {35,42,5,15,27,29} is a sorted array that has been circularly shifted k = 2 positions, while {27,29,35,42,5,15} has been shifted k = 4 positions.
if (k == 0) { return A[n-1]; } else { return A[k-1]; }
// Return the largest value of a sorted array that was circularly shifted:
// array[l..h] is an ascending run followed by a second ascending run of
// smaller values.  The maximum sits immediately before the "drop" between
// the two runs.  O(log n) binary search.
int FindLargestNumber(int *array, int l, int h) {
  // No rotation inside this segment: it is fully ascending, so the last
  // element is the largest.
  if (array[l] < array[h]) {
    return array[h];
  }
  if (l == h) {
    return array[h];
  }
  int mid;
  mid = (l + h) / 2;
  // The drop happens right after mid: mid holds the maximum...
  if ((mid + 1 <= h) && array[mid] > array[mid + 1]) {
    return array[mid];
  }
  // ...or right at mid: mid-1 holds the maximum.
  if ((mid - 1 >= l) && array[mid - 1] > array[mid]) {
    return array[mid - 1];
  }
  // Otherwise recurse into whichever half still contains the drop:
  // if mid lies in the second (smaller) run, the maximum is to its left.
  if (array[mid] < array[h]) {
    return FindLargestNumber(array, l, mid - 1);
  } else {
    return FindLargestNumber(array, mid + 1, h);
  }
}
Consider the numerical 20 Questions game. In this game, Player 1 thinks of a number in the range 1 to n. Player 2 has to figure out this number by asking the fewest number of true/false questions. Assume that nobody cheats.
Suppose that you are given a sorted sequence of distinct integers . Give an O(lgn) algorithm to determine whether there exists an i index such as ai = i. For example, in { − 10, − 3,3,5,7}, a3 = 3. In {2,3,4,5,6,7}, there is no such i.
// Determine whether a sorted array of DISTINCT integers contains some
// a_i == i, where indices are 1-based (array[mid] corresponds to index
// mid + 1, matching the book's example {-10,-3,3,5,7} with a3 = 3).
// Distinctness makes a_i - i monotone, so binary search applies: O(lg n).
bool CheckEqualIndex(int *array, int l, int h) {
  while (l <= h) {
    int mid = (l + h) / 2;
    if (array[mid] > (mid + 1)) {
      // Values already exceed their indices; a match can only be left.
      h = mid - 1;
    } else if (array[mid] < (mid + 1)) {
      // Values lag behind their indices; a match can only be right.
      l = mid + 1;
    } else {
      return true;
    }
  }
  return false;
}
Suppose that you are given a sorted sequence of distinct integers , drawn from 1 to m where n < m. Give an O(lgn) algorithm to find an integer that is not present in a. For full credit, find the smallest such integer.
// Given a sorted array of distinct integers drawn from 1..m (with n < m,
// so at least one value is absent), return the smallest missing integer
// in O(lg n) time.  Indices are 1-based: while no value is missing up to
// position mid, array[mid] == mid + 1.
int FindMissingElement(int *array, int l, int h) {
  while (l <= h) {
    int mid = (l + h) / 2;
    if (array[mid] > (mid + 1)) {
      // A gap exists at or before mid.
      h = mid - 1;
    } else if (array[mid] <= (mid + 1)) {
      // Prefix 1..mid+1 is complete; the first gap lies to the right.
      l = mid + 1;
    }
  }
  // l counts the leading values that are all in place, so l + 1 is the
  // smallest value not present.
  return l + 1;
}
Let M be an n*m integer matrix in which the entries of each row are sorted in increasing order (from left to right) and the entries in each column are in increasing order (from top to bottom). Give an efficient algorithm to find the position of an integer x in M, or to determine that x is not there. How many comparisons of x with matrix entries does your algorithm use in worst case?
O(m+n)
// Search for x in an n x m matrix whose rows are sorted increasing
// left-to-right and whose columns are sorted increasing top-to-bottom.
// Starting at the top-right corner, each comparison discards one row or
// one column, so at most n + m - 1 comparisons are made.  On success the
// position is stored in (*pos_x, *pos_y) and true is returned.
// BUG FIX vs. original: the fall-through return was `true`, so a missing
// x was reported as found (with *pos_x/*pos_y untouched); it must be false.
bool FindElement(int **array, int x, int n, int m, int *pos_x, int *pos_y) {
  int row = 0, col = m - 1;
  while (row < n && col >= 0) {
    if (array[row][col] == x) {
      *pos_x = row;
      *pos_y = col;
      return true;
    } else if (array[row][col] > x) {
      col--;  // everything below in this column is even larger
    } else {
      row++;  // everything to the left in this row is even smaller
    }
  }
  return false;
}
Consider an n*n array A containing integer elements (positive, negative, and zero). Assume that the elements in each row of A are in strictly increasing order, and the elements of each column of A are in strictly decreasing order. (Hence there cannot be two zeroes in the same row or the same column.) Describe an efficient algorithm that counts the number of occurrences of the element 0 in A. Analyze its running time.
// Count the zero entries of an n x n matrix whose rows strictly increase
// left-to-right and whose columns strictly decrease top-to-bottom.
// Strict ordering means no row or column holds two zeros.  Walking from
// the bottom-right corner discards one row or one column per step, so
// the whole scan is O(n).
int CountZero(int **array, int n) {
  int zeros = 0;
  int r = n - 1;
  int c = n - 1;
  while (r >= 0 && c >= 0) {
    int cell = array[r][c];
    if (cell > 0) {
      c--;        // smaller values lie to the left
    } else if (cell < 0) {
      r--;        // larger values lie above
    } else {
      zeros++;    // found a zero; its row holds no other zero
      r--;
    }
  }
  return zeros;
}
If you are given a million integers to sort, what algorithm would you use to sort them? How much time and memory would that consume?
Describe advantages and disadvantages of the most popular sorting algorithms.
Merge sort:
Insertion/Selection sort:
Heap sort:
Quick sort:
Implement an algorithm that takes an input array and returns only the unique elements in it.
排序,然后扫描输出.O(nlogn).
You have a computer with only 2Mb of main memory. How do you use it to sort a large file of 500 Mb that is on disk?
Design a stack that supports push, pop, and retrieving the minimum element in constant time. Can you do this?
只有一个 stack 办不到.如果两个 stack,可以利用另外一个 stack 存储最小值.
Given a search string of three words, find the smallest snippet of the document that contains all three of the search words—i.e., the snippet with smallest number of words in it. You are given the index positions where these words occur in the document, such as word1: (1, 4, 5), word2: (3, 9, 10), and word3: (2, 6, 15). Each of the lists are in sorted order, as above.
复杂度:O(nlogk),n 是所有单词出现位置的总个数,k 是单词个数。这里 k=3,所以 O(n)
#include <queue> using std::priority_queue; #include <utility> using std::make_pair; using std::pair; #include <vector> using std::vector; #include <algorithm> using std::max; using std::min; #include <limits> using std::numeric_limits; int FindSmallestSnippet(vector<vector<int> > &index_positions) { // max-priority, select smallest position, use -index_positions[i][j], (i,j) priority_queue<pair<int, pair<int ,int> > > queue; int max_pos = 0; // the max pos of the snippet int i; for (i = 0; i < index_positions.size(); ++i) { int pos = index_positions[i][0]; max_pos = max(max_pos, pos); queue.push(make_pair(-pos, make_pair(i, 0))); } int smallest_len = numeric_limits<int>::max(); while (queue.size() == index_positions.size()) { int min_pos = -queue.top().first; smallest_len = min(smallest_len, max_pos - min_pos + 1); int word_pos = queue.top().second.first; int index = queue.top().second.second; queue.pop(); ++index; if (index < index_positions[word_pos].size()) { int next_pos = index_positions[word_pos][index]; max_pos = max(max_pos, next_pos); queue.push(make_pair(-next_pos, make_pair(word_pos, index))); } } return smallest_len; }
You are given 12 coins. One of them is heavier or lighter than the rest. Identify this coin in just three weighings.
Advantages of contiguously-allocated arrays include:
The downside of arrays is that we cannot adjust their size in the middle of a program’s execution.
Actually, we can efficiently enlarge arrays as we need them, through the miracle of dynamic arrays. The apparent waste in this procedure involves the recopying of the old contents on each expansion. Thus, each of thenelements move only two times on average, and the total work of managing the dynamic array is the sameO(n) as it would have been if a single array of sufficient size had been allocated in advance! The primary thing lost using dynamic arrays is the guarantee that each array access takes constant time in the worst case.
The relative advantages of linked lists over static arrays include:
while the relative advantages of arrays include:
A common problem for compilers and text editors is determining whether the parentheses in a string are balanced and properly nested. For example, the string ((())())() contains properly nested pairs of parentheses, which the strings )()( and ()) do not. Give an algorithm that returns true if a string contains properly nested and balanced parentheses, and false if otherwise. For full credit, identify the position of the first offending parenthesis if the string is not properly nested and balanced.
#include <string>
using std::string;
#include <stack>
using std::stack;

// Check whether a parenthesis string is balanced and properly nested.
// Returns true if so; otherwise returns false and stores in *pos the index
// of the first offending parenthesis: either the first ')' that has no
// matching '(', or the earliest '(' left unmatched at the end.
// BUG FIX vs. original: the stack stored a constant (kLeftPar) instead of
// positions, so leftover '(' cases reported the string's last index rather
// than the offending '('.  Storing indices fixes that.
bool BalancedParentheses(string parentheses, int *pos) {
  stack<int> stk;  // indices of currently unmatched '(' characters
  int i;
  for (i = 0; i < parentheses.size(); ++i) {
    if (parentheses[i] == '(') {
      stk.push(i);
    } else {
      if (stk.empty()) {
        *pos = i;  // ')' with nothing to match it
        return false;
      }
      stk.pop();
    }
  }
  if (!stk.empty()) {
    // All remaining entries are unmatched '('; the first offender is the
    // earliest one, i.e. the bottom of the stack.
    int first = stk.top();
    while (!stk.empty()) {
      first = stk.top();
      stk.pop();
    }
    *pos = first;
    return false;
  }
  return true;
}
Write a program to reverse the direction of a given singly-linked list. In other words, after the reversal all pointers should now point backwards. Your algorithm should take linear time.
// Singly-linked list node.
struct Node {
  int value;
  struct Node *next;
  Node(int in_value, struct Node* in_next) : value(in_value), next(in_next) { }
};

// Reverse the list in place by re-pointing every node at its predecessor.
// *head is updated to the old tail.  Linear time, constant extra space.
// Tolerates a NULL head pointer and an empty list.
void ReverseLinkedList(Node **head) {
  if (head == NULL || *head == NULL) {
    return;
  }
  Node *reversed = NULL;    // already-reversed prefix (grows node by node)
  Node *remaining = *head;  // suffix still in original order
  while (remaining != NULL) {
    Node *following = remaining->next;
    remaining->next = reversed;
    reversed = remaining;
    remaining = following;
  }
  *head = reversed;
}
We have seen how dynamic arrays enable arrays to grow while still achieving constant-time amortized performance. This problem concerns extending dynamic arrays to let them both grow and shrink on demand.
(a) Consider an underflow strategy that cuts the array size in half whenever the array falls below half full. Give an example sequence of insertions and deletions where this strategy gives a bad amortized cost.
(b) Then, give a better underflow strategy than that suggested above, one that achieves constant amortized cost per deletion.
Design a dictionary data structure in which search, insertion, and deletion can all be processed inO(1) time in the worst case. You may assume the set elements are integers drawn from a finite set 1,2, .., n, and initialization can take O(n)time.
因为元素个数是有限集合中的数,用 bit array 表示每个数。
Find the overhead fraction (the ratio of data space over total space) for each of the following binary tree implementations on n nodes:
(a) All nodes store data, two child pointers, and a parent pointer. The data field requires four bytes and each pointer requires four bytes.
(b) Only leaf nodes store data; internal nodes store two child pointers. The data field requires four bytes and each pointer requires two bytes.
Describe how to modify any balanced tree data structure such that search, insert, delete, minimum, and maximum still take O(logn) time each, but successor and predecessor now take O(1) time each. Which operations have to be modified to support this?
在树节点中添加指向 successor 和 predecessor 的指针。不影响操作 search, minimum, 和 maximum。只需在 insert 和 delete 操作相应更新指向 successor 和 predecessor 的指针。
Suppose you have access to a balanced dictionary data structure, which supports each of the operations search, insert, delete, minimum, maximum, successor, and predecessor in O(logn) time. Explain how to modify the insert and delete operations so they still take O(logn) but now minimum and maximum take O(1) time. (Hint: think in terms of using the abstract dictionary operations, instead of mucking about with pointers and the like.)
存储 max 和 min 这两个数。
Design a data structure to support the following operations:
All operations must take O(logn) time on an n-element set.
Balanced binary tree.
A concatenate operation takes two sets S1 and S2, where every key in S1 is smaller than any key in S2, and merges them together. Give an algorithm to concatenate two binary search trees into one binary search tree. The worst-case running time should be O(h), where h is the maximal height of the two trees.
S1 中的所有元素小于 S2,用 O(logn)的时间找出 S2 的最小元素,然后 S1 成为它的左子树,S2 成为它的右子树,组成新的搜索树。
In the bin-packing problem, we are given n metal objects, each weighing between zero and one kilogram. Our goal is to find the smallest number of bins that will hold the n objects, with each bin holding one kilogram at most.
使用 BST。主要找到能容纳这个元素的最小 bin,若所有 bin 都小于这个元素大小,就插入一个新的。
min_node = NULL; while node != NULL: if (node->weight >= w && node->left < w) { min_node = node; break; } else if (node->left >= w) { node = node->left; } else { node = node->right; } if (min_node == NULL) { bst->insert(new node(w)); } else { bst->delete(min_node); min_node->weight -= w; bst->insert(min_node); }
最大堆使用。每次选最大容量的 bin。若最大 bin 小于这个元素大小,就插入一个新的。
Suppose that we are given a sequence of n values x1,x2, …, xn and seek to quickly answer repeated queries of the form: given i and j, find the smallest value in xi,…,xj.
(a) Design a data structure that uses O(n2) space and answers queries in O(1) time.
(b) Design a data structure that uses O(n) space and answers queries in O(logn) time. For partial credit, your data structure can use O(nlogn) space and have O(logn) query time.
Suppose you are given an input set S of n numbers, and a black box that if given any sequence of real numbers and an integer k instantly and correctly answers whether there is a subset of input sequence whose sum is exactly k. Show how to use the black box O(n) times to find a subset of S that adds up to k.
R = S for i = 1 to n: if bb(R/{si}) is True: R = R / {si}
Let A[1..n] be an array of real numbers. Design an algorithm to perform any sequence of the following operations:
• Add(i,y)– Add the value y to the ith number.
• Partial-sum(i)– Return the sum of the first i numbers
There are no insertions or deletions; the only change is to the values of the numbers. Each operation should take O(logn) steps. You may use one additional array of size n as a work space.
建立叶节点数 n 的 balanced binary tree,n 个叶节点依次存储 A[1..n],树的内节点存储子树的和。
Extend the data structure of the previous problem to support insertions and deletions. Each element now has both a key and a value. An element is accessed by its key. The addition operation is applied to the values, but the elements are accessed by its key. The Partial sum operation is different.
The worst case running time should still be O(nlogn) for any sequence of O(n) operations.
建立以 key 排序的平衡搜索二叉树,并每个节点中添加一个左子树和的值。
Design a data structure that allows one to search, insert, and delete an integer X in O(1) time (i.e. , constant time, independent of the total number of integers stored). Assume that 1≤X≤n and that there are m+n units of space available, where m is the maximum number of integers that can be in the table at any one time. (Hint: use two arrays A[1..n] and B[1..m].) You are not allowed to initialize either A or B, as that would take O(m) or O(n) operations. This means the arrays are full of random garbage to begin with, so you must be very careful.
与Programming Pearls的 Column 课后题一样。
建立两个数组 A[1..n],B[1..m]和一个表示元素个数的变量 k。
What method would you use to look up a word in a dictionary?
Hash Table.
Imagine you have a closet full of shirts. What can you do to organize your shirts for easy retrieval?
以颜色排序,并二分搜索查找。
Write a function to find the middle node of a singly-linked list.
// Singly-linked list node.
struct Node {
  int value;
  Node *next;
};

// Return the middle node of a singly-linked list (for even length, the
// second of the two middle nodes); NULL for an empty list.  The fast
// pointer p advances one node per iteration while the slow pointer q
// advances once every two iterations, so a single pass suffices.
Node* FindMidNode(Node *head) {
  Node *p, *q;
  p = head;
  q = head;
  int i = 0;  // BUG FIX: i was used without ever being declared
  while (p != NULL) {
    i++;
    p = p->next;
    if (i == 2) {
      q = q->next;
      i = 0;
    }
  }
  return q;
}
Write a function to compare whether two binary trees are identical. Identical trees have the same key value at each position and the same structure.
// Binary tree node.
struct Node {
  int value;
  Node *left;
  Node *right;
};

// Two trees are identical iff they have the same shape and the same key
// at every position.  Recursive structural comparison; O(min(m, n)) time.
bool CompareBinaryTree(Node *head_m, Node *head_n) {
  if (head_m == NULL) {
    return head_n == NULL;  // both empty, or only the first is empty
  }
  if (head_n == NULL) {
    return false;           // only the second is empty
  }
  if (head_m->value != head_n->value) {
    return false;
  }
  return CompareBinaryTree(head_m->left, head_n->left)
      && CompareBinaryTree(head_m->right, head_n->right);
}
Write a program to convert a binary search tree into a linked list
// Singly-linked list node for the output list.
struct Node {
  int value;
  Node *next;
};

// Binary search tree node.
struct TNode {
  int value;
  TNode *left;
  TNode *right;
  TNode(int value_in) {
    value = value_in;
    left = NULL;
    right = NULL;
  }
};

// Pushes a freshly allocated node carrying `value` onto the front of the
// list `*head`.
void InsertToList(Node **head, int value) {
  Node *node = new Node;
  node->value = value;
  node->next = *head;
  *head = node;
}

// Converts a BST into a singly-linked list rooted at *head. Traverses
// right subtree, then node, then left subtree, pushing onto the FRONT of
// the list each time — so the final list comes out in ascending order.
void ConvertTreeToList(const TNode *root, Node **head) {
  if (root != NULL) {
    ConvertTreeToList(root->right, head);
    InsertToList(head, root->value);
    ConvertTreeToList(root->left, head);
  }
}
Implement an algorithm to reverse a linked list. Now do it without recursion.
// Reverses a singly-linked list in place, iteratively.
// *head is updated to point at the new first node (the old tail).
// A NULL head pointer or an empty list is a no-op.
void ReverseLinkedList(Node **head) {
  if (head == NULL || *head == NULL) {
    return;
  }
  Node *reversed = *head;       // front of the already-reversed prefix
  Node *rest = reversed->next;  // remainder of the original list
  reversed->next = NULL;        // old first node becomes the new tail
  while (rest != NULL) {
    Node *following = rest->next;
    rest->next = reversed;  // relink current node onto the reversed prefix
    reversed = rest;
    rest = following;
  }
  *head = reversed;
}
What is the best data structure for maintaining URLs that have been visited by a Web crawler? Give an algorithm to test whether a given URL has already been visited, optimizing both space and time.
Hash Table.
Reverse the words in a sentence—i.e., “My name is Chris” becomes “Chris is name My.” Optimize for time and space.
// Reverses the character range [begin, end] in place.
void Reverse(char *begin, char *end) {
  while (begin < end) {
    char temp = *begin;
    *begin = *end;
    *end = temp;
    ++begin;
    --end;
  }
}

// Reverses the order of the space-separated words in `str` in place,
// e.g. "My name is Chris" -> "Chris is name My".
// Strategy: reverse the whole string first, then re-reverse each word so
// its spelling is restored; the net effect reverses only the word order.
// O(n) time, O(1) extra space.
void ReverseWords(char *str) {
  char *end = str;
  while (*end != '\0') {
    ++end;
  }
  Reverse(str, end - 1);  // whole string reversed
  char *word_begin = NULL;  // start of the word currently being scanned
  for (char *p = str; *p != '\0'; ++p) {
    if (word_begin == NULL && *p != ' ') {
      word_begin = p;  // first character of a new word
    }
    if (word_begin != NULL && (p[1] == ' ' || p[1] == '\0')) {
      Reverse(word_begin, p);  // word complete: restore its spelling
      word_begin = NULL;
    }
  }
}
Determine whether a linked list contains a loop as quickly as possible without using any extra storage. Also, identify the location of the loop.
利用两个指针,一个快指针和一个慢指针,快的每次都比慢的多前进一个节点,如果存在 loop,快的总会与慢的相重叠。
loop 的起始点:
You have an unordered array X of n integers. Find the array M containing n elements where Mi is the product of all integers in X except for Xi. You may not use division. You can use extra memory. (Hint: There are solutions faster than O(n2).)
对数组 X 扫描 2 次计算出如下 2 组数组:
$$ \begin{align} P_{0} = 1; P_{k}=X_{k}P_{k-1}=\prod_{i=1}^{k}X_{i} \newline Q_{n+1} = 1; Q_{k}=X_{k}Q_{k+1}=\prod_{i=k}^{n}X_{i} \end{align} $$所以得到 M:
$$ \begin{align} M_{i} = P_{i-1} Q_{i+1}, i\in[1,n] \end{align} $$Give an algorithm for finding an ordered word pair (e.g., “New York”) occurring with the greatest frequency in a given webpage. Which data structures would you use? Optimize both time and space.
Hash Table.
Effective C++ 系列的作者 Scott Meyers 在 Dconf 中 The Last Thing D Needs 聊了些 C++的特性,稍微总结一下。
1 2 3 4 5 6 7 8 9 10 11 |
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
|
For
1
|
|
type deduction for cx yields:
Context | Type |
---|---|
auto | int |
decltype | const int |
template(T parameter) | int |
template(T& parameter) | const int |
template(T&& parameter) | const int& |
lambda (by-value capture) | const int |
lambda (init capture) | int |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 |
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
|
In essence, the One Definition Rule states that the same entity should have the exact same definition throughout an application, otherwise the effects are undefined.
The fundamental problem is that the code that doesn’t see the specialized version of your class template member function might still compile, is likely to link, and sometimes might even run. This is because in the absence of (a forward declaration of) the explicit specialization, the non-specialized version kicks in, likely implementing a generic functionality that works for your specialized type as well.
1 2 3 4 5 6 7 8 9 10 11 12 13 |
|
1 2 3 |
|
Sorts can be stable or unstable. Which are guaranteed to be stable? * sort –> not guaranteed * stable_sort –> guaranteed * list::sort –> guaranteed
Five sequence containers:
Essential Complexity: due to inherent design tensions.
Essential Complexity
1 2 3 |
|
What is the type of Point::x?
1 2 |
|
What is the type of cp.x?
C++ solution:
1 2 |
|
1 2 3 4 5 6 7 8 9 10 11 12 |
|
Assume typo and diagnose now?
Assume later specialization and defer lookup until instantiation?
C++ solution:
Accidental Complexity
erase
.sort
is sometimes stable. Our two most important tools are (1) the RAM model of computation and (2) the asymptotic analysis of worst-case complexity.
What value is returned by the following function? Express your answer as a function of n. Give the worst-case running time using the Big Oh notation.
function mystery(n) r:=0 for i:=1 to n-1 do for j:=i+1 to n do for k:=1 to j do r:=r+1 return(r)
Time: O(n3)
What value is returned by the following function? Express your answer as a function of n. Give the worst-case running time using Big Oh notation.
function pesky(n) r:=0 for i:=1 to n do for j:=1 to i do for k:=j to i+j do r:=r+1 return(r)
Time: O(n3).
What value is returned by the following function? Express your answer as a function of n. Give the worst-case running time using Big Oh notation.
function prestiferous(n) r:=0 for i:=1 to n do for j:=1 to i do for k:=j to i+j do for l:=1 to i+j-k do r:=r+1 return(r)
Time: O(n4).
Assume that Christmas has n days. Exactly how many presents did my “true love” send me? (Do some research if you do not understand this question.)
假设一共有n天,每 i 天收到的礼物数是:
\begin{align} p_i = \sum_{k=1}^{i}k \end{align}总的礼物数:
\begin{align} \sum_{i=1}^{n} p_i = \sum_{i=1}^{n}\sum_{k=1}^{i}k=\frac{n^3+3n^2+2n}{6} \end{align}You are given a set S of n numbers. You must pick a subset S’ of k numbers from S such that the probability of each element of S occurring in S’ is equal (i.e., each is selected with probability k / n). You may make only one pass over the numbers. What if n is unknown?
We have 1,000 data items to store on 1,000 nodes. Each node can store copies of exactly three different items. Propose a replication scheme to minimize data loss as nodes fail. What is the expected number of data entries that get lost when three random nodes fail?
不考虑 RAID 的 XOR 做法这里。
1000 个数据做 3 份拷贝,如何做 3 份拷贝呢?
3 份拷贝以相邻一格的方式存储,如下
nodes: 1 2 3 ... 1000 copy1: data1 data2 data3 .. data1000 copy2: data1000 data1 data2 .. data999 copy3: data999 data1000 data1 .. data998
每 3 个点共享 3 个拷贝点,如下
nodes: 1 2 3 ... 1000 copy1: data1 data2 data3 .. data1000 copy2: data3 data1 data2 .. data999 copy3: data2 data3 data1 .. data998
Consider the following algorithm to find the minimum element in an array of numbers A[0..N]. One extra variable tmp is allocated to hold the current minimum value. Start from A[0]; “tmp” is compared against A[1], A[2], ..., A[N] in order. When A[i] < tmp, tmp = A[i]. What is the expected number of times that the assignment operation tmp = A[i] is performed?
期望的次数是第n个元素是最小值的概率的总和。n个元素平均分布,任意元素是最小值的概率是 1/n。
E(n) = E(n-1) + 1/n, E(1) = 1,解得 E(n) = H(n) = 1 + 1/2 + … + 1/n(调和数,约为 ln n)。
You have a 100-story building and a couple of marbles. You must identify the lowest floor for which a marble will break if you drop it from this floor. How fast can you find this floor if you are given an infinite supply of marbles? What if you have only two marbles?
n 个球时在总楼层 r 中某个楼层 x 抛,两种情况: 1.破碎,剩下的总楼层 x-1 用剩下的 n-1 个球; 2.没破碎,剩下的总楼层 r-x 用 n 个球
如此把问题分解成小问题。如下代码求得最快的次数为 14。其中一条最坏情况: 9–>22–>34–>45–>55–>64–>72–>79–>85–>90–>94–>97–>99
/* Drop Marbles (dynamic program).
     n: number of marbles available.
     r: number of floors.
     drop_seq[i][j]: filled in with the best floor to drop from first when
       i marbles and j floors remain (caller-allocated, >= (n+1)x(r+1)).
   Returns the minimum number of trials needed to identify the critical
   floor in the worst case. Recurrence: dropping at floor x either breaks
   the marble (x-1 floors below remain, one marble fewer) or survives
   (j-x floors above remain, same marbles):
     marble_drop[i][j] = 1 + min over x in {1..j} of
                         max(marble_drop[i-1][x-1], marble_drop[i][j-x]).
   NOTE(review): uses a variable-length array (GCC/Clang extension) and
   relies on unqualified numeric_limits/max, i.e. on <limits>/<algorithm>
   plus a using-directive elsewhere in the file. */
int DropMarbles(int n, int r, int **drop_seq) {
  int marble_drop[n+1][r+1];
  int i, j;
  // With a single marble, j floors force j sequential drops (worst case).
  for (j = 0; j <= r; ++j) {
    marble_drop[1][j] = j;
  }
  // One floor always needs exactly one drop; zero floors need none.
  for (i = 0; i <= n; ++i) {
    marble_drop[i][1] = 1;
    marble_drop[i][0] = 0;
  }
  int min_sofar;
  for (i = 2; i <= n; ++i) {
    for (j = 2; j <= r; ++j) {
      marble_drop[i][j] = numeric_limits<int>::max();
      // Try every possible first-drop floor x; keep the best worst case.
      for (int x = 1; x <= j; ++x) {
        min_sofar = 1 + max(marble_drop[i-1][x-1], marble_drop[i][j-x]);
        if (min_sofar < marble_drop[i][j]) {
          marble_drop[i][j] = min_sofar;
          drop_seq[i][j] = x;  // remember the floor achieving the optimum
        }
      }
    }
  }
  return marble_drop[n][r];
}
You are given 10 bags of gold coins. Nine bags contain coins that each weigh 10 grams. One bag contains all false coins that weigh one gram less. You must identify this bag in just one weighing. You have a digital balance that reports the weight of what is placed on it.
一共 10 袋 bag1-10, 分别从 bag1 中取 1 个金币,bag2 中取 2 个金币……bag10 中取 10 个金币,称重总的重量 W。如果每个金币都是 10grams 的话,所以金币总重量是 550。N=550-W。得到缺失的重量,也是 bag 的号数,所以 bagN 中含有错误金币。
You have eight balls all of the same size. Seven of them weigh the same, and one of them weighs slightly more. How can you find the ball that is heavier by using a balance and only two weighings?
8==> 3,3,2
Suppose we start with n companies that eventually merge into one big company. How many different ways are there for them to merge?
1. 2 个公司(a,b)时,合并只有一种方法 [ab] 2. 当有 n 个公司时,如何把它用 n-1 个公司表示,f(n)=f(n-1)g(n) 3. n 个公司第一步从中选择两个公司合并,连带合并后的新公司一共 n-1 个公司,化简到 n-1 个公司表示。 4. n 个选 2 个的组合个数是: $\binom{n}{2}=n(n-1)/2$所以
$$ f(n) = \prod_{i=2}^{n}\frac{i(i-1)}{2} = \frac{n!\,(n-1)!}{2^{n-1}} $$
A Ramanujam number can be written two different ways as the sum of two cubes—i.e., there exist distinct a, b, c, and d such that a3 + b3 = c3 + d3. Generate all Ramanujam numbers where a,b,c,d < n.
#include <vector>
using std::vector;
// Two-pointer search inside the ascending cube table: looks for indices
// low <= i < j <= high with num_cube[i] + num_cube[j] == sum. On the
// first hit, appends i and j to *res and returns true.
bool FindEqual(const vector<int> &num_cube, int low, int high,
               const int &sum, vector<int> *res) {
  if (low >= high) {
    return false;
  }
  int i, j;
  i = low;
  j = high;
  int add;
  while (i < j) {
    add = num_cube[i] + num_cube[j];
    if (add == sum) {
      res->push_back(i);
      res->push_back(j);
      return true;
    }
    // Cubes are sorted ascending, so shrink from the appropriate side.
    if (add > sum) {
      --j;
    } else {
      ++i;
    }
  }
  return false;
}
// Generates Ramanujan-style solutions a^3 + b^3 = c^3 + d^3 with all four
// values < n: for each outer pair (i, j) it searches strictly between
// them for an inner pair with the same cube sum; each hit is emitted as
// {inner_i, inner_j, i, j}. The inner loop starts at j = i + 3 so that at
// least two distinct indices fit strictly between i and j.
// NOTE(review): FindEqual reports only the FIRST matching inner pair per
// (i, j) — presumably sufficient here; confirm if exhaustive listing per
// outer pair is required.
void RamanujamNum(int n, vector<vector<int> > *res) {
  vector<int> num_cube(n);
  int i, j;
  for (i = 0; i < n; ++i) {
    num_cube[i] = i*i*i;  // cube table: num_cube[k] = k^3
  }
  vector<int> ram_num;
  bool find;
  for (i = 0; i < n - 1; ++i) {
    for (j = i + 3; j < n; ++j) {
      find = FindEqual(num_cube, i+1, j-1, num_cube[i] + num_cube[j],
                       &ram_num);
      if (find) {
        ram_num.push_back(i);
        ram_num.push_back(j);
        res->push_back(ram_num);
        ram_num.clear();
      }
    }
  }
}
Six pirates must divide $300 dollars among themselves. The division is to proceed as follows. The senior pirate proposes a way to divide the money. Then the pirates vote. If the senior pirate gets at least half the votes he wins, and that division remains. If he doesn’t, he is killed and then the next senior-most pirate gets a chance to do the division. Now you have to tell what will happen and why (i.e., how many pirates survive and how the division is done)? All the pirates are intelligent and the first priority is to stay alive and the next priority is to get as much money as possible.
从后往前推
Reconsider the pirate problem above, where only one indivisible dollar is to be divided. Who gets the dollar and how many are killed?
要至少一半的同意,间隔要有一半的人会死去才会同意之前那个人,所以之后每 2+2K (K>=1)的海盗才能活。
Big things that are made from smaller things of exactly the same type as the big thing. A decomposition rule describes how to get smaller things from big things.
As all combinatorial objects above are recursive objects here are a few possible decompositon rules for them:
Write a function to perform integer division without using either the / or * operators. Find a fast way to do it.
// Computes *quot = m / n and *rem = m % n by repeated subtraction.
// O(m/n) iterations — slow baseline used for comparison.
// BUG FIX: the original assigned the ints m and 0 directly to the POINTERS
// rem and quot instead of dereferencing them.
void DivideCore(int m, int n, int *quot, int *rem) {
  *rem = m;
  *quot = 0;
  while (*rem >= n) {
    *rem -= n;
    ++*quot;
  }
}

// Integer division without using the / or * operators (the fast way).
// Computes *quot = m / n and *rem = m % n; assumes m >= 0 and n > 0
// (for n <= 0 or m < n it reports quotient 0, remainder m).
// Classic shift-and-subtract long division: repeatedly double n via
// addition (n + n avoids *) to find the largest chunk n * 2^k <= the
// remaining value, subtract it, and repeat — O(log(m/n)^2) additions.
// BUG FIX: the original's doubling loop tested `m % n == 0` (both wrong
// and itself a division) and then applied the subtraction loop with the
// already-doubled divisor, producing incorrect results.
void Divide(int m, int n, int *quot, int *rem) {
  *quot = 0;
  *rem = m;
  if (n <= 0 || m < n) {
    return;
  }
  while (*rem >= n) {
    int chunk = n;       // current multiple of n, built by doubling
    int chunk_quot = 1;  // how many n's `chunk` represents
    // Double while another doubling still fits in what remains.
    while (*rem - chunk >= chunk) {
      chunk += chunk;
      chunk_quot += chunk_quot;
    }
    *rem -= chunk;
    *quot += chunk_quot;
  }
}
There are 25 horses. At most, 5 horses can race together at a time. You must determine the fastest, second fastest, and third fastest horses. Find the minimum number of races in which this can be done.
7 次。
How many piano tuners are there in the entire world?
需要把问题分解:1.世界有多少架钢琴;2.每位钢琴调音师能调多少台钢琴。
估算世界有多少架钢琴,需要知道:
估算每位钢琴调音师能调多少台钢琴,需要知道:
How many gas stations are there in the United States?
分解问题成:
How much does the ice in a hockey rink weigh?
分解成:
做如下估算: 1.冰场的长度:70m; 2.冰场的宽度:30m; 3.冰的厚度:10cm=0.1; 4.冰的密度与水相当,估算 1000kg/m3 .
V = 70 * 30 * 0.1 = 210 m3 W = 210 *1000 = 210,000kg
How many miles of road are there in the United States?
美国近似是一个矩形,高 1000mile 和长 3000mile。美国大部分地区是乡村,道路比较稀疏,平均下来可以把美国想成一个网状的道路结构,每隔 1mile 一条道路,最后如下网格,1000 条 3000mile 和 3000 条 1000mile 的路,总的 6,000,000mile 的路。
On average, how many times would you have to flip open the Manhattan phone book at random in order to find a specific name?
假设电话本有 1000 页,也就是 500 个翻面。
简单答案:翻到正确页的概率是 1/500。
复杂点答案:上面没有考虑不断翻页,会翻到相同的页面。翻到错误页面的概率是 499/500,N 次后的错误概率是(499/500)N ,所以 N 次后的正确页面概率是 P=1- (499/500)N 。
那么: N=1 P = 0.002 N=2 P = 0.004 … N=1150 P = 0.89999
达到 90%的概率,所以需要 1150 翻页。
书本和习题大部分代码实现。
// Word-frequency counter using a hand-rolled chained hash table
// (Programming Pearls, Column 15): reads whitespace-separated words from
// stdin and prints each distinct word with its occurrence count.
// BUG FIX: removed stray "]]>" feed/CDATA residue that preceded the code
// and broke compilation; qualified std:: names so the block no longer
// depends on a using-directive elsewhere.

typedef struct Node* Nodeptr;

// One hash-chain entry: a word, its count, and the next entry in chain.
struct Node {
  Node(std::string inword, int incount, Nodeptr innext) {
    word = inword;
    count = incount;
    next = innext;
  }
  std::string word;
  int count;
  Nodeptr next;
};

#define NHASH 29989  // table size (prime)
#define MULT 31      // multiplier for the string hash

Nodeptr bin[NHASH];  // bucket heads

// Multiplicative string hash into [0, NHASH).
unsigned int Hash(const std::string &str) {
  unsigned int h = 0;
  for (std::string::const_iterator it = str.begin(); it != str.end();
       ++it) {
    h = MULT * h + *it;
  }
  return h % NHASH;
}

// Increments the count for str, inserting a new front-of-chain node the
// first time the word is seen.
void InWord(const std::string &str) {
  int h = Hash(str);
  for (Nodeptr p = bin[h]; p != NULL; p = p->next) {
    if (str.compare(p->word) == 0) {
      (p->count)++;
      return;
    }
  }
  bin[h] = new Node(str, 1, bin[h]);
}

int main(int argc, char *argv[]) {
  std::string str;
  int i;
  for (i = 0; i < NHASH; ++i) {
    bin[i] = NULL;
  }
  while (std::cin >> str) {
    InWord(str);
  }
  for (i = 0; i < NHASH; ++i) {
    for (Nodeptr p = bin[i]; p != NULL; p = p->next) {
      std::cout << p->word << " " << p->count << std::endl;
    }
  }
  return 0;
}
利用指针指向不同单词的开头,并按照 K 个单词对比方式排序,利用二分搜索定位相同 K 长度的文本,并利用Reservoir sampling在不知道长度的情况下,均等的随机选取一个。
#define MAXINPUT 4000000 #define MAXWORDS 800000 #define K 2 char input_letters[MAXINPUT]; char *word[MAXWORDS]; int WordNcmp(const char *p, const char *q, int n) { while (*p == *q) { if (*p == 0 && --n == 0) { return 0; } ++p; ++q; } return *p - *q; } int SortCmp(const void *a, const void *b) { const char **p = (const char**)(a); const char **q = (const char**)(b); return WordNcmp(*p, *q, K); } char* SkipNword(char *p, int n) { for (; n > 0; p++) { if (*p == 0) { --n; } } return p; } int FindPhrase(char **word, int n, char *phrase) { int l = -1; int u = n; int m; while (l + 1 != u) { m = (l + u) / 2; if (WordNcmp(word[m], phrase, K) < 0) { l = m; } else { u = m; } } return u; } int main(int argc, char *argv[]) { int nword = 0; word[0] = input_letters; while (scanf("%s", word[nword]) != EOF) { word[nword + 1] = word[nword] + strlen(word[nword]) + 1; nword++; if (nword == MAXWORDS) { break; } } int i; for (i = 0; i < K; ++i) { word[nword][i] = 0; } for (i = 0; i < K; ++i) { printf("%s ", word[i]); } qsort(word, nword, sizeof(word[0]), SortCmp); char *phrase = input_letters; int printlen = 100; int find_index; char *p; for (; printlen > 0; --printlen) { int find_index = FindPhrase(word, nword, phrase); for (i = 0; WordNcmp(phrase, word[find_index + i], K) == 0; ++i) { if ((rand() % (i + 1)) == 0) { p = word[find_index + i]; } } phrase = SkipNword(p, 1); if (strlen(SkipNword(phrase, K - 1)) == 0) { break; } printf("%s ", SkipNword(phrase, K - 1)); } printf("\n"); return 0; }
利用 Hash 表加快搜索相同 K 长度的文本。
#define MAXINPUT 4000000 #define MAXWORDS 800000 #define K 2 char input_letters[MAXINPUT]; char *word[MAXWORDS]; int WordNcmp(const char *p, const char *q, int n) { while (*p == *q) { if (*p == 0 && --n == 0) { return 0; } ++p; ++q; } return *p - *q; } char* SkipNword(char *p, int n) { for (; n > 0; p++) { if (*p == 0) { --n; } } return p; } #define NHASH 499979 #define MULT 31 int bin[NHASH]; int next[MAXWORDS]; unsigned int Hash(char *str) { unsigned int h = 0; char *p = str; for (int n = K; n > 0; p++) { h = MULT * h + (unsigned char)(*p); if (*p == 0) { --n; } } return h % NHASH; } void InitHash(char **word, int nword) { int i; for (i = 0; i < NHASH; ++i) { bin[i] = - 1; } for (i = 0; i < nword; ++i) { unsigned int h = Hash(word[i]); next[i] = bin[h]; bin[h] = i; } } int main(int argc, char *argv[]) { int nword = 0; word[0] = input_letters; while (scanf("%s", word[nword]) != EOF) { word[nword + 1] = word[nword] + strlen(word[nword]) + 1; nword++; if (nword == MAXWORDS) { break; } } int i; for (i = 0; i < K; ++i) { word[nword][i] = 0; } InitHash(word, nword); for (i = 0; i < K; ++i) { printf("%s ", word[i]); } char *phrase = input_letters; int printlen = 100; char *p; for (; printlen > 0; --printlen) { i = 0; for (int j = bin[Hash(phrase)]; j >= 0; j = next[j]) { if (WordNcmp(word[j], phrase, K) == 0 && (rand() % (++i) == 0)) { p = word[j]; } } phrase = SkipNword(p, 1); if (strlen(SkipNword(phrase, K - 1)) == 0) { break; } printf("%s ", SkipNword(phrase, K - 1)); } printf("\n"); return 0; }
// Restores the min-heap property for the 1-indexed heap x[l..u] by
// sifting the element at position l down to its proper place.
// BUG FIX: the original called unqualified swap(), which only compiled
// via a using-directive elsewhere in the file; now std::swap.
void SiftDown(int *x, int l, int u) {
  int parent = l;
  for (;;) {
    int child = parent + parent;  // left child in the 1-indexed heap
    if (child > u) {
      break;  // no children: done
    }
    // Pick the smaller of the two children.
    if (child + 1 <= u && x[child + 1] < x[child]) {
      ++child;
    }
    if (x[parent] <= x[child]) {
      break;  // heap property already holds here
    }
    std::swap(x[parent], x[child]);
    parent = child;
  }
}

// Heapsort of x[1..n] (1-indexed; x[0] is unused) into DESCENDING order:
// builds a min-heap bottom-up, then repeatedly swaps the minimum x[1]
// into the shrinking tail of the array. O(n log n), in place.
void HeapSort(int *x, int n) {
  // Heapify: sift down every internal node, deepest first.
  for (int i = n / 2; i >= 1; --i) {
    SiftDown(x, i, n);
  }
  // Move the current minimum to position i and re-heapify the rest.
  for (int i = n; i >= 2; --i) {
    std::swap(x[1], x[i]);
    SiftDown(x, 1, i - 1);
  }
}
找出最长重复超过 M 次的字符串。
经过排序后,越是相邻的越是相同的多,至少重复 M 次,就是计算相邻 M 个位置的字符所重复的字符长度,即 ComLen(pstr[i], pstr[i + kM])
// qsort comparator: orders suffix pointers lexicographically.
int CmpPstr(const void *a, const void *b) {
  const char **p = (const char **)a;
  const char **q = (const char **)b;
  return strcmp(*p, *q);
}
// Length of the common prefix of strings p and q.
int ComLen(char *p, char *q) {
  int i = 0;
  while (*p && (*p == *q)) {
    ++i;
    ++p;
    ++q;
  }
  return i;
}
#define kMaxN 500000
#define kM 1
char str[kMaxN];    // the input text
char *pstr[kMaxN];  // pstr[i] points at the suffix str[i..]
// Longest substring repeated more than kM times (suffix-array approach,
// Programming Pearls, Column 15): after sorting all suffixes, a substring
// occurring kM+1 times is a common prefix of suffixes kM apart in sorted
// order, so the answer is the longest ComLen(pstr[i], pstr[i + kM]).
int main(int argc, char *argv[]) {
  int ch;
  int n = 0;
  while ((ch = getchar()) != EOF) {
    str[n] = ch;
    pstr[n] = &str[n];
    ++n;
  }
  str[n] = 0;  // terminate the text so every pstr[i] is a C string
  qsort(pstr, n, sizeof(char *), CmpPstr);
  int maxlen = 0;
  int maxindex = 0;
  for (int i = 0; i < n - kM; ++i) {
    // Compare each suffix with the one kM positions later in sort order.
    if (ComLen(pstr[i], pstr[i + kM]) > maxlen) {
      maxlen = ComLen(pstr[i], pstr[i + kM]);
      maxindex = i;
    }
  }
  printf("%.*s\n", maxlen, pstr[maxindex]);
  return 0;
}
找出两个文本中最长的共同字符串。
经典Longest common substring problem. 利用 Dynamic Programming 解决。复杂度 O(mn).
// Returns all longest common substrings of s and t, one per distinct
// ending position in s, via the classic O(|s|*|t|) dynamic program where
// dp[i][j] = length of the longest common suffix of s[0..i] and t[0..j].
// Returns an empty vector when the strings share no character.
// BUG FIX: the original pushed the same end index i once for EVERY j that
// tied the maximum, so the result could contain duplicate copies of the
// same substring; ties at an already-recorded i are now skipped.
std::vector<std::string> LongestCommonString(const std::string &s,
                                             const std::string &t) {
  const int len_s = s.size();
  const int len_t = t.size();
  std::vector<std::vector<int> > dp(len_s, std::vector<int>(len_t, 0));
  int max_len = 0;
  std::vector<int> end_indexes;  // end positions (in s) of maximal matches
  for (int i = 0; i < len_s; ++i) {
    for (int j = 0; j < len_t; ++j) {
      if (s[i] != t[j]) {
        continue;  // dp[i][j] stays 0
      }
      dp[i][j] = (i == 0 || j == 0) ? 1 : dp[i - 1][j - 1] + 1;
      if (dp[i][j] > max_len) {
        // Strictly longer match: restart the result set.
        max_len = dp[i][j];
        end_indexes.clear();
        end_indexes.push_back(i);
      } else if (dp[i][j] == max_len &&
                 (end_indexes.empty() || end_indexes.back() != i)) {
        end_indexes.push_back(i);
      }
    }
  }
  std::vector<std::string> res;
  for (std::vector<int>::iterator it = end_indexes.begin();
       it != end_indexes.end(); ++it) {
    res.push_back(s.substr(*it - max_len + 1, max_len));
  }
  return res;
}
产生字母(字符)层次的 Markov 文本(上文两个程序是单词层次的版本)。
// Letter-level Markov text generation of order kK: repeatedly scan the
// source text for every occurrence of the current kK-character window and
// emit the successor of one occurrence chosen uniformly at random
// (reservoir sampling via `rand() % eq_sofar == 0`).
int main(int argc, char *argv[]) {
  const int kMax = 50000;     // max input size in characters
  const int kK = 5;           // context window length (characters)
  const int kPrintlen = 1000; // max characters of generated output
  char str[kMax];
  int c, n;
  n = 0;
  while ((c = getchar()) != EOF) {
    str[n++] = c;
  }
  str[n] = 0;
  char *p, *q, *next_p;
  p = str;  // current context window
  int i, eq_sofar, j;
  // Seed the output with the first kK characters of the source.
  for (i = 0; i < kK; ++i) {
    printf("%c", str[i]);
  }
  for (i = 0; i < kPrintlen; ++i) {
    eq_sofar = 0;  // occurrences of the current window seen so far
    // Scan every kK-length window of the text.
    for (q = str; q < str + n - kK + 1; ++q) {
      for (j = 0; j < kK && *(p + j) == *(q + j); ++j) {
      }
      if (j == kK) {  // q matches the current context
        eq_sofar++;
        if (rand() % eq_sofar == 0) {  // keep with probability 1/eq_sofar
          next_p = q;
        }
      }
    }
    c = *(next_p + kK);  // character following the chosen occurrence
    if (c == 0) {
      break;  // chosen occurrence was at the very end of the source
    }
    putchar(c);
    p = next_p + 1;  // slide the context window forward
  }
  return 0;
}
// Swaps array[m] and array[n].
void swap(int *array, int m, int n) {
  int temp = array[m];
  array[m] = array[n];
  array[n] = temp;
}

// Returns a random integer roughly uniform in [m, n] (Bentley's randint).
int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}

// Insertion sort of array[0..n-1] into ascending order.
// BUG FIX: the original compared `array[j-1] < t` (wrong direction, so it
// did not produce ascending order), allowed j to reach 0 and read
// array[-1], and finally stored t at array[j-1] instead of array[j].
void isort(int *array, int n) {
  int i, j, t;
  for (i = 1; i < n; ++i) {
    t = array[i];
    for (j = i; j > 0 && array[j - 1] > t; --j) {
      array[j] = array[j - 1];  // shift larger elements right
    }
    array[j] = t;
  }
}

// Quicksort with a Lomuto-style partition around pivot array[l].
void qsort1(int *array, int l, int u) {
  /* use array[l] for the mid element */
  if (l >= u) {
    return;
  }
  int m = l;  // array[l+1..m] holds elements < pivot
  for (int i = l + 1; i <= u; ++i) {
    if (array[i] < array[l]) {
      swap(array, ++m, i);
    }
  }
  swap(array, l, m);  // put the pivot between the two regions
  qsort1(array, l, m - 1);
  qsort1(array, m + 1, u);
}

// Quicksort variant scanning from the back and growing the >= pivot
// region at the top of the subarray.
void qsort2(int *array, int l, int u) {
  /* use array[l] for the mid element, from back to start,
     always swap the first element */
  if (l >= u) {
    return;
  }
  int i, m;
  i = m = u + 1;
  do {
    do {
      --i;
    } while (array[i] < array[l]);
    swap(array, --m, i);
  } while (i > l);
  qsort2(array, l, m - 1);
  qsort2(array, m + 1, u);
}

// Quicksort with Hoare-style two-way partitioning, which behaves well
// when many elements are equal.
void qsort3(int *array, int l, int u) {
  /* two-way partition, use array[l] for the mid element */
  if (l >= u) {
    return;
  }
  int t = array[l];  // pivot value
  int i = l;
  int j = u + 1;
  for (;;) {
    do {
      ++i;
    } while (i <= u && array[i] < t);
    do {
      --j;
    } while (array[j] > t);
    if (i > j) {
      break;
    }
    swap(array, i, j);
  }
  swap(array, l, j);  // pivot into its final position
  qsort3(array, l, j - 1);
  qsort3(array, j + 1, u);
}

const int kCutOff = 50;

// qsort3 + random pivot + small subarrays left unsorted: subranges
// shorter than kCutOff are skipped, so the caller MUST finish with
// isort(array, n), which is O(n) on the nearly-sorted result.
void qsort4(int *array, int l, int u) {
  /* qsort3 + randomization + isort small subarrays */
  if (u - l < kCutOff) {
    return;  // left for the final insertion-sort pass
  }
  swap(array, l, randint(l, u));  // random pivot defeats adversarial input
  int t = array[l];
  int i = l;
  int j = u + 1;
  for (;;) {
    do {
      ++i;
    } while (i <= u && array[i] < t);
    do {
      --j;
    } while (array[j] > t);
    if (i > j) {
      break;
    }
    swap(array, i, j);
  }
  swap(array, l, j);
  // BUG FIX: the original recursed into qsort3 here, which fully sorted
  // the subarrays and defeated the small-subarray cutoff optimization.
  qsort4(array, l, j - 1);
  qsort4(array, j + 1, u);
}
从 n 中生成不重复的 m 个随机数。
// Prints, in increasing order, m integers chosen at random without
// replacement from [0, n) (sequential sampling, Programming Pearls,
// Column 12): candidate i is selected with probability
// select/remaining, which keeps every m-subset equally likely.
// NOTE(review): relies on `cout`/`endl` being visible via a
// using-directive elsewhere in the file.
void GenerateSortedRand(int m, int n) {
  int select = m;     // picks still needed
  int remaining = n;  // candidates still available
  for (int i = 0; i < n && select > 0; ++i) {
    if (rand() % remaining < select) {
      cout << i << " ";
      --select;
    }
    --remaining;
  }
  cout << endl;
}
// Same sequential-sampling idea as GenerateSortedRand, but updating the
// parameters m (picks still needed) in place — Knuth's Algorithm S.
// Prints m distinct values from [0, n) in increasing order.
void GenKnuth(int m, int n) {
  for (int i = 0; i < n && m > 0; ++i) {
    // Select i with probability (picks remaining)/(candidates remaining).
    if (rand() % (n - i) < m) {
      cout << i << " ";
      --m;
    }
  }
  cout << endl;
}
// Prints m distinct random values from [0, n) in increasing order by
// inserting into a std::set until it holds m distinct elements.
// NOTE(review): never terminates if m > n, and insertions slow down as
// the set fills (coupon-collector effect).
void GenSets(int m, int n) {
  set<int> num_set;
  while (num_set.size() < m) {
    num_set.insert(rand() % n);  // duplicates are silently ignored
  }
  for (set<int>::iterator it = num_set.begin(); it != num_set.end(); ++it) {
    cout << *it << " ";
  }
  cout << endl;
}
// Returns a random integer roughly uniform in [m, n].
int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}

// qsort comparator for ints, ascending.
int compare(const void *a, const void *b) {
  return (*static_cast<const int*>(a) - *static_cast<const int*>(b));
}

// Prints m distinct random integers from [0, n) in increasing order:
// runs the first m steps of a Fisher-Yates shuffle of 0..n-1, keeps the
// prefix, and sorts it.
void GenShuf(int m, int n) {
  int *x = new int[n];
  int i = 0;
  for (i = 0; i < n; ++i) {
    x[i] = i;
  }
  // Partial Fisher-Yates: after m steps, x[0..m-1] is a uniform m-sample.
  for (i = 0; i < m; ++i) {
    int j = randint(i, n - 1);
    int t = x[j];
    x[j] = x[i];
    x[i] = t;
  }
  qsort(x, m, sizeof(int), compare);
  for (i = 0; i < m; ++i) {
    std::cout << x[i] << " ";
  }
  std::cout << std::endl;
  // BUG FIX: memory from new[] must be released with delete[], not
  // delete (undefined behavior).
  delete[] x;
}
在数组 n 中以算法复杂度 O(n)找出第 k 个小的元素。
// Swaps array[m] and array[n].
void swap(int *array, int m, int n) {
  int temp;
  temp = array[m];
  array[m] = array[n];
  array[n] = temp;
}
// Returns a random integer roughly uniform in [m, n].
int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}
// Quickselect: partially sorts array[l..u] (Hoare partition around a
// random pivot) so that array[k] holds the k-th smallest element
// (0-based), with no larger element to its left and no smaller element
// to its right. Only the side containing k is recursed into, giving
// expected O(n) time.
void SelectK(int *array, int l, int u, int k) {
  if (l >= u) {
    return;
  }
  int t, i, j;
  swap(array, l, randint(l, u));  // random pivot into position l
  t = array[l];
  i = l;
  j = u + 1;
  for (;;) {
    do {
      ++i;
    } while (i <= u && array[i] < t);
    do {
      --j;
    } while (array[j] > t);
    if (i > j) {
      break;
    }
    swap(array, i, j);
  }
  swap(array, l, j);  // pivot lands in its final position j
  if (j < k) {
    SelectK(array, j + 1, u, k);
  } else if (j > k) {
    SelectK(array, l, j - 1, k);
  }
}
// Returns a random integer roughly uniform in [m, n].
int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}

// Returns a random non-negative int with far more than RAND_MAX distinct
// values by combining two rand() draws.
// BUG FIX: the original returned RAND_MAX * rand() + rand(), which
// overflows int (undefined behavior) for almost every draw. Instead take
// 15 bits from each draw — the C standard guarantees RAND_MAX >= 32767,
// and the 30-bit result always fits in a 32-bit int.
int bigrand() {
  return ((rand() & 0x7FFF) << 15) | (rand() & 0x7FFF);
}
// Returns a random integer roughly uniform in [m, n].
int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}
// Prints m consecutive integers (mod n) starting at a random offset —
// very cheap, but the values form a contiguous run rather than
// independent random draws.
void GenerateM(int m, int n) {
  int i, t;
  i = randint(0, n - 1);  // random starting offset
  for (int j = 0; j < m; ++j) {
    t = i + j;
    if (t >= n) {
      t -= n;  // wrap around modulo n
    }
    cout << t << " " << endl;
  }
  cout << endl;
}
0..n-1 中生成 m 个随机数。
// Returns a random integer roughly uniform in [m, n].
int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}

// qsort comparator for ints, ascending. (Unused by this variant, which
// prints in shuffle order; kept for interface compatibility.)
int compare(const void *a, const void *b) {
  return (*static_cast<const int*>(a) - *static_cast<const int*>(b));
}

// Prints m distinct random integers from [0, n) in RANDOM order: the
// first m steps of a Fisher-Yates shuffle of 0..n-1.
void GenShuf(int m, int n) {
  int *x = new int[n];
  int i = 0;
  for (i = 0; i < n; ++i) {
    x[i] = i;
  }
  for (i = 0; i < m; ++i) {
    int j = randint(i, n - 1);
    int t = x[j];
    x[j] = x[i];
    x[i] = t;
  }
  for (i = 0; i < m; ++i) {
    std::cout << x[i] << " ";
  }
  std::cout << std::endl;
  // BUG FIX: memory from new[] must be released with delete[], not
  // delete (undefined behavior).
  delete[] x;
}
如果允许有重复的数,如何生成排序的 m 个随机数。
// Prints m random values from [0, n) in increasing order, duplicates
// allowed, by collecting the draws in a std::multiset (which, unlike the
// set version, grows on every insert and so always terminates after
// exactly m draws).
void GenSets(int m, int n) {
  multiset<int> num_set;
  while (num_set.size() < m) {
    num_set.insert(rand() % n);
  }
  for (multiset<int>::iterator it = num_set.begin(); it != num_set.end(); ++it) {
    cout << *it << " ";
  }
  cout << endl;
}
如果可以重复并顺序随机。
// Prints m random values from [0, n) in random order, duplicates
// allowed: simply m independent draws.
// NOTE(review): relies on randint() being declared earlier in the file.
void GenM(int m, int n) {
  for (int i = 0; i < m; ++i) {
    cout << randint(0, n - 1) << " ";
  }
  cout << endl;
}
// Returns a random integer roughly uniform in [m, n].
int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}
// Floyd's sampling algorithm: prints m DISTINCT random values from
// [0, n) in increasing order using exactly m randint() calls. If the
// drawn value t is already in the set, i itself is inserted instead — i
// cannot already be present (previous draws were bounded by i-1), and
// this substitution keeps all m-subsets equally likely.
void GenSets(int m, int n) {
  set<int> num_set;
  int t;
  for (int i = n - m ; i < n; ++i) {
    t = randint(0, i);
    if (num_set.find(t) == num_set.end()) {
      num_set.insert(t);
    } else {
      num_set.insert(i);
    }
  }
  for (set<int>::iterator it = num_set.begin(); it != num_set.end(); ++it) {
    cout << *it << " ";
  }
  cout << endl;
}
// Returns a random integer roughly uniform in [m, n].
int randint(int m, int n) {
  return m + (rand() / (RAND_MAX / (n - m + 1) + 1));
}
// Reservoir sampling with sample size 1: scans a sequence of unknown
// length and returns one element such that each is chosen uniformly —
// the current choice is replaced with probability 1/(i+1) at step i
// (here via `randint(0, i) < 1`).
// NOTE(review): `object[]` and `IsEnd()` are defined elsewhere in the
// file; the loop condition `while (IsEnd(object[i]))` looks inverted —
// presumably it should continue while NOT at the end. TODO confirm.
int Select() {
  int res;
  int i = 0;
  res = object[i];  // first element starts as the provisional choice
  ++i;
  while (IsEnd(object[i])) {
    int j = randint(0, i);
    if (j < 1) {  // probability 1/(i+1)
      res = object[i];
    }
    ++i;
  }
  return res;
}
More: 选 k 个
生成 N>1e6 组的 m 个随机数,计算生成每个随机数出现的概率,是不是符合预期,还是偏差很大而不是随机的。
给出数组中找出连续子数组最大和。
直接算每个子区间的和并比较得出最大值。算法复杂度 O(n3)。
// O(n^3) brute-force maximum-subarray sum: for every pair (i, j),
// re-sum num[i..j] from scratch. Returns 0 for an empty vector or when
// every element is negative (the empty subarray wins).
// Fixes vs. the original: size_t loop indices remove the signed/unsigned
// comparison, the inner start index begins at j = i (the original
// restarted j at 0, pointlessly re-scanning empty ranges), and the
// vector type is std::-qualified so the block does not depend on a
// using-directive elsewhere.
float FindMaxSubvectorAlg1(const std::vector<float> &num) {
  float max_sofar = 0;
  for (std::size_t i = 0; i < num.size(); ++i) {
    for (std::size_t j = i; j < num.size(); ++j) {
      float sum = 0;
      for (std::size_t k = i; k <= j; ++k) {
        sum += num[k];
        if (sum > max_sofar) {
          max_sofar = sum;
        }
      }
    }
  }
  return max_sofar;
}
因为 x[i..j]直接的和可以基于 x[i..j-1]的和算出,不用重头开始算。算法复杂度 O(n2)。
// O(n^2) maximum-subarray sum: for each start i, extend j rightward
// while carrying a running sum, so num[i..j] is derived from num[i..j-1]
// in O(1). Returns 0 for an empty vector or all-negative input (the
// empty subarray wins).
// Fixes vs. the original: size_t indices remove the signed/unsigned
// comparison, and the vector type is std::-qualified so the block does
// not depend on a using-directive elsewhere.
float FindMaxSubvectorAlg2(const std::vector<float> &num) {
  float max_sofar = 0;
  for (std::size_t i = 0; i < num.size(); ++i) {
    float sum = 0;  // running sum of num[i..j]
    for (std::size_t j = i; j < num.size(); ++j) {
      sum += num[j];
      if (sum > max_sofar) {
        max_sofar = sum;
      }
    }
  }
  return max_sofar;
}
先算出 x[0..i]区间的和为 cum_vector[i]
,那么 x[i..j]区间的和就是
cum_vector[j] - cum-vector[i-1]
// O(n^2) maximum-subarray sum using a prefix-sum (cumulative) array:
// cum_vector[i] holds num[0] + ... + num[i-1] with cum_vector[0] == 0,
// so the sum of num[a..b] (0-based, inclusive) is
// cum_vector[b+1] - cum_vector[a]. Returns 0 for an empty vector or
// all-negative input (the empty subarray wins).
// BUG FIX: the original built the prefix sums as
// cum_vector[i] = cum_vector[i-1] + num[i], skipping num[0] and reading
// one element past the end of num on the last iteration; the correct
// term is num[i-1].
float FindMaxSubvectorAlg2b(const std::vector<float> &num) {
  std::vector<float> cum_vector(num.size() + 1);
  cum_vector[0] = 0;
  for (std::size_t i = 1; i < cum_vector.size(); ++i) {
    cum_vector[i] = cum_vector[i - 1] + num[i - 1];
  }
  float max_sofar = 0;
  for (std::size_t i = 1; i < cum_vector.size(); ++i) {
    for (std::size_t j = i; j < cum_vector.size(); ++j) {
      // Sum of num[i-1 .. j-1].
      float sum = cum_vector[j] - cum_vector[i - 1];
      if (sum > max_sofar) {
        max_sofar = sum;
      }
    }
  }
  return max_sofar;
}
Divide-and-Conquer 算法。
// Recursive helper: maximum-subarray sum within num[l..u] (inclusive).
float FindMaxSubvectorAlg3Core(const std::vector<float> &num, int l, int u) {
  if (l > u) {
    return 0;  // empty range: the empty subarray contributes 0
  }
  if (l == u) {
    return std::max<float>(num[l], 0);  // single element or empty subarray
  }
  int m = (l + u) / 2;
  // Best sum of a subarray ending exactly at m (extending leftward)...
  float lmax = 0;
  float sum = 0;
  for (int i = m; i >= l; --i) {
    sum += num[i];
    if (sum > lmax) {
      lmax = sum;
    }
  }
  // ...and the best starting exactly at m+1 (extending rightward);
  // together they give the best subarray crossing the midpoint.
  float rmax = 0;
  sum = 0;
  for (int i = m + 1; i <= u; ++i) {
    sum += num[i];
    if (sum > rmax) {
      rmax = sum;
    }
  }
  return std::max(lmax + rmax,
                  std::max(FindMaxSubvectorAlg3Core(num, l, m),
                           FindMaxSubvectorAlg3Core(num, m + 1, u)));
}

// O(n log n) divide-and-conquer maximum-subarray sum (Programming
// Pearls, Algorithm 3). Returns 0 for an empty vector or all-negative
// input. Fixes vs. the original: explicit empty-vector guard (the
// original computed num.size() - 1, wrapping the unsigned size before
// converting to int), and std:: qualification of max/vector so the block
// does not depend on a using-directive elsewhere.
float FindMaxSubvectorAlg3(const std::vector<float> &num) {
  if (num.empty()) {
    return 0;
  }
  return FindMaxSubvectorAlg3Core(num, 0, num.size() - 1);
}
假定已经解决了 x[0..i-1]的情况,那么如何扩展到 x[0..i]的情况,只多了 x[i] 元素?
max_sofar
,和必须以
x[i-1]结尾的子数组最大和;
max_sofar
比较,就能解决
x[0..i]的情况;
只扫描一遍,算法复杂度 O(n)。
// O(n) maximum-subarray sum (Kadane's algorithm): max_ending_here tracks
// the best sum of a subarray ending at the current element, clamped at 0
// because a negative prefix can never help; max_sofar tracks the best
// value seen anywhere. Returns 0 for an empty vector or all-negative
// input (the empty subarray wins).
// Fixes vs. the original: size_t loop index removes the signed/unsigned
// comparison, and the vector type is std::-qualified so the block does
// not depend on a using-directive elsewhere.
float FindMaxSubvectorAlg4(const std::vector<float> &num) {
  float max_sofar = 0;
  float max_ending_here = 0;
  for (std::size_t i = 0; i < num.size(); ++i) {
    max_ending_here += num[i];
    if (max_ending_here < 0) {
      max_ending_here = 0;  // drop a negative prefix
    }
    if (max_ending_here > max_sofar) {
      max_sofar = max_ending_here;
    }
  }
  return max_sofar;
}
cum[i]=x[0]+x[1]...x[i]
, 那么要 x[l..u]
区间的和为 0 的话,cum[l-1] = cum[u]
算法复杂度 O(n) + O(nlogn) + O(n-1) = O(nlogn).
找出子数组和与一个特定值 r 最相近,算法类似,只是 step3 找出与 r 最相近的相邻数组元素。
cum[i]=x[0]+x[1]...x[i]