OILS / vendor / souffle / utility / ParallelUtil.h

/*
 * Souffle - A Datalog Compiler
 * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved
 * Licensed under the Universal Permissive License v 1.0 as shown at:
 * - https://opensource.org/licenses/UPL
 * - <souffle root>/licenses/SOUFFLE-UPL.txt
 */

/************************************************************************
 *
 * @file ParallelUtil.h
 *
 * A set of utilities abstracting from the underlying parallel library.
 * Currently supported APIs: OpenMP and Cilk
 *
 ***********************************************************************/

#pragma once

#include <atomic>
#include <cassert>
#include <cstddef>
#include <memory>
#include <new>

// https://bugs.llvm.org/show_bug.cgi?id=41423
#if defined(__cpp_lib_hardware_interference_size) && (__cpp_lib_hardware_interference_size != 201703L)
using std::hardware_constructive_interference_size;
using std::hardware_destructive_interference_size;
#else
// 64 bytes on x86-64 │ L1_CACHE_BYTES │ L1_CACHE_SHIFT │ __cacheline_aligned │
// ...
constexpr std::size_t hardware_constructive_interference_size = 2 * sizeof(std::max_align_t);
constexpr std::size_t hardware_destructive_interference_size = 2 * sizeof(std::max_align_t);
#endif

#ifdef _OPENMP

/**
 * Implementation of parallel control flow constructs utilizing OpenMP
 */

#include <omp.h>

#ifdef __APPLE__
#define pthread_yield pthread_yield_np
#elif !defined(_MSC_VER)
#include <sched.h>
// pthread_yield is deprecated and should be replaced by sched_yield
#define pthread_yield sched_yield
#elif defined _MSC_VER
#include <thread>
#define NOMINMAX
#include <windows.h>
#define pthread_yield std::this_thread::yield
#endif

#ifdef _MSC_VER
// support for a parallel region
#define PARALLEL_START __pragma(omp parallel) {
#define PARALLEL_END }

// support for parallel loops
#define pfor __pragma(omp for schedule(dynamic)) for
#else
// support for a parallel region
#define PARALLEL_START _Pragma("omp parallel") {
#define PARALLEL_END }

// support for parallel loops
#define pfor _Pragma("omp for schedule(dynamic)") for
#endif
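
/*
 * Usage sketch: PARALLEL_START opens an OpenMP parallel region and pfor
 * distributes the loop iterations across the team with a dynamic schedule.
 * The names `n` and `process` below are hypothetical and only serve as an
 * illustration.
 *
 *   PARALLEL_START
 *       pfor(std::size_t i = 0; i < n; ++i) {
 *           process(i);  // executed concurrently by the OpenMP thread team
 *       }
 *   PARALLEL_END
 *
 * Without OpenMP the same code compiles to a plain sequential loop, since
 * the macros then expand to `{`, `for`, and `}` (see the fallback below).
 */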

// spawn and sync are processed sequentially (overhead too expensive)
#define task_spawn
#define task_sync

// section start / end => corresponding OpenMP pragmas
// NOTE: disabled since it causes performance losses
//#define SECTIONS_START _Pragma("omp parallel sections") {
// NOTE: we stick to flat-level parallelism since it is faster due to thread pooling
#define SECTIONS_START {
#define SECTIONS_END }

// the markers for a single section
//#define SECTION_START _Pragma("omp section") {
#define SECTION_START {
#define SECTION_END }

// a macro to create an operation context
#define CREATE_OP_CONTEXT(NAME, INIT) [[maybe_unused]] auto NAME = INIT;
#define READ_OP_CONTEXT(NAME) NAME

#else

// support for a parallel region => sequential execution
#define PARALLEL_START {
#define PARALLEL_END }

// support for parallel loops => simple sequential loop
#define pfor for

// spawn and sync not supported
#define task_spawn
#define task_sync

// sections are processed sequentially
#define SECTIONS_START {
#define SECTIONS_END }

// sections are inlined
#define SECTION_START {
#define SECTION_END }

// a macro to create an operation context
#define CREATE_OP_CONTEXT(NAME, INIT) [[maybe_unused]] auto NAME = INIT;
#define READ_OP_CONTEXT(NAME) NAME

// mark as sequential
#define IS_SEQUENTIAL

#endif

#ifndef IS_SEQUENTIAL
#define IS_PARALLEL
#endif

#ifdef IS_PARALLEL
#include <mutex>
#include <vector>
#define MAX_THREADS (omp_get_max_threads())
#else
#define MAX_THREADS (1)
#endif

namespace souffle {

struct SeqConcurrentLanes {
    struct TrivialLock {
        ~TrivialLock() {}
    };

    using lane_id = std::size_t;
    using unique_lock_type = TrivialLock;

    explicit SeqConcurrentLanes(std::size_t = 1) {}
    SeqConcurrentLanes(const SeqConcurrentLanes&) = delete;
    SeqConcurrentLanes(SeqConcurrentLanes&&) = delete;

    virtual ~SeqConcurrentLanes() {}

    std::size_t lanes() const {
        return 1;
    }

    void setNumLanes(const std::size_t) {}

    unique_lock_type guard(const lane_id) const {
        return TrivialLock();
    }

    void lock(const lane_id) const {
        return;
    }

    void unlock(const lane_id) const {
        return;
    }

    void beforeLockAllBut(const lane_id) const {
        return;
    }

    void beforeUnlockAllBut(const lane_id) const {
        return;
    }

    void lockAllBut(const lane_id) const {
        return;
    }

    void unlockAllBut(const lane_id) const {
        return;
    }
};

#ifdef IS_PARALLEL

/**
 * A small utility class for implementing simple locks.
 */
class Lock {
    // the underlying mutex
    std::mutex mux;

public:
    struct Lease {
        Lease(std::mutex& mux) : mux(&mux) {
            mux.lock();
        }
        Lease(Lease&& other) : mux(other.mux) {
            other.mux = nullptr;
        }
        Lease(const Lease& other) = delete;
        ~Lease() {
            if (mux != nullptr) {
                mux->unlock();
            }
        }

    protected:
        std::mutex* mux;
    };

    // acquires the lock for the life-cycle of the returned guard
    Lease acquire() {
        return Lease(mux);
    }

    void lock() {
        mux.lock();
    }

    bool try_lock() {
        return mux.try_lock();
    }

    void unlock() {
        mux.unlock();
    }
};
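
/*
 * Usage sketch: acquire() returns a Lease whose destructor releases the
 * mutex, so the critical section is bounded by the lease's scope. The names
 * `dataLock` and `update` below are hypothetical.
 *
 *   souffle::Lock dataLock;
 *
 *   void update() {
 *       auto lease = dataLock.acquire();  // locked here
 *       // ... modify the shared data ...
 *   }                                     // unlocked when the lease is destroyed
 *
 * Lock also provides lock()/try_lock()/unlock(), so a standard guard such as
 * std::lock_guard<souffle::Lock> can be used instead of a lease.
 */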

// /* valuable source: http://locklessinc.com/articles/locks/ */

namespace detail {

/* Pause instruction to prevent excess processor bus usage */
#if defined _MSC_VER
#define cpu_relax() YieldProcessor()
#else
#ifdef __x86_64__
#define cpu_relax() asm volatile("pause\n" : : : "memory")
#else
#define cpu_relax() asm volatile("" : : : "memory")
#endif
#endif

/**
 * A utility class managing waiting operations for spin locks.
 */
class Waiter {
    int i = 0;

public:
    Waiter() = default;

    /**
     * Conducts a wait operation.
     */
    void operator()() {
        ++i;
        if ((i % 1000) == 0) {
            // there was no progress => let others work
            pthread_yield();
        } else {
            // relax this CPU
            cpu_relax();
        }
    }
};
}  // namespace detail

/* compare: http://en.cppreference.com/w/cpp/atomic/atomic_flag */
class SpinLock {
    std::atomic<int> lck{0};

public:
    SpinLock() = default;

    void lock() {
        detail::Waiter wait;
        while (!try_lock()) {
            wait();
        }
    }

    bool try_lock() {
        int should = 0;
        return lck.compare_exchange_weak(should, 1, std::memory_order_acquire);
    }

    void unlock() {
        lck.store(0, std::memory_order_release);
    }
};
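
/*
 * Usage sketch: SpinLock satisfies the BasicLockable requirements, so it can
 * be combined with standard guards. The names `statsLock` and `record` below
 * are hypothetical.
 *
 *   souffle::SpinLock statsLock;
 *
 *   void record(int value) {
 *       std::lock_guard<souffle::SpinLock> guard(statsLock);
 *       // ... short critical section updating shared statistics ...
 *   }
 *
 * The Waiter used by lock() alternates between cpu_relax() and an occasional
 * pthread_yield(), keeping spinning cheap while still ceding the CPU when the
 * lock stays contended.
 */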

/**
 * A read/write lock for increased access performance on a
 * read-heavy use case.
 */
class ReadWriteLock {
    /**
     * Based on paper:
     *         Scalable Reader-Writer Synchronization
     *         for Shared-Memory Multiprocessors
     *
     * Layout of the lock:
     *      31        ...             2                    1                    0
     *      +-------------------------+--------------------+--------------------+
     *      | interested reader count |   waiting writer   | active writer flag |
     *      +-------------------------+--------------------+--------------------+
     */

    std::atomic<int> lck{0};

public:
    ReadWriteLock() = default;

    void start_read() {
        // add reader
        auto r = lck.fetch_add(4, std::memory_order_acquire);

        // wait until there is no writer any more
        detail::Waiter wait;
        while (r & 0x3) {
            // release reader
            end_read();

            // wait a bit
            wait();

            // apply as a reader again
            r = lck.fetch_add(4, std::memory_order_acquire);

        }  // while there is a writer => spin
    }

    void end_read() {
        lck.fetch_sub(4, std::memory_order_release);
    }

    void start_write() {
        detail::Waiter wait;

        // set wait-for-write bit
        auto stat = lck.fetch_or(2, std::memory_order_acquire);
        while (stat & 0x2) {
            wait();
            stat = lck.fetch_or(2, std::memory_order_acquire);
        }

        // the caller may starve here ...
        int should = 2;
        while (!lck.compare_exchange_strong(
                should, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
            wait();
            should = 2;
        }
    }

    bool try_write() {
        int should = 0;
        return lck.compare_exchange_strong(should, 1, std::memory_order_acquire, std::memory_order_relaxed);
    }

    void end_write() {
        lck.fetch_sub(1, std::memory_order_release);
    }

    bool try_upgrade_to_write() {
        int should = 4;
        return lck.compare_exchange_strong(should, 1, std::memory_order_acquire, std::memory_order_relaxed);
    }

    void downgrade_to_read() {
        // delete write bit + set num readers to 1
        lck.fetch_add(3, std::memory_order_release);
    }
};
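
/*
 * Usage sketch: readers bracket their access with start_read()/end_read(),
 * writers with start_write()/end_write(). The names `tableLock`, `table`,
 * `lookup`, and `insert` below are hypothetical.
 *
 *   souffle::ReadWriteLock tableLock;
 *
 *   int lookup(int key) {
 *       tableLock.start_read();
 *       int result = table.find(key);   // shared, read-only access
 *       tableLock.end_read();
 *       return result;
 *   }
 *
 *   void insert(int key, int value) {
 *       tableLock.start_write();
 *       table.put(key, value);          // exclusive access
 *       tableLock.end_write();
 *   }
 */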

/**
 * An implementation of an optimistic r/w lock.
 */
class OptimisticReadWriteLock {
    /**
     * The version number utilized for the synchronization.
     *
     * Usage:
     *      - even version numbers are stable versions, not being updated
     *      - odd version numbers are temporary versions, currently being updated
     */
    std::atomic<int> version{0};

public:
    /**
     * The lease utilized to link start and end of read phases.
     */
    class Lease {
        friend class OptimisticReadWriteLock;
        int version;

    public:
        Lease(int version = 0) : version(version) {}
        Lease(const Lease& lease) = default;
        Lease& operator=(const Lease& other) = default;
        Lease& operator=(Lease&& other) = default;
    };

    /**
     * A default constructor initializing the lock.
     */
    OptimisticReadWriteLock() = default;

    /**
     * Starts a read phase, making sure that there is currently no
     * active concurrent modification going on. The resulting lease
     * enables the invoking process to later-on verify that no
     * concurrent modifications took place.
     */
    Lease start_read() {
        detail::Waiter wait;

        // get a snapshot of the lease version
        auto v = version.load(std::memory_order_acquire);

        // spin while there is a write in progress
        while ((v & 0x1) == 1) {
            // wait for a moment
            wait();
            // get an updated version
            v = version.load(std::memory_order_acquire);
        }

        // done
        return Lease(v);
    }

    /**
     * Tests whether there have been concurrent modifications since
     * the given lease has been issued.
     *
     * @return true if no updates have been conducted, false otherwise
     */
    bool validate(const Lease& lease) {
        // check whether the version number has changed in the meantime
        std::atomic_thread_fence(std::memory_order_acquire);
        return lease.version == version.load(std::memory_order_relaxed);
    }

    /**
     * Ends a read phase by validating the given lease.
     *
     * @return true if no updates have been conducted since the
     *         issuing of the lease, false otherwise
     */
    bool end_read(const Lease& lease) {
        // check lease in the end
        return validate(lease);
    }

    /**
     * Starts a write phase on this lock by ensuring exclusive access
     * and invalidating any existing read lease.
     */
    void start_write() {
        detail::Waiter wait;

        // set last bit => make it odd
        auto v = version.fetch_or(0x1, std::memory_order_acquire);

        // check for concurrent writes
        while ((v & 0x1) == 1) {
            // wait for a moment
            wait();
            // get an updated version
            v = version.fetch_or(0x1, std::memory_order_acquire);
        }

        // done
    }

    /**
     * Tries to start a write phase unless there is a currently ongoing
     * write operation. In this case no write permission will be obtained.
     *
     * @return true if write permission has been granted, false otherwise.
     */
    bool try_start_write() {
        auto v = version.fetch_or(0x1, std::memory_order_acquire);
        return !(v & 0x1);
    }

    /**
     * Updates a read-lease to a write permission by a) validating that the
     * given lease is still valid and b) making sure that there is no currently
     * ongoing write operation.
     *
     * @return true if the lease was still valid and write permissions could
     *         be granted, false otherwise.
     */
    bool try_upgrade_to_write(const Lease& lease) {
        auto v = version.fetch_or(0x1, std::memory_order_acquire);

        // check whether write privileges have been gained
        if (v & 0x1) return false;  // there is another writer already

        // check whether there was no write since the gain of the read lock
        if (lease.version == v) return true;

        // if there was, undo write update
        abort_write();

        // operation failed
        return false;
    }

    /**
     * Aborts a write operation by reverting to the version number before
     * starting the ongoing write, thereby re-validating existing leases.
     */
    void abort_write() {
        // reset version number
        version.fetch_sub(1, std::memory_order_release);
    }

    /**
     * Ends a write operation by giving up the associated exclusive access
     * to the protected data and abandoning the provided write permission.
     */
    void end_write() {
        // update version number another time
        version.fetch_add(1, std::memory_order_release);
    }

    /**
     * Tests whether currently write permissions have been granted to any
     * client by this lock.
     *
     * @return true if so, false otherwise
     */
    bool is_write_locked() const {
        return version & 0x1;
    }
};
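
/*
 * Usage sketch: an optimistic read speculatively reads the protected data
 * without blocking writers and retries if validation shows that a write
 * intervened. The names `versionLock`, `data`, `readValue`, and `writeValue`
 * below are hypothetical.
 *
 *   souffle::OptimisticReadWriteLock versionLock;
 *
 *   int readValue() {
 *       while (true) {
 *           auto lease = versionLock.start_read();
 *           int snapshot = data;                 // speculative, unlocked read
 *           if (versionLock.end_read(lease)) {
 *               return snapshot;                 // no concurrent write observed
 *           }
 *           // otherwise a write happened in between => retry
 *       }
 *   }
 *
 *   void writeValue(int v) {
 *       versionLock.start_write();               // makes the version odd
 *       data = v;
 *       versionLock.end_write();                 // makes the version even again
 *   }
 */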

/** Concurrent lanes locking mechanism. */
struct MutexConcurrentLanes {
    using lane_id = std::size_t;
    using unique_lock_type = std::unique_lock<std::mutex>;

    explicit MutexConcurrentLanes(const std::size_t Sz) : Size(Sz), Attribution(attribution(Sz)) {
        Lanes = std::make_unique<Lane[]>(Sz);
    }
    MutexConcurrentLanes(const MutexConcurrentLanes&) = delete;
    MutexConcurrentLanes(MutexConcurrentLanes&&) = delete;

    virtual ~MutexConcurrentLanes() {}

    // Return the number of lanes.
    std::size_t lanes() const {
        return Size;
    }

    // Select a lane
    lane_id getLane(std::size_t I) const {
        if (Attribution == lane_attribution::mod_power_of_2) {
            return I & (Size - 1);
        } else {
            return I % Size;
        }
    }

    /** Change the number of lanes.
     * Do not use while threads are using this object.
     */
    void setNumLanes(const std::size_t NumLanes) {
        Size = (NumLanes == 0 ? 1 : NumLanes);
        Attribution = attribution(Size);
        Lanes = std::make_unique<Lane[]>(Size);
    }

    unique_lock_type guard(const lane_id Lane) const {
        return unique_lock_type(Lanes[Lane].Access);
    }

    // Lock the given lane.
    // Must eventually be followed by unlock(Lane).
    void lock(const lane_id Lane) const {
        Lanes[Lane].Access.lock();
    }

    // Unlock the given lane.
    // Must already be the owner of the lane's lock.
    void unlock(const lane_id Lane) const {
        Lanes[Lane].Access.unlock();
    }

    // Acquire the capability to lock all other lanes than the given one.
    //
    // Must eventually be followed by beforeUnlockAllBut(Lane).
    void beforeLockAllBut(const lane_id Lane) const {
        if (!BeforeLockAll.try_lock()) {
            // If we cannot get the lock immediately, it means it was acquired
            // concurrently by another lane that will also try to acquire our
            // lane lock.
            // So we release our lane lock to let the concurrent operation
            // progress.
            unlock(Lane);
            BeforeLockAll.lock();
            lock(Lane);
        }
    }

    // Release the capability to lock all other lanes than the given one.
    //
    // Must already be the owner of that capability.
    void beforeUnlockAllBut(const lane_id) const {
        BeforeLockAll.unlock();
    }

    // Lock all lanes but the given one.
    //
    // Must already have acquired the capability to lock all other lanes
    // by calling beforeLockAllBut(Lane).
    //
    // Must eventually be followed by unlockAllBut(Lane).
    void lockAllBut(const lane_id Lane) const {
        for (std::size_t I = 0; I < Size; ++I) {
            if (I != Lane) {
                Lanes[I].Access.lock();
            }
        }
    }

    // Unlock all lanes but the given one.
    // Must already be the owner of all the lanes' locks.
    void unlockAllBut(const lane_id Lane) const {
        for (std::size_t I = 0; I < Size; ++I) {
            if (I != Lane) {
                Lanes[I].Access.unlock();
            }
        }
    }

private:
    enum lane_attribution { mod_power_of_2, mod_other };

    struct Lane {
        alignas(hardware_destructive_interference_size) std::mutex Access;
    };

    static constexpr lane_attribution attribution(const std::size_t Sz) {
        assert(Sz > 0);
        if ((Sz & (Sz - 1)) == 0) {
            // Sz is a power of 2
            return lane_attribution::mod_power_of_2;
        } else {
            return lane_attribution::mod_other;
        }
    }

protected:
    std::size_t Size;
    lane_attribution Attribution;

private:
    mutable std::unique_ptr<Lane[]> Lanes;

    alignas(hardware_destructive_interference_size) mutable std::mutex BeforeLockAll;
};

class ConcurrentLanes : public MutexConcurrentLanes {
    using Base = MutexConcurrentLanes;

public:
    using lane_id = Base::lane_id;
    using Base::beforeLockAllBut;
    using Base::beforeUnlockAllBut;
    using Base::guard;
    using Base::lock;
    using Base::lockAllBut;
    using Base::unlock;
    using Base::unlockAllBut;

    explicit ConcurrentLanes(const std::size_t Sz) : MutexConcurrentLanes(Sz) {}
    ConcurrentLanes(const ConcurrentLanes&) = delete;
    ConcurrentLanes(ConcurrentLanes&&) = delete;

    lane_id threadLane() const {
        return getLane(static_cast<std::size_t>(omp_get_thread_num()));
    }

    void setNumLanes(const std::size_t NumLanes) {
        Base::setNumLanes(NumLanes == 0 ? omp_get_max_threads() : NumLanes);
    }

    unique_lock_type guard() const {
        return Base::guard(threadLane());
    }

    void lock() const {
        return Base::lock(threadLane());
    }

    void unlock() const {
        return Base::unlock(threadLane());
    }

    void beforeLockAllBut() const {
        return Base::beforeLockAllBut(threadLane());
    }

    void beforeUnlockAllBut() const {
        return Base::beforeUnlockAllBut(threadLane());
    }

    void lockAllBut() const {
        return Base::lockAllBut(threadLane());
    }

    void unlockAllBut() const {
        return Base::unlockAllBut(threadLane());
    }
};
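
/*
 * Usage sketch: each OpenMP thread maps to a lane via threadLane(), so
 * fine-grained work locks only its own lane while a global operation takes
 * all lanes. The names `lanes`, `insertLocal`, and `rebuildAll` below are
 * hypothetical, and the sequence of calls is an interpretation of the
 * comments on MutexConcurrentLanes above.
 *
 *   souffle::ConcurrentLanes lanes(MAX_THREADS);
 *
 *   void insertLocal() {
 *       auto guard = lanes.guard();   // RAII lock on this thread's lane only
 *       // ... update per-lane state ...
 *   }
 *
 *   void rebuildAll() {
 *       lanes.lock();                 // own lane
 *       lanes.beforeLockAllBut();     // capability to lock the other lanes
 *       lanes.lockAllBut();           // now exclusive over every lane
 *       // ... global restructuring ...
 *       lanes.unlockAllBut();
 *       lanes.beforeUnlockAllBut();
 *       lanes.unlock();
 *   }
 */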

#else

/**
 * A small utility class for implementing simple locks.
 */
struct Lock {
    class Lease {};

    // no locking if there is no parallel execution
    Lease acquire() {
        return Lease();
    }

    void lock() {}

    bool try_lock() {
        return true;
    }

    void unlock() {}
};

/**
 * A 'sequential' non-locking implementation for a spin lock.
 */
class SpinLock {
public:
    SpinLock() = default;

    void lock() {}

    bool try_lock() {
        return true;
    }

    void unlock() {}
};

class ReadWriteLock {
public:
    ReadWriteLock() = default;

    void start_read() {}

    void end_read() {}

    void start_write() {}

    bool try_write() {
        return true;
    }

    void end_write() {}

    bool try_upgrade_to_write() {
        return true;
    }

    void downgrade_to_read() {}
};

/**
 * A 'sequential' non-locking implementation for an optimistic r/w lock.
 */
class OptimisticReadWriteLock {
public:
    class Lease {};

    OptimisticReadWriteLock() = default;

    Lease start_read() {
        return Lease();
    }

    bool validate(const Lease& /*lease*/) {
        return true;
    }

    bool end_read(const Lease& /*lease*/) {
        return true;
    }

    void start_write() {}

    bool try_start_write() {
        return true;
    }

    bool try_upgrade_to_write(const Lease& /*lease*/) {
        return true;
    }

    void abort_write() {}

    void end_write() {}

    bool is_write_locked() const {
        return true;
    }
};

struct ConcurrentLanes : protected SeqConcurrentLanes {
    using Base = SeqConcurrentLanes;
    using lane_id = SeqConcurrentLanes::lane_id;
    using unique_lock_type = SeqConcurrentLanes::unique_lock_type;

    using Base::lanes;
    using Base::setNumLanes;

    explicit ConcurrentLanes(std::size_t Sz = MAX_THREADS) : Base(Sz) {}
    ConcurrentLanes(const ConcurrentLanes&) = delete;
    ConcurrentLanes(ConcurrentLanes&&) = delete;

    virtual ~ConcurrentLanes() {}

    lane_id threadLane() const {
        return 0;
    }

    unique_lock_type guard() const {
        return Base::guard(threadLane());
    }

    void lock() const {
        return Base::lock(threadLane());
    }

    void unlock() const {
        return Base::unlock(threadLane());
    }

    void beforeLockAllBut() const {
        return Base::beforeLockAllBut(threadLane());
    }

    void beforeUnlockAllBut() const {
        return Base::beforeUnlockAllBut(threadLane());
    }

    void lockAllBut() const {
        return Base::lockAllBut(threadLane());
    }

    void unlockAllBut() const {
        return Base::unlockAllBut(threadLane());
    }
};

#endif

/**
 * Obtains a reference to the lock synchronizing output operations.
 */
inline Lock& getOutputLock() {
    static Lock outputLock;
    return outputLock;
}
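
/*
 * Usage sketch: holding the output lock keeps writes from different threads
 * from interleaving. The message text below is hypothetical.
 *
 *   {
 *       auto lease = souffle::getOutputLock().acquire();
 *       std::cerr << "some diagnostic message\n";
 *   }   // lock released when the lease goes out of scope
 */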

}  // namespace souffle