Concurrent tags: use parallel_cpus instead of qemu_tcg_mttcg_enabled()
qemu_tcg_mttcg_enabled() will return true even when we only have a
single emulated CPU. This adds noticeable overhead to a CheriBSD boot
without `-smp` (243s instead of 215s).
Use an immutable view of `parallel_cpus` instead to avoid locking if
max_cpus == 1.
arichardson committed Jul 20, 2021
1 parent 6c66a34 commit 11e9301
Showing 2 changed files with 51 additions and 21 deletions.
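The pattern behind the fix is worth seeing in isolation: compute the flag once, from state that can no longer change, and serve every later query from an immutable boolean. Below is a minimal standalone sketch of that pattern; the `mttcg_enabled` and `max_cpus` variables are stand-ins for QEMU's `qemu_tcg_mttcg_enabled()` and `current_machine->smp.max_cpus`, so this models the idea rather than reproducing the patch.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static bool mttcg_enabled = true; /* stand-in for qemu_tcg_mttcg_enabled() */
static unsigned max_cpus = 1;     /* stand-in for current_machine->smp.max_cpus */

static bool need_tags;
static bool need_tags_initialized;

/* Computed once at tag-memory init and immutable afterwards, so every
 * hot-path query is a plain load of a bool rather than a call into TCG
 * state (or, worse, an atomic lock operation). */
static void tags_init(void)
{
    need_tags = mttcg_enabled && max_cpus > 1;
    need_tags_initialized = true;
}

static inline bool need_concurrent_tags(void)
{
    assert(need_tags_initialized); /* tags_init() must have run first */
    return need_tags;
}

int main(void)
{
    tags_init();
    /* MTTCG is on, but with max_cpus == 1 there is nothing to race with. */
    printf("need_concurrent_tags() = %d\n", need_concurrent_tags());
    return 0;
}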
41 changes: 28 additions & 13 deletions target/cheri-common/cheri_tagmem.c
@@ -36,6 +36,7 @@
#include "exec/exec-all.h"
#include "exec/log.h"
#include "exec/ramblock.h"
#include "hw/boards.h"
#include "cheri_defs.h"
#include "cheri-helper-utils.h"
// XXX: use hbitmap? Or a different data structure?
@@ -73,6 +74,9 @@
* used.
*/

+bool _need_concurrent_tags = false;
+bool _need_concurrent_tags_initialized = false;

// Define to do some extra checks around spinlocks
//#define DEBUG_SPIN_LOCKS

@@ -290,7 +294,7 @@ static void lock_tag_write_tag_and_release(lock_tag *lock, bool tag)
static bool lock_tag_read(lock_tag *lock)
{
#ifdef CONFIG_DEBUG_TCG
-assert(!parallel_cpus ||
+assert(!need_concurrent_tags() ||
(lock->as_int & (LOCKTAG_MASK_READERS | LOCKTAG_MASK_WRITE_LOCKED)));
#endif
return lock->as_int & LOCKTAG_MASK_TAG;
@@ -300,7 +304,7 @@ static bool lock_tag_write(lock_tag *lock, bool tag, bool check_locked)
{
#ifdef CONFIG_DEBUG_TCG
if (check_locked)
-assert(!parallel_cpus || (lock->as_int & LOCKTAG_MASK_WRITE_LOCKED));
+assert(!need_concurrent_tags() || (lock->as_int & LOCKTAG_MASK_WRITE_LOCKED));
#endif
bool old = lock->as_int & LOCKTAG_MASK_TAG;
lock->as_int = (lock->as_int & ~LOCKTAG_MASK_TAG) | tag;
@@ -309,9 +313,9 @@ static bool lock_tag_write(lock_tag *lock, bool tag, bool check_locked)

typedef struct CheriTagBlock {
// It would be silly to use locks for non-mttcg. So support both formats and
-// use one or the other depending on qemu_tcg_mttcg_enabled()
+// use one or the other depending on need_concurrent_tags()
// It looks like single stepping can be turned on/off, so probably best
-// not to use parallel_cpus.
+// not to use need_concurrent_tags().
union {
DECLARE_BITMAP(tag_bitmap, CAP_TAGBLK_SIZE);
lock_tag locked_tags[CAP_TAGBLK_SIZE];
@@ -323,7 +327,7 @@ static CheriTagBlock *cheri_tag_new_tagblk(RAMBlock *ram, uint64_t tagidx)
{
CheriTagBlock *tagblk, *old;

-size_t size = qemu_tcg_mttcg_enabled()
+size_t size = need_concurrent_tags()
? sizeof(((CheriTagBlock *)0)->locked_tags)
: sizeof(((CheriTagBlock *)0)->tag_bitmap);

@@ -361,7 +365,7 @@ static inline QEMU_ALWAYS_INLINE CheriTagBlock *cheri_tag_block(size_t tag_index
static inline QEMU_ALWAYS_INLINE bool tagmem_get_tag(void *tagmem, size_t index,
tag_reader_lock_t *lock)
{
-if (qemu_tcg_mttcg_enabled()) {
+if (need_concurrent_tags()) {
lock_tag *locktag = (lock_tag *)tagmem + index;
if (lock) {
*lock = (tag_reader_lock_t)locktag;
@@ -405,7 +409,7 @@ static inline QEMU_ALWAYS_INLINE int
tagmem_get_tag_many(void *tagmem, size_t index, bool take_lock)
{
unsigned long result;
-if (qemu_tcg_mttcg_enabled()) {
+if (need_concurrent_tags()) {
lock_tag *lock = ((lock_tag *)tagmem) + index;
if (!take_lock) {
// If we don't need the lock we can just read a whole bunch of
@@ -436,7 +440,7 @@ static inline QEMU_ALWAYS_INLINE void tagmem_set_tag(void *tagmem, size_t index,
tag_writer_lock_t *lock,
bool lock_only)
{
-if (qemu_tcg_mttcg_enabled()) {
+if (need_concurrent_tags()) {
lock_tag *lockTag = (lock_tag *)tagmem + index;
if (lock) {
*lock = (tag_writer_lock_t)(lockTag);
@@ -457,7 +461,7 @@ static inline QEMU_ALWAYS_INLINE void tagmem_set_tag(void *tagmem, size_t index,
static inline QEMU_ALWAYS_INLINE void
tagmem_set_tag_many(void *tagmem, size_t index, uint8_t tags, bool take_lock)
{
-if (qemu_tcg_mttcg_enabled()) {
+if (need_concurrent_tags()) {
lock_tag *lock = ((lock_tag *)tagmem) + index;

// TODO: This is only used by morello, but when I merged this I did
@@ -515,7 +519,7 @@ static inline QEMU_ALWAYS_INLINE void tagmem_clear_tag(void *tagmem,
tag_writer_lock_t *lock,
bool lock_only)
{
-if (qemu_tcg_mttcg_enabled()) {
+if (need_concurrent_tags()) {
lock_tag *lockTag = (lock_tag *)tagmem + index;
if (lock) {
*lock = (tag_writer_lock_t)lockTag;
@@ -555,6 +559,17 @@ void cheri_tag_init(MemoryRegion *mr, uint64_t memory_size)
assert(memory_region_size(mr) == memory_size &&
"Incorrect tag mem size passed?");

+if (!_need_concurrent_tags_initialized) {
+    _need_concurrent_tags =
+        qemu_tcg_mttcg_enabled() && current_machine->smp.max_cpus > 1;
+    _need_concurrent_tags_initialized = true;
+} else {
+    assert(_need_concurrent_tags ==
+           (qemu_tcg_mttcg_enabled() && current_machine->smp.max_cpus > 1));
+}
+info_report("%s: need_concurrent_tags()=%d, mttcg=%d, max_cpus=%d",
+            __func__, need_concurrent_tags(), qemu_tcg_mttcg_enabled(),
+            current_machine->smp.max_cpus);
size_t cheri_ntagblks = num_tagblocks(mr->ram_block);
mr->ram_block->cheri_tags =
g_malloc0(cheri_ntagblks * sizeof(CheriTagBlock *));
@@ -605,7 +620,7 @@ void *cheri_tagmem_for_addr(CPUArchState *env, target_ulong vaddr,

if (tagblk != NULL) {
const size_t tagblk_index = CAP_TAGBLK_IDX(tag);
-if (qemu_tcg_mttcg_enabled()) {
+if (need_concurrent_tags()) {
return tagblk->locked_tags + tagblk_index;
} else {
return tagblk->tag_bitmap + BIT_WORD(tagblk_index);
@@ -834,7 +849,7 @@ void cheri_tag_phys_invalidate(CPUArchState *env, RAMBlock *ram,
const size_t tagblk_index = CAP_TAGBLK_IDX(tag);
if (unlikely(env && qemu_log_instr_enabled(env))) {
bool old_tag =
-qemu_tcg_mttcg_enabled()
+need_concurrent_tags()
? tagblock_get_locktag(tagblk, tagblk_index, NULL)
: tagblock_get_tag(tagblk, tagblk_index);
if (vaddr) {
@@ -854,7 +869,7 @@ void cheri_tag_phys_invalidate(CPUArchState *env, RAMBlock *ram,
}
}

-if (qemu_tcg_mttcg_enabled()) {
+if (need_concurrent_tags()) {
tagblock_clear_locktag(tagblk, tagblk_index, false, false);
} else {
tagblock_clear_tag(tagblk, tagblk_index);
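Every branch rewritten above chooses between the same two tag representations, selected once per tag block: a plain bitmap when a single CPU makes races impossible, and an array of lockable per-tag words under MTTCG. Here is a standalone model of that union; TAGBLK_SIZE and the lock_tag layout are simplified stand-ins for CAP_TAGBLK_SIZE and the real reader/writer word, so treat it as a sketch of the shape rather than the actual structure.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TAGBLK_SIZE 4096 /* stand-in for CAP_TAGBLK_SIZE */

typedef struct {
    uint16_t as_int; /* simplified: the real one packs readers/writer/tag bits */
} lock_tag;

typedef struct TagBlock {
    union {
        /* one bit per tag: compact, but unsafe with concurrent writers */
        unsigned long tag_bitmap[TAGBLK_SIZE / (8 * sizeof(unsigned long))];
        /* one lockable word per tag: larger, but safe under MTTCG */
        lock_tag locked_tags[TAGBLK_SIZE];
    };
} TagBlock;

/* Mirrors cheri_tag_new_tagblk() above: size the allocation for the
 * representation that will actually be used. */
static size_t tagblk_alloc_size(bool concurrent)
{
    return concurrent ? sizeof(((TagBlock *)0)->locked_tags)
                      : sizeof(((TagBlock *)0)->tag_bitmap);
}

int main(void)
{
    printf("single-CPU block: %zu bytes\n", tagblk_alloc_size(false));
    printf("concurrent block: %zu bytes\n", tagblk_alloc_size(true));
    return 0;
}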
31 changes: 23 additions & 8 deletions target/cheri-common/cheri_tagmem.h
@@ -37,6 +37,7 @@
#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/cpu-common.h"
#include "exec/exec-all.h"
#include "exec/memory.h"
#include "cheri_tagmem_ex_locks.h"

@@ -103,6 +104,20 @@ extern tag_writer_lock_t notag_lock_high;
void cheri_tag_reader_lock_impl(tag_reader_lock_t lock);
void cheri_tag_writer_lock_impl(tag_writer_lock_t lock);

+extern bool _need_concurrent_tags;
+extern bool _need_concurrent_tags_initialized;
+
+static inline QEMU_ALWAYS_INLINE bool need_concurrent_tags(void)
+{
+    cheri_debug_assert(_need_concurrent_tags_initialized);
+    /*
+     * TODO: can parallel_cpus change at runtime? If not we don't need the
+     * separate variable.
+     */
+    cheri_debug_assert(_need_concurrent_tags == parallel_cpus);
+    return _need_concurrent_tags;
+}

static inline void cheri_tag_reader_release(tag_reader_lock_t lock)
{
if (lock != TAG_LOCK_NONE && lock != NULL)
@@ -124,7 +139,7 @@ cheri_get_exception_locks(CPUArchState *env)
static inline void cheri_tag_push_free_on_exception(CPUArchState *env,
tag_lock_t lock)
{
-if (qemu_tcg_mttcg_enabled()) {
+if (need_concurrent_tags()) {
cheri_exception_locks_t *locks = cheri_get_exception_locks(env);
assert(locks->fill != MAX_CHERI_EXCEPTION_LOCKS);
locks->locks[locks->fill++] = lock;
@@ -133,7 +148,7 @@ static inline void cheri_tag_push_free_on_exception(CPUArchState *env,

static inline tag_lock_t cheri_tag_pop_on_exception(CPUArchState *env)
{
-if (qemu_tcg_mttcg_enabled()) {
+if (need_concurrent_tags()) {
cheri_exception_locks_t *locks = cheri_get_exception_locks(env);
assert(locks->fill != 0);
tag_lock_t lock = locks->locks[--locks->fill];
@@ -147,7 +162,7 @@ void cheri_tag_free_lock(tag_lock_t lock);

static inline void cheri_tag_locks_exception_thrown(CPUState *cpu)
{
-if (!qemu_tcg_mttcg_enabled())
+if (!need_concurrent_tags())
return;
cheri_exception_locks_t *locks = &cpu->cheri_exception_locks;
for (size_t i = 0; i != MAX_CHERI_EXCEPTION_LOCKS; i++) {
@@ -184,7 +199,7 @@ cheri_tag_writer_pop_free_on_exception(CPUArchState *env)
return (tag_writer_lock_t)cheri_tag_pop_on_exception(env);
}

-#define cheri_tag_assert_not_mttcg() assert(!qemu_tcg_mttcg_enabled());
+#define cheri_tag_assert_not_mttcg() assert(!need_concurrent_tags());

/* Note: for cheri_tag_phys_invalidate, env may be NULL */
void cheri_tag_phys_invalidate(CPUArchState *env, RAMBlock *ram,
@@ -216,7 +231,7 @@ static inline void cheri_lock_for_tag_invalidate(CPUArchState *env,
tag_writer_lock_t *first,
tag_writer_lock_t *second)
{
-if (!qemu_tcg_mttcg_enabled()) {
+if (!need_concurrent_tags()) {
if (first)
*first = NULL;
if (second)
@@ -246,7 +261,7 @@ static inline void
cheri_lock_for_tag_invalidate_aligned(CPUArchState *env, target_ulong vaddr,
uintptr_t pc, tag_writer_lock_t *lock)
{
-if (!qemu_tcg_mttcg_enabled()) {
+if (!need_concurrent_tags()) {
*lock = NULL;
} else {
cheri_tag_invalidate_aligned_impl(env, vaddr, pc, lock, true);
@@ -263,7 +278,7 @@ static inline void cheri_lock_for_tag_get(CPUArchState *env, target_ulong vaddr,
int reg, hwaddr *ret_paddr, int *prot,
uintptr_t pc, tag_reader_lock_t *lock)
{
-if (!qemu_tcg_mttcg_enabled()) {
+if (!need_concurrent_tags()) {
*lock = NULL;
} else {
cheri_tag_get(env, vaddr, reg, ret_paddr, prot, pc, lock);
@@ -287,7 +302,7 @@ static inline void cheri_lock_for_tag_set(CPUArchState *env, target_ulong vaddr,
int reg, hwaddr *ret_paddr,
uintptr_t pc, tag_writer_lock_t *lock)
{
-if (!qemu_tcg_mttcg_enabled()) {
+if (!need_concurrent_tags()) {
*lock = NULL;
} else {
cheri_tag_set_impl(env, vaddr, reg, ret_paddr, pc, lock, true);
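Call sites stay mode-agnostic throughout the header: the cheri_lock_for_tag_* helpers store NULL into the lock when concurrent tags are disabled, and the release helpers ignore NULL and TAG_LOCK_NONE. A hedged usage fragment, not part of the commit, assuming env, vaddr, and reg are in scope inside a TCG helper where GETPC() is valid:

tag_reader_lock_t lock;
hwaddr paddr;
int prot;

/* With max_cpus == 1 this stores NULL into `lock` and takes no lock. */
cheri_lock_for_tag_get(env, vaddr, reg, &paddr, &prot, GETPC(), &lock);

/* ... read the tag / load the capability ... */

/* Safe in both modes: release is a no-op for NULL / TAG_LOCK_NONE. */
cheri_tag_reader_release(lock);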
