Badger归档 - 枫阿雨's blog

Badger源码导读(二) – 读写事务

crazyfay — Sun, 18 Dec 2022 05:59:47 +0000

Badger源码导读(二)

源码分析入口基准案例

先从Badger的基本使用入手

// main walks through Badger's basic API: open a DB, run a read-write
// transaction, run a read-only transaction, iterate over all keys, and
// trigger a value-log GC. Unexpected errors abort the program instead of
// being silently dropped.
func main() {
	// Open the DB. Stop on failure: the returned handle must not be used
	// (or closed) when Open reports an error.
	db, err := badger.Open(badger.DefaultOptions("tmp/badger"))
	if err != nil {
		panic(err)
	}
	defer db.Close()

	// Read-write transaction: returning a non-nil error from the closure
	// makes Update skip the commit.
	err = db.Update(func(txn *badger.Txn) error {
		if err := txn.Set([]byte("answer"), []byte("42")); err != nil {
			return err
		}
		_, err := txn.Get([]byte("answer"))
		return err
	})
	if err != nil {
		panic(err)
	}

	// Read-only transaction.
	err = db.View(func(txn *badger.Txn) error {
		// "answer_v1" is never written by this program, so ErrKeyNotFound
		// is the expected outcome; only other errors are reported.
		if _, err := txn.Get([]byte("answer_v1")); err != nil && err != badger.ErrKeyNotFound {
			return err
		}
		return nil
	})
	if err != nil {
		panic(err)
	}

	// Iterate over the keys.
	err = db.View(func(txn *badger.Txn) error {
		opts := badger.DefaultIteratorOptions
		opts.PrefetchSize = 10
		it := txn.NewIterator(opts)
		defer it.Close()
		for it.Rewind(); it.Valid(); it.Next() {
			item := it.Item()
			k := item.Key()
			err := item.Value(func(val []byte) error {
				fmt.Printf("key=%s, value=%s\n", k, val)
				return nil
			})
			if err != nil {
				return err
			}
		}
		return nil
	})
	if err != nil {
		panic(err)
	}

	// ErrNoRewrite only means the GC found nothing worth rewriting; it is
	// not a failure.
	if err := db.RunValueLogGC(0.7); err != nil && err != badger.ErrNoRewrite {
		panic(err)
	}
}

读写事务

在第一章db初始化的时候，我们发现参数opt里面有一个 oracle 实例

在事务的实现中oracle实例发挥着重要的作用

Oracle的实例,一个KV引擎并发事务的管理器,负责分配事务的版本号,用来实现MVCC功能

oracle实例

// newOracle builds the oracle (the manager that hands out transaction
// timestamps) from opt and starts both of its watermarks.
//
// nextTxnTs and readOnlyTs are deliberately left unset here; that is done
// after replay in Open.
func newOracle(opt Options) *oracle {
	orc := new(oracle)
	orc.isManaged = opt.managedTxns
	// Whether transactions are checked for conflicts.
	orc.detectConflicts = opt.DetectConflicts

	// WaterMarks must be 64-bit aligned for the atomic package, hence they
	// are held by pointer.
	// See https://golang.org/pkg/sync/atomic/#pkg-note-BUG.
	orc.readMark = &y.WaterMark{Name: "badger.PendingReads"}
	orc.txnMark = &y.WaterMark{Name: "badger.TxnTimestamp"}

	// The closer counts two goroutines: each Init below starts one process
	// goroutine, and each of those calls closer.Done() when it exits.
	orc.closer = z.NewCloser(2)
	for _, wm := range []*y.WaterMark{orc.readMark, orc.txnMark} {
		wm.Init(orc.closer)
	}
	return orc
}

WaterMark.Init()

// Init prepares the WaterMark for use: it allocates the buffered mark
// channel and starts the single goroutine that consumes it. It MUST be
// called before any other method.
func (w *WaterMark) Init(closer *z.Closer) {
	// Fixed capacity of the mark channel.
	const markChSize = 100

	w.markCh = make(chan mark, markChSize)
	go w.process(closer)
}

mark 结构体

// mark is the message type sent over WaterMark.markCh and consumed by
// WaterMark.process.
type mark struct {
   // Either this is an (index, waiter) pair or (index, done) or (indices, done).
   // Index (timestamp) this mark refers to.
   index   uint64
   // Signalling channel for a waiter; process closes it once the watermark
   // has reached index.
   waiter  chan struct{}
   // Batch of indices; process handles each one the same way as index.
   indices []uint64
   // Completion flag: false for a begin mark, true for a done mark.
   done    bool // Set to true if the index is done.
}

核心方法-Update(func(*badger.Txn))

db.Update(func(*badger.Txn))

// Update runs fn inside a fresh read-write transaction and commits it when
// fn succeeds. It returns ErrDBClosed if the DB is closed, fn's error if fn
// fails, and otherwise the result of the commit. The transaction is always
// discarded before Update returns.
//
// Update panics in managed mode, where start and commit timestamps are
// supplied by the end user (only useful for databases built on top of
// Badger, such as Dgraph).
func (db *DB) Update(fn func(txn *Txn) error) error {
	switch {
	case db.IsClosed():
		return ErrDBClosed
	case db.opt.managedTxns:
		panic("Update can only be used with managedDB=false.")
	}

	txn := db.NewTransaction(true)
	defer txn.Discard()

	// Invoke the caller's closure; commit only when it reports success.
	err := fn(txn)
	if err == nil {
		err = txn.Commit()
	}
	return err
}

`newTransaction(bool)`

// NewTransaction creates a non-managed transaction; update selects between
// a read-write (true) and a read-only (false) transaction.
func (db *DB) NewTransaction(update bool) *Txn {
	const isManaged = false
	return db.newTransaction(update, isManaged)
}

// newTransaction builds a Txn bound to db. A read-write request against a
// DB opened read-only is silently downgraded to a read-only transaction.
// Unless isManaged is set, the oracle assigns the transaction's read
// timestamp.
func (db *DB) newTransaction(update, isManaged bool) *Txn {
	// Read-only transactions skip the write and conflict bookkeeping below.
	readWrite := update && !db.opt.ReadOnly

	txn := &Txn{
		update: readWrite,
		db:     db,                      // back-reference to the owning DB
		count:  1,                       // One extra entry for BitFin.
		size:   int64(len(txnKey) + 10), // Some buffer for the extra entry.
	}

	if readWrite {
		// Every entry written through this transaction is buffered here
		// until commit.
		txn.pendingWrites = make(map[string]*Entry)
		if db.opt.DetectConflicts {
			// Set of key fingerprints used for conflict detection.
			txn.conflictKeys = make(map[uint64]struct{})
		}
	}

	if isManaged {
		return txn
	}
	// The oracle hands out the read timestamp.
	txn.readTs = db.orc.readTs()
	return txn
}

`oracle.readTs()`

// readTs returns the read timestamp for a new transaction in non-managed
// mode and panics in managed mode. Before returning it registers the
// timestamp on readMark and waits on txnMark for that timestamp.
func (o *oracle) readTs() uint64 {
   if o.isManaged {
      panic("ReadTs should not be retrieved for managed DB")
   }

   var readTs uint64
   // Take the oracle lock.
   o.Lock()
   // Read at nextTxnTs-1: newCommitTs hands out nextTxnTs as the commit
   // timestamp and then increments it, so nextTxnTs-1 is the most recent
   // commit timestamp assigned so far.
   readTs = o.nextTxnTs - 1
   // Register a read at this timestamp on the read watermark.
   o.readMark.Begin(readTs)
   o.Unlock()

   // Wait for all txns which have no conflicts, have been assigned a commit
   // timestamp and are going through the write to value log and LSM tree
   // process. Not waiting here could mean that some txns which have been
   // committed would not be read.
   // At this point the timestamp has been chosen and the read watermark
   // has been notified.
   y.Check(o.txnMark.WaitForMark(context.Background(), readTs))
   return readTs
}

`WaterMark.Begin()` 方法

// Begin sets the last index to the given value and queues a begin mark
// (done=false) for that index.
func (w *WaterMark) Begin(index uint64) {
   // Publish the new last index atomically.
   atomic.StoreUint64(&w.lastIndex, index)
   // Send on the mark channel created in Init (capacity 100); the consumer
   // is WaterMark.process.
   w.markCh <- mark{index: index, done: false}
}

`WaterMark.WaitForMark(ctx, index)` 方法

传入读时间戳

// WaitForMark waits until the given index is marked as done.
// It returns nil once the watermark has reached index, or ctx.Err() if ctx
// is cancelled first. oracle.readTs calls it with the read timestamp.
func (w *WaterMark) WaitForMark(ctx context.Context, index uint64) error {
   // Fast path: DoneUntil() is the watermark up to which everything is
   // done; if it already covers index there is nothing to wait for.
   if w.DoneUntil() >= index {
      return nil
   }
   waitCh := make(chan struct{})
   // Register a waiter for index. process closes waitCh once the watermark
   // reaches index (see WaterMark.process for how marks are handled).
   w.markCh <- mark{index: index, waiter: waitCh}

   select {
   case <-ctx.Done():
      return ctx.Err()
   case <-waitCh:
      return nil
   }
}

`txn.Set(key, val []byte)` 方法

set操作不会真正的写磁盘,只会对事务对象进行一定的操作,一切都基于内存,直到提交的时候才会持久化到磁盘,如果事务终止的话,直接将内存中的数据释放即可,也是保证事务一致性的因素之一

// Set stages a write of val under key in this transaction: the pair is
// wrapped in an Entry and handed to SetEntry.
func (txn *Txn) Set(key, val []byte) error {
	entry := NewEntry(key, val)
	return txn.SetEntry(entry)
}

// SetEntry stages the entry e in this transaction by delegating to
// txn.modify, and returns whatever error modify reports.
func (txn *Txn) SetEntry(e *Entry) error {
    return txn.modify(e)
}

`txn.modify(*Entry)`方法

// modify validates the entry e and records it in the transaction's
// in-memory pending writes. It returns an error when the transaction is
// read-only or discarded, when the key is empty, reserved or too long, when
// the value is too large, or when checkSize rejects the entry.
func (txn *Txn) modify(e *Entry) error {
   // Maximum key size. Per the comment on the size check below, the key
   // length must fit in a uint16 (65535); it is capped lower to leave room
   // for the prefix and the timestamp suffix appended at commit time.
   const maxKeySize = 65000

   switch {
   // Writes are not allowed in a read-only transaction.
   case !txn.update:
      return ErrReadOnlyTxn
   case txn.discarded:
      // The transaction has already been discarded.
      return ErrDiscardedTxn
   // Empty keys are rejected.
   case len(e.Key) == 0:
      return ErrEmptyKey
   // Keys starting with badgerPrefix (!badger!) are reserved for internal use.
   // Author's aside: a separate column family might have been a cleaner
   // design than a reserved key prefix.
   case bytes.HasPrefix(e.Key, badgerPrefix):
      return ErrInvalidKey
   // Key exceeds the agreed maximum size.
   case len(e.Key) > maxKeySize:
      // Key length can't be more than uint16, as determined by table::header.  To
      // keep things safe and allow badger move prefix and a timestamp suffix, let's
      // cut it down to 65000, instead of using 65536.
      return exceedsSize("Key", maxKeySize, e.Key)
   // Value is larger than a whole value log file.
   case int64(len(e.Value)) > txn.db.opt.ValueLogFileSize:
      return exceedsSize("Value", txn.db.opt.ValueLogFileSize, e.Value)
   // In in-memory mode the value must not exceed the value threshold.
   case txn.db.opt.InMemory && int64(len(e.Value)) > txn.db.valueThreshold():
      return exceedsSize("Value", txn.db.valueThreshold(), e.Value)
   }

   // Check the transaction's size limits.
   if err := txn.checkSize(e); err != nil {
      return err
   }

   // The txn.conflictKeys is used for conflict detection. If conflict detection
   // is disabled, we don't need to store key hashes in this map.
   if txn.db.opt.DetectConflicts {
      // Only a hash of the key is stored, so two distinct keys with the
      // same hash would be treated as the same key (a possible false
      // conflict).
      fp := z.MemHash(e.Key) // Avoid dealing with byte arrays.
      // Add the fingerprint to the conflict-detection set.
      txn.conflictKeys[fp] = struct{}{}
   }
   // If a duplicate entry was inserted in managed mode, move it to the duplicate writes slice.
   // Add the entry to duplicateWrites only if both the entries have different versions. For
   // same versions, we will overwrite the existing entry.
   if oldEntry, ok := txn.pendingWrites[string(e.Key)]; ok && oldEntry.version != e.version {
      // The key was already written in this transaction with a different
      // version: keep the older entry in duplicateWrites.
      txn.duplicateWrites = append(txn.duplicateWrites, oldEntry)
   }
   txn.pendingWrites[string(e.Key)] = e
   return nil
}

Txn 结构体

// Txn (excerpt): only the two fields that buffer a transaction's writes are
// shown; the remaining fields are elided with "...".
type Txn struct {
    ...
    pendingWrites   map[string]*Entry // cache stores any writes done by txn.
    duplicateWrites []*Entry          // Used in managed mode to store duplicate entries.
    ...
}

`txn.Get(key []byte)` 方法

// Get looks up key and returns its Item. In a read-write transaction the
// transaction's own pending writes are consulted first; otherwise the value
// is fetched from the DB at the transaction's read timestamp.
//
// It returns ErrEmptyKey for an empty key, ErrDiscardedTxn if the
// transaction was discarded, and ErrKeyNotFound if the key is absent,
// deleted or expired.
func (txn *Txn) Get(key []byte) (item *Item, rerr error) {
   // An empty key is rejected immediately.
   if len(key) == 0 {
      return nil, ErrEmptyKey
   // A discarded transaction can no longer be used.
   } else if txn.discarded {
      return nil, ErrDiscardedTxn
   }

   item = new(Item)
   // Read-write transaction.
   if txn.update {
      // If this transaction has already written the key, serve the read
      // from its own pending writes (the transaction reads its own writes).
      if e, has := txn.pendingWrites[string(key)]; has && bytes.Equal(key, e.Key) {
         // A pending delete or an expired entry counts as not found.
         if isDeletedOrExpired(e.meta, e.ExpiresAt) {
            return nil, ErrKeyNotFound
         }
         // Fulfill from cache.
         item.meta = e.meta
         item.val = e.Value
         item.userMeta = e.UserMeta
         item.key = key
         item.status = prefetched
         item.version = txn.readTs
         item.expiresAt = e.ExpiresAt
         // We probably don't need to set db on item here.
         return item, nil
      }
      // Only track reads if this is update txn. No need to track read if txn serviced it
      // internally.
      // Record that this transaction read the key (presumably into
      // txn.reads, which oracle.hasConflict inspects at commit time).
      txn.addReadKey(key)
   }
   // Read from the DB: seek the key suffixed with the read timestamp.
   seek := y.KeyWithTs(key, txn.readTs)
   vs, err := txn.db.get(seek)
   if err != nil {
      return nil, y.Wrapf(err, "DB::Get key: %q", key)
   }
   if vs.Value == nil && vs.Meta == 0 {
      return nil, ErrKeyNotFound
   }
   if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) {
      return nil, ErrKeyNotFound
   }

   item.key = key
   item.version = vs.Version
   item.meta = vs.Meta
   item.userMeta = vs.UserMeta
   item.vptr = y.SafeCopy(item.vptr, vs.Value)
   item.txn = txn
   item.expiresAt = vs.ExpiresAt
   return item, nil
}

`txn.Commit()` 方法

// Commit commits the transaction. It is a no-op when nothing was written.
// Otherwise it runs the pre-commit checks, hands the writes off via
// commitAndSend, and waits on the callback that commitAndSend returns. The
// transaction is discarded on the way out once the precheck has passed.
func (txn *Txn) Commit() error {
	// txn.conflictKeys can be empty if conflict detection is turned off, so
	// pendingWrites is the reliable indicator of whether anything was set.
	if len(txn.pendingWrites) == 0 {
		return nil // Nothing to do.
	}

	// Precheck before discarding txn.
	err := txn.commitPrecheck()
	if err != nil {
		return err
	}
	defer txn.Discard()

	// Obtain a commit timestamp and queue the entries for writing.
	waitForWrite, err := txn.commitAndSend()
	if err != nil {
		return err
	}

	// If batchSet failed, LSM would not have been updated. So, no need to
	// rollback anything.
	return waitForWrite()
}

`txn.commitPrecheck()` 方法

// commitPrecheck validates that the transaction may be committed. It fails
// when the transaction has already been discarded, or when, in managed
// mode, no commit timestamp was set and no pending entry carries an
// explicit version.
func (txn *Txn) commitPrecheck() error {
	if txn.discarded {
		return errors.New("Trying to commit a discarded txn")
	}

	// The zero-commitTs problem can only arise in managed mode, when someone
	// uses txn.Commit instead of txn.CommitAt. In normal mode there is
	// nothing further to check.
	if !txn.db.opt.managedTxns || txn.commitTs != 0 {
		return nil
	}

	// An entry with an explicit version means the entries are not kept
	// together, so no transaction markers are added and a zero commitTs is
	// acceptable.
	for _, e := range txn.pendingWrites {
		if e.version != 0 {
			return nil
		}
	}

	// The entries are kept together, which implies transaction markers will
	// be added; in that case commitTs must not be zero.
	return errors.New("CommitTs cannot be zero. Please use commitAt instead")
}

`txn.commitAndSend()` 方法

// commitAndSend obtains a commit timestamp from the oracle, stamps and
// collects the transaction's entries, and pushes them to the DB's write
// channel. It returns a callback that waits for the write to finish and
// then marks the commit timestamp as done. It returns ErrConflict if the
// oracle detects a conflict, or the error from sendToWriteCh.
func (txn *Txn) commitAndSend() (func() error, error) {
   orc := txn.db.orc
   // Ensure that the order in which we get the commit timestamp is the same as
   // the order in which we push these updates to the write channel. So, we
   // acquire a writeChLock before getting a commit timestamp, and only release
   // it after pushing the entries to it.
   orc.writeChLock.Lock()
   defer orc.writeChLock.Unlock()

   // Obtain the commit timestamp.
   commitTs, conflict := orc.newCommitTs(txn)
   // Abort on conflict.
   if conflict {
      return nil, ErrConflict
   }

   keepTogether := true
   // setVersion stamps an entry with commitTs when it has no version of its
   // own; an entry that already carries a version turns keepTogether off.
   setVersion := func(e *Entry) {
      if e.version == 0 {
         e.version = commitTs
      } else {
         keepTogether = false
      }
   }
   // Assign versions to the pending writes.
   for _, e := range txn.pendingWrites {
      setVersion(e)
   }
   // The duplicateWrites slice will be non-empty only if there are duplicate
   // entries with different versions.
   for _, e := range txn.duplicateWrites {
      setVersion(e)
   }

   // Collect pendingWrites and duplicateWrites into one slice; the +1 leaves
   // room for the extra entry appended below when keepTogether is true.
   entries := make([]*Entry, 0, len(txn.pendingWrites)+len(txn.duplicateWrites)+1)

   // processEntry finalizes one entry and appends it to entries.
   processEntry := func(e *Entry) {
      // Suffix the keys with commit ts, so the key versions are sorted in
      // descending order of commit timestamp.
      e.Key = y.KeyWithTs(e.Key, e.version)
      // Add bitTxn only if these entries are part of a transaction. We
      // support SetEntryAt(..) in managed mode which means a single
      // transaction can have entries with different timestamps. If entries
      // in a single transaction have different timestamps, we don't add the
      // transaction markers.
      if keepTogether {
         e.meta |= bitTxn
      }
      entries = append(entries, e)
   }

   // The following debug information is what led to determining the cause of
   // bank txn violation bug, and it took a whole bunch of effort to narrow it
   // down to here. So, keep this around for at least a couple of months.
   // var b strings.Builder
   // fmt.Fprintf(&b, "Read: %d. Commit: %d. reads: %v. writes: %v. Keys: ",
   //     txn.readTs, commitTs, txn.reads, txn.conflictKeys)
   // Finalize and collect the pending writes.
   for _, e := range txn.pendingWrites {
      processEntry(e)
   }
   // Finalize and collect the duplicate writes.
   for _, e := range txn.duplicateWrites {
      processEntry(e)
   }
   // When the entries are kept together, append one extra entry keyed by
   // txnKey with meta bitFinTxn (presumably the end-of-transaction marker;
   // the author defers discussing it).
   if keepTogether {
      // CommitTs should not be zero if we're inserting transaction markers.
      y.AssertTrue(commitTs != 0)
      e := &Entry{
         Key:   y.KeyWithTs(txnKey, commitTs),
         Value: []byte(strconv.FormatUint(commitTs, 10)),
         meta:  bitFinTxn,
      }
      entries = append(entries, e)
   }
   // Hand the whole batch to the DB's write channel; the write itself
   // happens asynchronously.
   req, err := txn.db.sendToWriteCh(entries)
   if err != nil {
      orc.doneCommit(commitTs)
      return nil, err
   }
   // ret is returned to the caller, which runs it at the end of Commit.
   ret := func() error {
      // Wait for the write request to complete.
      err := req.Wait()
      // Wait before marking commitTs as done.
      // We can't defer doneCommit above, because it is being called from a
      // callback here.
      orc.doneCommit(commitTs)
      return err
   }
   return ret, nil
}

`oracle.newCommitTs()`方法

// newCommitTs returns the commit timestamp for txn and whether a conflict
// was detected. On conflict it returns (0, true). In non-managed mode the
// timestamp is taken from nextTxnTs, which is then incremented; in managed
// mode txn.commitTs is used as-is.
func (o *oracle) newCommitTs(txn *Txn) (uint64, bool) {
   o.Lock()
   defer o.Unlock()
   // Check the keys this txn read against the write sets of transactions
   // committed after its read timestamp.
   if o.hasConflict(txn) {
      return 0, true
   }

   var ts uint64
   if !o.isManaged {
      // Mark this transaction's read as done.
      o.doneRead(txn)
      // Prune committed-transaction records that are no longer needed.
      o.cleanupCommittedTransactions()

      // This is the general case, when user doesn't specify the read and commit ts.
      // Read timestamps are nextTxnTs-1 (see readTs); the commit timestamp
      // is nextTxnTs itself.
      ts = o.nextTxnTs
      o.nextTxnTs++
      // Register the commit on txnMark; readTs waits on txnMark so that
      // new readers do not miss a commit that is still being written.
      o.txnMark.Begin(ts)

   } else {
      // If commitTs is set, use it instead.
      ts = txn.commitTs
   }

   y.AssertTrue(ts >= o.lastCleanupTs)

   // Conflict detection bookkeeping.
   if o.detectConflicts {
      // We should ensure that txns are not added to o.committedTxns slice when
      // conflict detection is disabled otherwise this slice would keep growing.
      // Record this commit so later committers can check their reads
      // against the keys it wrote.
      o.committedTxns = append(o.committedTxns, committedTxn{
         ts:           ts, // commit timestamp
         conflictKeys: txn.conflictKeys, // fingerprints of the keys this txn wrote
      })
   }

   return ts, false
}

`oracle.hasConflict()` 方法

// hasConflict reports whether any key read by txn was written by a
// transaction that committed after txn's read timestamp.
// hasConflict must be called while having a lock.
func (o *oracle) hasConflict(txn *Txn) bool {
	// A transaction that read nothing cannot conflict.
	if len(txn.reads) == 0 {
		return false
	}

	for i := range o.committedTxns {
		other := &o.committedTxns[i]

		// If other.ts is not greater than txn.readTs, that commit finished
		// before the current transaction started, so there is nothing to
		// check. This assumes linearizability. Lack of linearizability
		// could cause the read ts of a new txn to be lower than the commit
		// ts of a txn before it (@mrjn).
		if other.ts <= txn.readTs {
			continue
		}

		// Any overlap between what txn read and what the other transaction
		// wrote is a conflict.
		for _, fp := range txn.reads {
			if _, clash := other.conflictKeys[fp]; clash {
				return true
			}
		}
	}
	return false
}

`oracle.doneRead()` 方法

// doneRead marks txn's read timestamp as done on the read watermark. The
// txn.doneRead flag ensures this happens at most once per transaction.
func (o *oracle) doneRead(txn *Txn) {
	if txn.doneRead {
		return
	}
	txn.doneRead = true
	// Tell readMark that this read has finished.
	o.readMark.Done(txn.readTs)
}

// Done reports that index has finished by sending a done mark on the mark
// channel.
func (w *WaterMark) Done(index uint64) {
	finished := mark{done: true, index: index}
	w.markCh <- finished
}

`oracle.cleanupCommittedTransactions()` 方法

// cleanupCommittedTransactions drops every committedTxns record whose
// commit timestamp is at or below the current maximum read timestamp.
// Must be called under o.Lock.
func (o *oracle) cleanupCommittedTransactions() {
	// When detectConflicts is set to false, we do not store any
	// committedTxns and so there's nothing to clean up.
	if !o.detectConflicts {
		return
	}

	// Same logic as discardAtOrBelow but unlocked: managed mode uses
	// discardTs, otherwise the read watermark is used.
	maxReadTs := o.discardTs
	if !o.isManaged {
		maxReadTs = o.readMark.DoneUntil()
	}
	y.AssertTrue(maxReadTs >= o.lastCleanupTs)

	// do not run clean up if the maxReadTs (read timestamp of the
	// oldest transaction that is still in flight) has not increased
	if maxReadTs == o.lastCleanupTs {
		return
	}
	o.lastCleanupTs = maxReadTs

	// Compact in place, keeping only the commits newer than maxReadTs;
	// anything at or below it is skipped by hasConflict for every
	// transaction still reading.
	kept := 0
	for _, c := range o.committedTxns {
		if c.ts > maxReadTs {
			o.committedTxns[kept] = c
			kept++
		}
	}
	o.committedTxns = o.committedTxns[:kept]
}

`db.sendToWriteCh()` 方法

异步写入

// sendToWriteCh packages entries into a single request and pushes it onto
// the DB's write channel, returning the request so the caller can wait on
// it. It returns ErrBlockedWrites when writes are blocked and ErrTxnTooBig
// when the batch reaches the configured count or size limit.
func (db *DB) sendToWriteCh(entries []*Entry) (*request, error) {
   if atomic.LoadInt32(&db.blockWrites) == 1 {
      return nil, ErrBlockedWrites
   }
   var count, size int64
   // Walk the entries, accumulating their estimated size and their count.
   for _, e := range entries {
      size += e.estimateSizeAndSetThreshold(db.valueThreshold())
      count++
   }
   // Enforce the maximum size of a single transaction's batch.
   if count >= db.opt.maxBatchCount || size >= db.opt.maxBatchSize {
      return nil, ErrTxnTooBig
   }

   // We can only service one request because we need each txn to be stored in a contigous section.
   // Txns should not interleave among other txns or rewrites.
   req := requestPool.Get().(*request)
   req.reset()
   req.Entries = entries
   req.Wg.Add(1)
   req.IncrRef()     // for db write
   // Send the whole transaction as one request on the write channel; the
   // actual write is performed by the consumer of that channel.
   db.writeCh <- req // Handled in doWrites.
   y.NumPutsAdd(db.opt.MetricsEnabled, int64(len(entries)))

   return req, nil
}

`watermark.process()` 方法

process 用于处理 mark 通道。它不是线程安全的，因此只能启动一个 goroutine 来运行 process。一个就足够了，因为其中的所有操作都只涉及内存和 CPU。每个索引都必须按顺序至少发出一次 begin 水位标记，否则等待者可能会被无限期阻塞。

示例：水位在 100，有一个等待者在 101 上等待。如果索引 101 始终没有发出水位标记，等待者就会无限期卡住，因为它无法判断 101 对应的任务是决定不发出水位标记，还是尚未被调度。

整个方法使用cas和局部变量，最终保证了原子性

// process is used to process the Mark channel. This is not thread-safe,
// so only run one goroutine for process. One is sufficient, because
// all goroutine ops use purely memory and cpu.
// Each index has to emit atleast one begin watermark in serial order otherwise waiters
// can get blocked idefinitely. Example: We had an watermark at 100 and a waiter at 101,
// if no watermark is emitted at index 101 then waiter would get stuck indefinitely as it
// can't decide whether the task at 101 has decided not to emit watermark or it didn't get
// scheduled yet.

// process is the single consumer of w.markCh. It keeps a min-heap of the
// indices seen so far and a per-index count of pending work, advances
// w.doneUntil past every index whose count has dropped to zero or below,
// and closes the channels of waiters whose index the watermark has reached.
// It returns when closer signals shutdown, calling closer.Done() on exit.
func (w *WaterMark) process(closer *z.Closer) {
   defer closer.Done()
   // Min-heap of indices that have been seen and not yet passed.
   var indices uint64Heap
   // pending maps raft proposal index to the number of pending mutations for this proposal.
   // In other words: per index, begins minus dones.
   pending := make(map[uint64]int)
   // Waiter channels per index; several waiters may wait on the same index.
   // They are registered through WaitForMark (called from oracle.readTs).
   waiters := make(map[uint64][]chan struct{})
   // Initialize the heap.
   heap.Init(&indices)

   // processOne applies a single begin (done=false) or done (done=true)
   // mark for index and then advances the watermark as far as possible.
   processOne := func(index uint64, done bool) {
      // If not already done, then set. Otherwise, don't undo a done entry.
      // Look up the current pending count for this index.
      prev, present := pending[index]
      // First time this index is seen: add it to the heap.
      if !present {
         heap.Push(&indices, index)
      }

      delta := 1
      // A begin mark counts +1, a done mark counts -1.
      if done {
         delta = -1
      }
      // Update the pending count for this index.
      pending[index] = prev + delta

      // Update mark by going through all indices in order; and checking if they have
      // been done. Stop at the first index, which isn't done.
      // Current watermark.
      doneUntil := w.DoneUntil()
      // The watermark must never be ahead of an index that is still being
      // processed; fail the assertion if it is.
      if doneUntil > index {
         AssertTruef(false, "Name: %s doneUntil: %d. Index: %d", w.Name, doneUntil, index)
      }

      until := doneUntil
      loops := 0
      // Pop indices from the heap in ascending order for as long as the
      // smallest one is finished.
      for len(indices) > 0 {
         min := indices[0]
         // A positive count means the smallest index still has pending
         // work, so the watermark cannot move past it.
         if done := pending[min]; done > 0 {
            break // len(indices) will be > 0.
         }
         // Even if done is called multiple times causing it to become
         // negative, we should still pop the index.
         heap.Pop(&indices)
         delete(pending, min)
         // Advance the watermark to this index.
         until = min
         loops++
      }
      // Publish the new watermark if it moved.
      if until != doneUntil {
         // Only this goroutine updates doneUntil, so the CAS is asserted
         // to succeed.
         AssertTrue(atomic.CompareAndSwapUint64(&w.doneUntil, doneUntil, until))
      }
      // notifyAndRemove wakes every waiter registered for idx and forgets
      // the index.
      notifyAndRemove := func(idx uint64, toNotify []chan struct{}) {
         // Closing a channel releases the goroutine blocked on it.
         for _, ch := range toNotify {
            close(ch)
         }
         // Drop the index from the waiters map.
         delete(waiters, idx) // Release the memory back.
      }
      // Pick the cheaper way to find the waiters to wake: walk the index
      // range when it is no larger than the waiter map, otherwise walk the
      // waiter map.
      if until-doneUntil <= uint64(len(waiters)) {
         // Issue #908 showed that if doneUntil is close to 2^60, while until is zero, this loop
         // can hog up CPU just iterating over integers creating a busy-wait loop. So, only do
         // this path if until - doneUntil is less than the number of waiters.
         // Walk from the old watermark (exclusive) to the new one (inclusive).
         for idx := doneUntil + 1; idx <= until; idx++ {
            // Wake the waiters registered for this index, if any.
            if toNotify, ok := waiters[idx]; ok {
               notifyAndRemove(idx, toNotify)
            }
         }
      } else {
         for idx, toNotify := range waiters {
            if idx <= until {
               notifyAndRemove(idx, toNotify)
            }
         }
      } // end of notifying waiters.
   }

   // Main loop: handle shutdown and incoming marks.
   for {
      select {
      // Shutdown requested.
      case <-closer.HasBeenClosed():
         return
      // Next mark from the mark channel (capacity 100, see Init).
      case mark := <-w.markCh:
         // A non-nil waiter means this mark is a wait request from
         // WaitForMark.
         if mark.waiter != nil {
            // Load the current watermark.
            doneUntil := atomic.LoadUint64(&w.doneUntil)
            // The watermark already covers the requested index: release
            // the waiter immediately.
            if doneUntil >= mark.index {
               close(mark.waiter)
            } else {
               // The requested index is still ahead of the watermark, so
               // earlier work is unfinished; park the waiter until the
               // watermark reaches its index.
               ws, ok := waiters[mark.index]
               if !ok {
                  // First waiter on this index: start a new slice.
                  waiters[mark.index] = []chan struct{}{mark.waiter}
               } else {
                  // Other waiters are already parked on this index: append.
                  waiters[mark.index] = append(ws, mark.waiter)
               }
            }
         // Marks sent by Begin and Done carry no waiter.
         } else {
            // Index 0 is skipped.
            if mark.index > 0 {
               // mark.done is false for a begin mark and true for a done
               // mark.
               processOne(mark.index, mark.done)
            }
            // Apply the same begin/done to every index in the batch.
            for _, index := range mark.indices {
               processOne(index, mark.done)
            }
         }
      }
   }
}

总结

1. 检查事务是否关闭
2. 创建一个事务对象
  1. 如果是一个读写事务
    1. 创建一个存储哪些key存在冲突的记录map
    2. 记录在当前事务上发生写入的key的列表
    3. 分配一个事务读取时间戳
      1. 读时间戳取 nextTxnTs - 1
      2. 然后标记为开始,记录最后的事务时间戳后发送给一个markChan
      3. 等待与这个时间戳冲突的事务提交完成
        1. 获取最后一个已提交的时间戳
        2. 创建一个用于wait回调的chan
        3. 等待waitChan的回调，或者上下文的取消
3. Defer 丢弃最终的事务
  1. 标记readTs时间戳已经完成
4. 执行闭包函数
  1. Set kv
    1. 检查kv的合法性
    2. 检查当前事务的执行的命令数量以及存储大小是否超过阈值
    3. 按照key计算一个hash值，然后加入冲突检查map中
    4. 如果这个key 在当前事务中被写入过，并且与之前的版本不同，则计入重复写入的数组中
    5. 将该key的写入操作记录到pending数组里
  2. Get k
    1. 如果这是一个读写事务
      1. 如果在pending数组里存在一个key，并且没有过期 则复制数据并返回
5. 提交事务
  1. 检查 pending数组是否为空，如果为空则直接返回
  2. 事务提交的前置检查
  3. 提交并发送日志返回一个回调函数进行等待
    1. 获取一把锁
    2. 创建一个提交时间戳
      1. 是否存在冲突
        1. 如果读时间戳大于已提交的时间戳则忽略
        2. 读时间戳小于已提交的时间戳则判断是否存在读后写的情况存在就冲突
      2. 拿到当前事务的readTs标记其为完成
      3. 清理已经提交的事务的记录数组
      4. 标记当前的事务开始进行提交，分配了一个新的提交时间戳
      5. 将当前提交事务的时间戳和冲突的key组成对象记录在已提交事务的数组中
    3. 遍历每个在当前事务中写入的key，为其分配唯一的版本号
    4. 遍历pending数组处理每一个实体kv对象
    5. 追加一条标记事务结束的内部kv实体
      1. 创建一个写入请求
      2. 发送给写者channel中
      3.  返回一个回调函数
        1. 等待批量写请求处理完成
        2. 标记事务已经提交完成
        3. 将buf写入到mmap关联的文件内存中
      4. 写入磁盘
        1. 如果值日志offset大于vlog最大文件大小，或者写入条数超过阈值
          1. 写入磁盘 如果设置了同步则会系统调用sync 最后根据offset进行截断
          2. 创建一个新的vlog文件

Badger源码导读(二) – 读写事务最先出现在枫阿雨's blog。

Badger源码导读(一) – DB初始化

crazyfay — Sun, 04 Dec 2022 04:53:21 +0000

Badger源码导读

源码分析入口基准案例

先从Badger的基本使用入手

// main walks through Badger's basic API: open a DB, run a read-write
// transaction, run a read-only transaction, iterate over all keys, and
// trigger a value-log GC. Unexpected errors abort the program instead of
// being silently dropped.
func main() {
	// Open the DB. Stop on failure: the returned handle must not be used
	// (or closed) when Open reports an error.
	db, err := badger.Open(badger.DefaultOptions("tmp/badger"))
	if err != nil {
		panic(err)
	}
	defer db.Close()

	// Read-write transaction: returning a non-nil error from the closure
	// makes Update skip the commit.
	err = db.Update(func(txn *badger.Txn) error {
		if err := txn.Set([]byte("answer"), []byte("42")); err != nil {
			return err
		}
		_, err := txn.Get([]byte("answer"))
		return err
	})
	if err != nil {
		panic(err)
	}

	// Read-only transaction.
	err = db.View(func(txn *badger.Txn) error {
		// "answer_v1" is never written by this program, so ErrKeyNotFound
		// is the expected outcome; only other errors are reported.
		if _, err := txn.Get([]byte("answer_v1")); err != nil && err != badger.ErrKeyNotFound {
			return err
		}
		return nil
	})
	if err != nil {
		panic(err)
	}

	// Iterate over the keys.
	err = db.View(func(txn *badger.Txn) error {
		opts := badger.DefaultIteratorOptions
		opts.PrefetchSize = 10
		it := txn.NewIterator(opts)
		defer it.Close()
		for it.Rewind(); it.Valid(); it.Next() {
			item := it.Item()
			k := item.Key()
			err := item.Value(func(val []byte) error {
				fmt.Printf("key=%s, value=%s\n", k, val)
				return nil
			})
			if err != nil {
				return err
			}
		}
		return nil
	})
	if err != nil {
		panic(err)
	}

	// ErrNoRewrite only means the GC found nothing worth rewriting; it is
	// not a failure.
	if err := db.RunValueLogGC(0.7); err != nil && err != badger.ErrNoRewrite {
		panic(err)
	}
}

DB初始化过程

初始化参数

badger.Open() 传入的是一个 Options,先看一下 Options 结构体的字段都有哪些

// Options is the configuration passed to badger.Open. The notes on
// individual fields below are the article author's annotations.
type Options struct {
    // Required options.

    // Dir: Badger separates keys from values; Dir holds the keys together with
    // the logical pointers to their values.
    // ValueDir: holds the value log, i.e. where the values themselves live.
    // By default Dir and ValueDir are the same path.
    Dir      string
    ValueDir string

    // Usually modified options.

    // SyncWrites: synchronous writes, i.e. flush to disk as part of the write
    // (mmap alone does not flush immediately).
    SyncWrites        bool
    NumVersionsToKeep int
    // ReadOnly: open the DB in read-only mode.
    ReadOnly          bool
    // Logger: the logger instance.
    Logger            Logger
    // Compression: the compression type to use (see options.CompressionType).
    Compression       options.CompressionType
    // InMemory: keep everything in memory only.
    InMemory          bool
    MetricsEnabled    bool
    // Sets the Stream.numGo field
    NumGoroutines int

    // Fine tuning options.

    // MemTableSize: size limit of a memtable.
    MemTableSize        int64
    BaseTableSize       int64
    BaseLevelSize       int64
    LevelSizeMultiplier int
    TableSizeMultiplier int
    // MaxLevels: maximum number of LSM-tree levels, i.e. L0 through L(MaxLevels-1).
    MaxLevels           int

    VLogPercentile float64
    // ValueThreshold: value size threshold; a value no larger than this is not
    // separated from its key.
    // This is a practical trade-off: key-value separation causes unavoidable
    // read amplification (two random reads: one in the LSM tree for the
    // pointer, then one in the value log for the value).
    ValueThreshold int64
    // NumMemtables: number of memtables.
    NumMemtables   int
    // Changing BlockSize across DB runs will not break badger. The block size is
    // read from the block index stored at the end of the table.
    // BlockSize: size of each block (an SST is made of blocks, an index, etc.).
    BlockSize          int
    // BloomFalsePositive: false-positive rate of the bloom filter.
    BloomFalsePositive float64
    // BlockCacheSize: size of the block cache.
    BlockCacheSize     int64
    // IndexCacheSize: size of the index cache.
    IndexCacheSize     int64

    NumLevelZeroTables      int
    NumLevelZeroTablesStall int

    // ValueLogFileSize: maximum size of a value log file.
    ValueLogFileSize   int64
    // ValueLogMaxEntries: maximum number of key-value pairs in a value log file.
    ValueLogMaxEntries uint32

    // NumCompactors: maximum number of compaction goroutines running concurrently.
    NumCompactors        int
    CompactL0OnClose     bool
    LmaxCompaction       bool
    ZSTDCompressionLevel int

    // When set, checksum will be validated for each entry read from the value log file.
    VerifyValueChecksum bool

    // Encryption related options.
    // EncryptionKey: the key used for encryption.
    EncryptionKey                 []byte        // encryption key
    // EncryptionKeyRotationDuration: how long an encryption key is used before
    // it is rotated.
    EncryptionKeyRotationDuration time.Duration // key rotation duration

    // BypassLockGuard will bypass the lock guard on badger. Bypassing lock
    // guard can cause data corruption if multiple badger instances are using
    // the same directory. Use this options with caution.
    BypassLockGuard bool

    // ChecksumVerificationMode decides when db should verify checksums for SSTable blocks.
    ChecksumVerificationMode options.ChecksumVerificationMode

    // DetectConflicts determines whether the transactions would be checked for
    // conflicts. The transactions can be processed at a higher rate when
    // conflict detection is disabled.
    DetectConflicts bool

    // NamespaceOffset specifies the offset from where the next 8 bytes contains the namespace.
    NamespaceOffset int

    // Transaction start and commit timestamps are managed by end-user.
    // This is only useful for databases built on top of Badger (like Dgraph).
    // Not recommended for most users.
    managedTxns bool

    // 4. Flags for testing purposes
    // ------------------------------
    // Batch-related limits.
    maxBatchCount int64 // max entries in batch
    maxBatchSize  int64 // max batch size in bytes

    maxValueThreshold float64
}

传入指定的路径，并使用默认的配置信息；如果有需要更改的配置，可以使用 WithX() 方法（此处使用了建造者模式）

badger.DefaultOptions("tmp/badger")

// DefaultOptions returns the recommended baseline configuration, placing both
// the LSM tree files and the value log under path. Individual settings can be
// adjusted afterwards with the WithX methods.
func DefaultOptions(path string) Options {
    defaults := Options{
        // Storage locations: keys and values share one directory by default.
        Dir:      path,
        ValueDir: path,

        // LSM tree sizing.
        MemTableSize:            64 << 20,
        NumMemtables:            5,
        BaseTableSize:           2 << 20,
        BaseLevelSize:           10 << 20,
        TableSizeMultiplier:     2,
        LevelSizeMultiplier:     10,
        MaxLevels:               7,
        NumLevelZeroTables:      5,
        NumLevelZeroTablesStall: 15,
        NumVersionsToKeep:       1,

        // Table layout, caches and compression.
        BlockSize:          4 * 1024,
        BloomFalsePositive: 0.01,
        BlockCacheSize:     256 << 20,
        IndexCacheSize:     0,
        Compression:        options.Snappy,
        // The following benchmarks were done on a 4 KB block size (default block size). The
        // compression ratio is supposed to increase with increasing compression level but since
        // the input for compression algorithm is small (4 KB), we don't get significant benefit
        // at level 3.
        // NOTE: The benchmarks are with DataDog ZSTD that requires CGO. Hence, no longer valid.
        // no_compression-16              10     502848865 ns/op     165.46 MB/s    -
        // zstd_compression/level_1-16     7     739037966 ns/op     112.58 MB/s    2.93
        // zstd_compression/level_3-16     7     756950250 ns/op     109.91 MB/s    2.72
        // zstd_compression/level_15-16    1    11135686219 ns/op      7.47 MB/s    4.38
        // Benchmark code can be found in table/builder_test.go file
        ZSTDCompressionLevel: 1,

        // Background work.
        // Run at least 2 compactors. Zero-th compactor prioritizes L0.
        NumCompactors:    4,
        NumGoroutines:    8,
        CompactL0OnClose: false,

        // Value log.
        // One less than 1<<30 so that 2*ValueLogFileSize stays below max int32
        // and does not overflow on 32-bit systems.
        ValueLogFileSize:    1<<30 - 1,
        ValueLogMaxEntries:  1000000,
        VLogPercentile:      0.0,
        ValueThreshold:      maxValueThreshold,
        VerifyValueChecksum: false,

        // Durability, transactions and observability.
        SyncWrites:      false,
        DetectConflicts: true,
        NamespaceOffset: -1,
        MetricsEnabled:  true,
        Logger:          defaultLogger(INFO),

        // Encryption: empty key, with a rotation period of 10 days.
        EncryptionKey:                 []byte{},
        EncryptionKeyRotationDuration: 10 * 24 * time.Hour,
    }
    return defaults
}

Open函数(核心)

badger.Open(opt) 函数

此方法代码过长,在此只保留核心部分代码,部分逻辑将以伪代码或注释表示,并省去部分错误处理逻辑

// Open builds a *DB from opt: it locks the data directories, opens the manifest,
// sets up caches, memtables, the levels controller and the value log, starts the
// background goroutines, and returns the ready database.
//
// NOTE(review): this is an abridged listing. Many error results are discarded
// with `_`, and the `err` used by the deferred cleanup and the later `err = ...`
// assignments is not declared at function scope here (the only `var err error`
// lives inside the `if !opt.InMemory` block).
func Open(opt Options) (*DB, error) {

    // Validate the options (and fill in derived values).
    checkAndSetOptions(&opt)

    // Lock guards for Dir and ValueDir; they stay nil unless acquired below.
    var dirLockGuard, valueDirLockGuard *directoryLockGuard

    // Create directories and acquire lock on it only if badger is not running in InMemory mode.
    // We don't have any directories/files in InMemory mode so we don't need to acquire
    // any locks on them.
    // Only touch the filesystem when NOT running in in-memory mode.
    if !opt.InMemory {
        // Create the directories.
        createDirs(opt)

        var err error
        if !opt.BypassLockGuard {
            // Take the directory lock on Dir.
            dirLockGuard, _ = acquireDirectoryLock(opt.Dir, lockFile, opt.ReadOnly)
            // Release the lock when Open returns, unless the guard variable has
            // been set to nil by then (which the success path does at the end).
            defer func() {
                if dirLockGuard != nil {
                    _ = dirLockGuard.release()
                }
            }()
            // Absolute path of Dir (keys and value pointers).
            absDir, _ := filepath.Abs(opt.Dir)

            // Absolute path of ValueDir (value log).
            absValueDir, _ := filepath.Abs(opt.ValueDir)

            // ValueDir needs its own lock only when it differs from Dir.
            if absValueDir != absDir {
                // Take the directory lock on ValueDir.
                valueDirLockGuard, _ = acquireDirectoryLock(opt.ValueDir, lockFile, opt.ReadOnly)

                // Same deferred release as above, skipped once the variable is nil.
                defer func() {
                    if valueDirLockGuard != nil {
                        _ = valueDirLockGuard.release()
                    }
                }()
            }
        }
    }

    // Open the MANIFEST file, creating it if it does not exist yet.
    manifestFile, manifest, _ := openOrCreateManifestFile(opt)

    // Close the manifest when Open returns, unless manifestFile has been set to
    // nil by then (which the success path does at the end).
    defer func() {
        if manifestFile != nil {
            _ = manifestFile.close()
        }
    }()

    // Build the in-memory DB structure.
    db := &DB{
        // Memtables that are no longer written to; capacity is NumMemtables.
        imm:              make([]*memTable, 0, opt.NumMemtables),
        // Channel carrying flush requests.
        flushChan:        make(chan flushTask, opt.NumMemtables),
        // Channel carrying write requests.
        writeCh:          make(chan *request, kvWriteChCapacity),
        // The options in effect.
        opt:              opt,
        // The manifest file handle opened above.
        manifest:         manifestFile,
        // Lock guard for Dir (keys and value pointers).
        dirLockGuard:     dirLockGuard,
        // Lock guard for ValueDir.
        valueDirGuard:    valueDirLockGuard,
        // Oracle: the concurrent-transaction manager. Per the article it hands out
        // transaction versions used for MVCC; covered with read-write transactions.
        orc:              newOracle(opt),
        pub:              newPublisher(),
        allocPool:        z.NewAllocatorPool(8),
        bannedNamespaces: &lockedKeys{keys: make(map[uint64]struct{})},
        threshold:        initVlogThreshold(&opt),
    }
    // Cleanup all the goroutines started by badger in case of an error.
    // Deferred hook: on a non-nil err, log it, run db.cleanup and drop db.
    defer func() {
        if err != nil {
            opt.Errorf("Received err: %v. Cleaning up...", err)
            db.cleanup()
            db = nil
        }
    }()

    // Block cache setup.
    // Per the article: SSTable data is split into blocks, and with the block cache
    // enabled recently used hot blocks are kept in memory to speed up responses.
    if opt.BlockCacheSize > 0 {
        // The cache itself is outside the scope of this walkthrough; it does not
        // affect the core logic. The cache library used here is ristretto.
        numInCache := opt.BlockCacheSize / int64(opt.BlockSize)
        if numInCache == 0 {
            // Make the value of this variable at least one since the cache requires
            // the number of counters to be greater than zero.
            numInCache = 1
        }

        config := ristretto.Config{
            NumCounters: numInCache * 8,
            MaxCost:     opt.BlockCacheSize,
            BufferItems: 64,
            Metrics:     true,
            OnExit:      table.BlockEvictHandler,
        }
        db.blockCache, err = ristretto.NewCache(&config)
        if err != nil {
            return nil, y.Wrap(err, "failed to create data cache")
        }
    }

    // Index cache setup.
    // Per the article: each SSTable carries an index (metadata) block recording key
    // offsets, which allows binary search to locate a key's table and offset.
    if opt.IndexCacheSize > 0 {
        // Index size is around 5% of the table size.
        indexSz := int64(float64(opt.MemTableSize) * 0.05)
        numInCache := opt.IndexCacheSize / indexSz
        if numInCache == 0 {
            // Make the value of this variable at least one since the cache requires
            // the number of counters to be greater than zero.
            numInCache = 1
        }

        config := ristretto.Config{
            NumCounters: numInCache * 8,
            MaxCost:     opt.IndexCacheSize,
            BufferItems: 64,
            Metrics:     true,
        }
        db.indexCache, err = ristretto.NewCache(&config)
        if err != nil {
            return nil, y.Wrap(err, "failed to create bf cache")
        }
    }

    // Start the cache monitoring goroutine.
    db.closers.cacheHealth = z.NewCloser(1)
    go db.monitorCache(db.closers.cacheHealth)

    // In-memory mode overrides.
    if db.opt.InMemory {
        // Synchronous writes are turned off.
        db.opt.SyncWrites = false
        // If badger is running in memory mode, push everything into the LSM Tree.
        // Raising the threshold to MaxInt32 keeps all values inside the LSM tree.
        db.opt.ValueThreshold = math.MaxInt32
    }

    // Open the key registry, configured from the encryption key settings.
    krOpt := KeyRegistryOptions{
        ReadOnly:                      opt.ReadOnly,
        Dir:                           opt.Dir,
        EncryptionKey:                 opt.EncryptionKey,
        EncryptionKeyRotationDuration: opt.EncryptionKeyRotationDuration,
        InMemory:                      opt.InMemory,
    }
    db.registry, _ = OpenKeyRegistry(krOpt)

    // Compute size statistics once, then keep them updated in the background.
    db.calculateSize()
    db.closers.updateSize = z.NewCloser(1)
    go db.updateSize(db.closers.updateSize)

    // Open the memtables that already exist on disk (the .mem files).
    if err := db.openMemTables(db.opt); err != nil {
        return nil, y.Wrapf(err, "while opening memtables")
    }
    // A writable DB also needs a fresh active memtable.
    if !db.opt.ReadOnly {
        // Creates a new .mem file; per the article the .mem file is the
        // write-ahead log (WAL) of the LSM tree.
        if db.mt, err = db.newMemTable(); err != nil {
            return nil, y.Wrapf(err, "cannot create memtable")
        }
    }

    // newLevelsController potentially loads files in directory.
    // Create the in-memory levels controller. The LSM tree is layered and the
    // controller maintains the whole level structure (compaction etc.), seeded
    // from the manifest. Put differently, per the article the manifest file is the
    // on-disk form of the controller's state, which speeds up restart/recovery.
    // Opening it opens the SSTables listed in the manifest.
    if db.lc, err = newLevelsController(db, &manifest); err != nil {
        return db, err
    }

    // Initialize vlog struct.
    db.vlog.init(db)

    if !opt.ReadOnly {
        // Start the compaction workers (covered later in the article).
        db.closers.compactors = z.NewCloser(1)
        db.lc.startCompact(db.closers.compactors)

        db.closers.memtable = z.NewCloser(1)
        go func() {
            _ = db.flushMemtable(db.closers.memtable) // Need levels controller to be up.
        }()
        // Flush them to disk asap.
        for _, mt := range db.imm {
            db.flushChan <- flushTask{mt: mt}
        }
    }
    // We do increment nextTxnTs below. So, no need to do it here.
    // Seed nextTxnTs with the largest version (timestamp) found at startup.
    db.orc.nextTxnTs = db.MaxVersion()
    db.opt.Infof("Set nextTxnTs to %d", db.orc.nextTxnTs)

    // Actually open the value log files.
    if err = db.vlog.open(db); err != nil {
        return db, y.Wrapf(err, "During db.vlog.open")
    }

    // Let's advance nextTxnTs to one more than whatever we observed via
    // replaying the logs.
    // Mark nextTxnTs as done on the transaction watermark.
    db.orc.txnMark.Done(db.orc.nextTxnTs)
    // In normal mode, we must update readMark so older versions of keys can be removed during
    // compaction when run in offline mode via the flatten tool.
    db.orc.readMark.Done(db.orc.nextTxnTs)
    // Advance the next transaction timestamp.
    db.orc.incrementNextTs()

    // Background listener for value threshold updates.
    go db.threshold.listenForValueThresholdUpdate()

    // Load the banned namespaces from the database into memory (not a focus here).
    if err := db.initBannedNamespaces(); err != nil {
        return db, errors.Wrapf(err, "While setting banned keys")
    }

    // Start the goroutine that services write requests. Per the article, writes
    // are handled concurrently to make good use of SSD performance.
    db.closers.writes = z.NewCloser(1)
    go db.doWrites(db.closers.writes)

    if !db.opt.InMemory {
        // Start the value log GC waiter (explained later in the article).
        db.closers.valueGC = z.NewCloser(1)
        go db.vlog.waitOnGC(db.closers.valueGC)
    }

    // Publisher listener goroutine (not a focus here).
    db.closers.pub = z.NewCloser(1)
    go db.pub.listenForUpdates(db.closers.pub)

    // Success: nil out the local guards so the deferred functions above do NOT
    // release the directory locks or close the manifest; db keeps using them.
    valueDirLockGuard = nil
    dirLockGuard = nil
    manifestFile = nil
    // Hand the ready DB to the caller.
    return db, nil
}

创建Manifest文件

openOrCreateManifestFile(opt) 函数

// openOrCreateManifestFile returns the manifest file handle together with the
// manifest it describes. On-disk databases delegate to
// helpOpenOrCreateManifestFile; in-memory databases get a handle flagged
// inMemory and an empty Manifest, without touching the filesystem.
func openOrCreateManifestFile(opt Options) (ret *manifestFile, result Manifest, err error) {
    if !opt.InMemory {
        return helpOpenOrCreateManifestFile(opt.Dir, opt.ReadOnly, manifestDeletionsRewriteThreshold)
    }
    // result is still the zero Manifest here.
    ret = &manifestFile{inMemory: true}
    return ret, result, nil
}

// helpOpenOrCreateManifestFile opens the manifest file inside dir, or creates
// a fresh one when it does not exist.
//
// dir is the directory holding the manifest, readOnly forbids creating a
// missing manifest, and deletionsThreshold is stored on the returned handle as
// deletionsRewriteThreshold. It returns the file handle, the manifest contents,
// and any error from opening or creating the file.
func helpOpenOrCreateManifestFile(dir string, readOnly bool, deletionsThreshold int) (*manifestFile, Manifest, error) {
    // Build the manifest path.
    path := filepath.Join(dir, ManifestFilename)
    var flags y.Flags
    if readOnly {
        flags |= y.ReadOnly
    }
    // Try to open an existing file first.
    fp, err := y.OpenExistingFile(path, flags) // We explicitly sync in addChanges, outside the lock.
    if err != nil {
        // Anything other than "file does not exist" is a real failure.
        if !os.IsNotExist(err) {
            return nil, Manifest{}, err
        }
        // A read-only database cannot create the missing manifest.
        if readOnly {
            return nil, Manifest{}, fmt.Errorf("no manifest found, required for read-only db")
        }
        // Build an empty manifest and write it out; once helpRewrite succeeds the
        // MANIFEST file exists in dir.
        m := createManifest()
        fp, netCreations, err := helpRewrite(dir, &m)
        if err != nil {
            // Do not assert on, or wrap, the results of a failed rewrite.
            return nil, Manifest{}, err
        }

        // A brand-new manifest must not report any table creations.
        y.AssertTrue(netCreations == 0)
        // Keep a copy of the manifest on the in-memory handle.
        mf := &manifestFile{
            fp:                        fp,
            directory:                 dir,
            manifest:                  m.clone(),
            deletionsRewriteThreshold: deletionsThreshold,
        }
        return mf, m, nil
    }

    // The file exists: loading/replaying it is not expanded in this excerpt.
    ......
}

// createManifest returns an empty Manifest: no levels yet, and an empty
// Tables map. Tables is keyed by table file ID; the level a table belongs to
// is recorded in its TableManifest value.
func createManifest() Manifest {
    return Manifest{
        Levels: []levelManifest{},
        Tables: map[uint64]TableManifest{},
    }
}

打开Memtable

memtable 结构体

// memTable structure stores a skiplist and a corresponding WAL. Writes to memTable are written
// both to the WAL and the skiplist. On a crash, the WAL is replayed to bring the skiplist back to
// its pre-crash form.
type memTable struct {
    // sl is the in-memory skiplist holding the entries.
    sl         *skl.Skiplist
    // wal is the write-ahead log file backing the skiplist; it is left nil in
    // in-memory mode.
    wal        *logFile
    // maxVersion: presumably the largest entry version seen so far — TODO confirm.
    maxVersion uint64
    // opt is the copy of the DB options this memtable was opened with.
    opt        Options
    // buf is a scratch buffer; presumably reused when encoding WAL entries — TODO confirm.
    buf        *bytes.Buffer
}

openMemTables(opt) 方法

// openMemTables reopens every memtable WAL (a file ending in memFileExt) found
// in db.opt.Dir, in ascending file-ID order, and appends the non-empty ones to
// db.imm so they can be flushed. It also advances db.nextMemFid past the
// largest ID seen.
//
// The opt parameter is not consulted; the DB's own options are used. An error
// is returned when the directory cannot be listed, when a memtable file name
// does not parse as a numeric ID, or when a memtable fails to open.
func (db *DB) openMemTables(opt Options) error {
    // We don't need to open any tables in in-memory mode.
    if db.opt.InMemory {
        return nil
    }
    // List the directory. A failure here must not be mistaken for "no files".
    files, err := ioutil.ReadDir(db.opt.Dir)
    if err != nil {
        return errFile(err, db.opt.Dir, "Unable to open mem dir.")
    }

    var fids []int
    for _, file := range files {
        // Skip everything that is not a memtable file. On a first start none
        // exist yet.
        if !strings.HasSuffix(file.Name(), memFileExt) {
            continue
        }
        // The file name without its extension is the numeric file ID,
        // e.g. 000001.mem -> 1.
        fsz := len(file.Name())
        fid, err := strconv.ParseInt(file.Name()[:fsz-len(memFileExt)], 10, 64)
        if err != nil {
            // A malformed name would otherwise silently become fid 0.
            return errFile(err, file.Name(), "Unable to parse log id.")
        }
        fids = append(fids, int(fid))
    }

    // Sort in ascending order.
    sort.Ints(fids)
    for _, fid := range fids {
        flags := os.O_RDWR
        if db.opt.ReadOnly {
            flags = os.O_RDONLY
        }
        // Open the existing memtable file and rebuild its memtable.
        mt, err := db.openMemTable(fid, flags)
        if err != nil {
            return y.Wrapf(err, "while opening fid: %d", fid)
        }
        // If this memtable is empty we don't need to add it. This is a
        // memtable that was completely truncated.
        if mt.sl.Empty() {
            mt.DecrRef()
            continue
        }
        // These should no longer be written to. So, make them part of the imm.
        db.imm = append(db.imm, mt)
    }
    // Continue numbering after the largest existing file ID.
    if len(fids) != 0 {
        db.nextMemFid = fids[len(fids)-1]
    }
    db.nextMemFid++
    return nil
}

创建Memtable

newMemTable() 方法

// newMemTable creates the memtable for file ID db.nextMemFid. The file must
// not exist yet: only an openMemTable result of z.NewFile counts as success,
// in which case db.nextMemFid is advanced. Any other error is logged and
// wrapped; a nil error means the file was already there, which is reported
// as an error.
func (db *DB) newMemTable() (*memTable, error) {
    mt, err := db.openMemTable(db.nextMemFid, os.O_CREATE|os.O_RDWR)
    switch {
    case err == z.NewFile:
        // Freshly created file: this is the expected outcome.
        db.nextMemFid++
        return mt, nil
    case err != nil:
        db.opt.Errorf("Got error: %v for id: %d\n", err, db.nextMemFid)
        return nil, y.Wrapf(err, "newMemTable")
    default:
        // Opened without error, so a file with this ID already existed.
        return nil, errors.Errorf("File %s already exists", mt.wal.Fd.Name())
    }
}

openMemTable(fid, flags) 方法

// openMemTable builds the memtable for file ID fid, opening its WAL file with
// the given flags.
//
// In in-memory mode no WAL is attached and the result is (mt, z.NewFile).
// When the WAL open reports z.NewFile, that same value is returned alongside
// the memtable so callers can tell a fresh file from an existing one. For an
// existing file (nil open error) the skiplist is rebuilt via UpdateSkipList,
// and that call's error, wrapped, is returned. Any other open error yields
// (nil, wrapped error).
func (db *DB) openMemTable(fid, flags int) (*memTable, error) {
    walPath := db.mtFilePath(fid)
    skiplist := skl.NewSkiplist(arenaSize(db.opt))
    mt := &memTable{sl: skiplist, opt: db.opt, buf: &bytes.Buffer{}}

    // We don't need to create the wal for the skiplist in in-memory mode so return the mt.
    if db.opt.InMemory {
        return mt, z.NewFile
    }

    mt.wal = &logFile{
        fid:      uint32(fid),
        path:     walPath,
        registry: db.registry,
        writeAt:  vlogHeaderSize,
        opt:      db.opt,
    }
    // The file is opened with twice the memtable size.
    openErr := mt.wal.open(walPath, flags, 2*db.opt.MemTableSize)
    created := openErr == z.NewFile
    if openErr != nil && !created {
        return nil, y.Wrapf(openErr, "While opening memtable: %s", walPath)
    }

    // Have a callback set to delete WAL when skiplist reference count goes down to zero. That is,
    // when it gets flushed to L0.
    skiplist.OnClose = func() {
        if err := mt.wal.Delete(); err != nil {
            db.opt.Errorf("while deleting file: %s, err: %v", walPath, err)
        }
    }

    if created {
        // Nothing to replay from a brand-new file.
        return mt, openErr
    }
    // The open error is nil here, so the file was not newly created: rebuild
    // the skiplist from what the WAL already holds.
    return mt, y.Wrapf(mt.UpdateSkipList(), "while updating skiplist")
}

创建levelController

newLevelsController(db, mf) 函数

// newLevelsController builds the controller for all LSM levels: it creates one
// levelHandler per level and, unless the DB is in-memory, reconciles the
// directory with the manifest, opens every table listed in mf.Tables
// concurrently, assigns them to their levels, validates the result and syncs
// the directory.
func newLevelsController(db *DB, mf *Manifest) (*levelsController, error) {
    // Sanity check: the L0 stall limit must exceed the L0 table limit.
    y.AssertTrue(db.opt.NumLevelZeroTablesStall > db.opt.NumLevelZeroTables)
    // Attach the DB and allocate one handler slot per level (levels[0] is L0).
    // A levelHandler manages the tables of a single level.
    s := &levelsController{
        kv:     db,
        levels: make([]*levelHandler, db.opt.MaxLevels),
    }
    // Compaction status: a set of uint64 keys (presumably table file IDs — TODO confirm).
    s.cstatus.tables = make(map[uint64]struct{})
    // Per-level compaction status.
    s.cstatus.levels = make([]*levelCompactStatus, db.opt.MaxLevels)

    // Create a handler and a compaction status entry for every level.
    for i := 0; i < db.opt.MaxLevels; i++ {
        s.levels[i] = newLevelHandler(db, i)
        s.cstatus.levels[i] = new(levelCompactStatus)
    }
    // In-memory mode has no files to load, so we are done.
    if db.opt.InMemory {
        return s, nil
    }
    // Compare manifest against directory, check for existent/non-existent files, and remove.
    if err := revertToManifest(db, mf, getIDMap(db.opt.Dir)); err != nil {
        return nil, err
    }

    // mu guards tables, which the goroutines below append to.
    var mu sync.Mutex
    tables := make([][]*table.Table, db.opt.MaxLevels)
    var maxFileID uint64

    // We found that using 3 goroutines allows disk throughput to be utilized to its max.
    // Disk utilization is the main thing we should focus on, while trying to read the data. That's
    // the one factor that remains constant between HDD and SSD.
    // The throttle is created with a limit of 3.
    throttle := y.NewThrottle(3)

    start := time.Now()
    var numOpened int32
    // Ticker used only to log progress every 3 seconds (it is not a timeout).
    tick := time.NewTicker(3 * time.Second)
    // Stop the ticker when this function returns.
    defer tick.Stop()

    // Walk the tables recorded in the manifest, keyed by file ID.
    // On a first start Tables is empty, so the loop body never runs.
    for fileID, tf := range mf.Tables {
        fname := table.NewFilename(fileID, db.opt.Dir)
        // Non-blocking check: log progress if the ticker fired, otherwise move on.
        select {
        case <-tick.C:
            db.opt.Infof("%d tables out of %d opened in %s\n", atomic.LoadInt32(&numOpened),
                len(mf.Tables), time.Since(start).Round(time.Millisecond))
        default:
        }
        if err := throttle.Do(); err != nil {
            closeAllTables(tables)
            return nil, err
        }
        if fileID > maxFileID {
            maxFileID = fileID
        }
        // fname and tf are passed as arguments so each goroutine gets its own copy.
        go func(fname string, tf TableManifest) {
            var rerr error
            // Report this goroutine's result to the throttle and count the table.
            defer func() {
                throttle.Done(rerr)
                atomic.AddInt32(&numOpened, 1)
            }()
            dk, err := db.registry.DataKey(tf.KeyID)
            if err != nil {
                rerr = y.Wrapf(err, "Error while reading datakey")
                return
            }
            topt := buildTableOptions(db)
            // Explicitly set Compression and DataKey based on how the table was generated.
            topt.Compression = tf.Compression
            topt.DataKey = dk

            // Note: this local mf (the mmapped file) shadows the manifest parameter.
            mf, err := z.OpenMmapFile(fname, db.opt.getFileFlags(), 0)
            if err != nil {
                rerr = y.Wrapf(err, "Opening file: %q", fname)
                return
            }
            t, err := table.OpenTable(mf, topt)
            if err != nil {
                if strings.HasPrefix(err.Error(), "CHECKSUM_MISMATCH:") {
                    db.opt.Errorf(err.Error())
                    db.opt.Errorf("Ignoring table %s", mf.Fd.Name())
                    // Do not set rerr. We will continue without this table.
                } else {
                    rerr = y.Wrapf(err, "Opening table: %q", fname)
                }
                return
            }

            mu.Lock()
            tables[tf.Level] = append(tables[tf.Level], t)
            mu.Unlock()
        }(fname, tf)
    }
    // Finish the throttle; on error close whatever tables were opened.
    // (Presumably Finish waits for the outstanding goroutines — TODO confirm.)
    if err := throttle.Finish(); err != nil {
        closeAllTables(tables)
        return nil, err
    }
    db.opt.Infof("All %d tables opened in %s\n", atomic.LoadInt32(&numOpened),
        time.Since(start).Round(time.Millisecond))
    // New tables are numbered after the largest file ID seen.
    s.nextFileID = maxFileID + 1
    // Hand each level its tables.
    for i, tbls := range tables {
        s.levels[i].initTables(tbls)
    }

    // Make sure key ranges do not overlap etc.
    if err := s.validate(); err != nil {
        _ = s.cleanupLevels()
        return nil, y.Wrap(err, "Level validation")
    }

    // Sync directory (because we have at least removed some files, or previously created the
    // manifest file).
    if err := syncDir(db.opt.Dir); err != nil {
        _ = s.close()
        return nil, err
    }

    return s, nil
}

创建levelHandler

newLevelHandler(db, level) 函数

// newLevelHandler returns the handler for the given level of db. strLevel is
// the level rendered as "l<level>", e.g. "l0".
func newLevelHandler(db *DB, level int) *levelHandler {
    handler := &levelHandler{db: db, level: level}
    handler.strLevel = fmt.Sprintf("l%d", level)
    return handler
}

初始化tables

initTables(tables) 方法

// initTables replaces s.tables with given tables. This is done during loading.
// The size counters are reset and recomputed from the new tables, and the
// tables are sorted: by file ID on level 0, by smallest key on every other
// level. The handler's lock is held throughout.
func (s *levelHandler) initTables(tables []*table.Table) {
    s.Lock()
    defer s.Unlock()

    s.tables = tables
    s.totalSize, s.totalStaleSize = 0, 0
    for idx := range tables {
        s.addSize(tables[idx])
    }

    // Default ordering for levels above 0: ascending smallest key.
    less := func(a, b int) bool {
        return y.CompareKeys(s.tables[a].Smallest(), s.tables[b].Smallest()) < 0
    }
    if s.level == 0 {
        // Key range will overlap. Just sort by fileID in ascending order
        // because newer tables are at the end of level 0.
        less = func(a, b int) bool {
            return s.tables[a].ID() < s.tables[b].ID()
        }
    }
    sort.Slice(s.tables, less)
}

初始化vlog

init(db) 方法

// init initializes the value log struct. This initialization needs to happen
// before compactions start.
//
// It records the DB and its options on vlog. Outside in-memory mode it also
// sets the value log directory, creates the GC channel and loads the discard
// stats; a failure to load them is passed to y.Check.
func (vlog *valueLog) init(db *DB) {
    vlog.db, vlog.opt = db, db.opt

    // We don't need to open any vlog files or collect stats for GC if DB is opened
    // in InMemory mode. InMemory mode doesn't create any files/directories on disk.
    if vlog.opt.InMemory {
        return
    }

    vlog.dirPath = vlog.opt.ValueDir
    // Only allow one GC at a time.
    vlog.garbageCh = make(chan struct{}, 1)

    stats, err := InitDiscardStats(vlog.opt)
    y.Check(err)
    vlog.discardStats = stats
}

打开vlog

open(db) 方法

// open discovers the existing value log files, opens each of them, deletes
// empty ones (except the newest), truncates the newest file to its last valid
// entry and finally creates a new file for subsequent writes. With no files on
// disk it just creates the first one. A read-only DB stops before any
// truncation or file creation.
func (vlog *valueLog) open(db *DB) error {
    // We don't need to open any vlog files or collect stats for GC if DB is opened
    // in InMemory mode. InMemory mode doesn't create any files/directories on disk.
    if db.opt.InMemory {
        return nil
    }
    // Fill filesMap with the file IDs found in the value log directory.
    if err := vlog.populateFilesMap(); err != nil {
        return err
    }
    // If no files are found, then create a new file.
    if len(vlog.filesMap) == 0 {
        if vlog.opt.ReadOnly {
            return nil
        }
        // Create the first .vlog file.
        _, err := vlog.createVlogFile()
        return y.Wrapf(err, "Error while creating log file in valueLog.open")
    }
    fids := vlog.sortedFids()
    for _, fid := range fids {
        lf, ok := vlog.filesMap[fid]
        y.AssertTrue(ok)

        // Just open in RDWR mode. This should not create a new log file.
        lf.opt = vlog.opt
        if err := lf.open(vlog.fpath(fid), os.O_RDWR,
            2*vlog.opt.ValueLogFileSize); err != nil {
            return y.Wrapf(err, "Open existing file: %q", lf.path)
        }
        // We shouldn't delete the maxFid file.
        // A file whose size equals the header size holds no entries.
        if lf.size == vlogHeaderSize && fid != vlog.maxFid {
            vlog.opt.Infof("Deleting empty file: %s", lf.path)
            if err := lf.Delete(); err != nil {
                return y.Wrapf(err, "while trying to delete empty file: %s", lf.path)
            }
            delete(vlog.filesMap, fid)
        }
    }

    // A read-only DB must not truncate or create files.
    if vlog.opt.ReadOnly {
        return nil
    }
    // Now we can read the latest value log file, and see if it needs truncation. We could
    // technically do this over all the value log files, but that would mean slowing down the value
    // log open.
    last, ok := vlog.filesMap[vlog.maxFid]
    y.AssertTrue(ok)
    // Iterate with a no-op callback; only the returned end offset is needed.
    lastOff, err := last.iterate(vlog.opt.ReadOnly, vlogHeaderSize,
        func(_ Entry, vp valuePointer) error {
            return nil
        })
    if err != nil {
        return y.Wrapf(err, "while iterating over: %s", last.path)
    }
    if err := last.Truncate(int64(lastOff)); err != nil {
        return y.Wrapf(err, "while truncating last value log file: %s", last.path)
    }

    // Don't write to the old log file. Always create a new one.
    if _, err := vlog.createVlogFile(); err != nil {
        return y.Wrapf(err, "Error while creating log file in valueLog.open")
    }
    return nil
}

populateFilesMap() 方法

// populateFilesMap resets vlog.filesMap and fills it with one logFile entry
// per ".vlog" file found in vlog.dirPath, keyed by the numeric file ID taken
// from the file name. It also raises vlog.maxFid to the largest ID seen.
//
// An error is returned when the directory cannot be listed, when a file name
// does not parse as a 32-bit unsigned ID, or when two files map to the same
// ID. On a first start there are no .vlog files and the map stays empty.
func (vlog *valueLog) populateFilesMap() error {
    const vlogExt = ".vlog"

    vlog.filesMap = make(map[uint32]*logFile)

    // List the directory. A failure here must not be mistaken for "no files".
    files, err := ioutil.ReadDir(vlog.dirPath)
    if err != nil {
        return errFile(err, vlog.dirPath, "Unable to open log dir.")
    }

    found := make(map[uint64]struct{})
    for _, file := range files {
        // Only value log files are of interest.
        if !strings.HasSuffix(file.Name(), vlogExt) {
            continue
        }
        // The file name without its extension is the numeric file ID.
        fid, err := strconv.ParseUint(strings.TrimSuffix(file.Name(), vlogExt), 10, 32)
        if err != nil {
            return errFile(err, file.Name(), "Unable to parse log id.")
        }
        // Differently padded names (e.g. 1.vlog and 000001.vlog) parse to the
        // same ID; refuse to guess which one is valid.
        if _, ok := found[fid]; ok {
            return errFile(err, file.Name(), "Duplicate file found. Please delete one.")
        }
        found[fid] = struct{}{}

        lf := &logFile{
            fid:      uint32(fid),
            path:     vlog.fpath(uint32(fid)),
            registry: vlog.db.registry,
        }
        // Register the file and track the largest ID.
        vlog.filesMap[uint32(fid)] = lf
        if vlog.maxFid < uint32(fid) {
            vlog.maxFid = uint32(fid)
        }
    }
    return nil
}

创建vlog文件

createVlogFile() 方法

// createVlogFile creates the value log file with ID maxFid+1, registers it in
// vlog.filesMap, makes it the new maxFid and resets the write offset and entry
// counter. The file is opened with O_EXCL, so it must not exist already. A
// z.NewFile result from open is treated as success; any other error is
// returned as is.
func (vlog *valueLog) createVlogFile() (*logFile, error) {
    nextFid := vlog.maxFid + 1
    vlogPath := vlog.fpath(nextFid)
    created := &logFile{
        fid:      nextFid,
        path:     vlogPath,
        registry: vlog.db.registry,
        writeAt:  vlogHeaderSize,
        opt:      vlog.opt,
    }

    // The file is opened with twice the configured value log file size.
    openFlags := os.O_RDWR | os.O_CREATE | os.O_EXCL
    if err := created.open(vlogPath, openFlags, 2*vlog.opt.ValueLogFileSize); err != nil && err != z.NewFile {
        return nil, err
    }

    // Publish the new file and reset the write state under filesLock.
    vlog.filesLock.Lock()
    vlog.filesMap[nextFid] = created
    y.AssertTrue(vlog.maxFid < nextFid)
    vlog.maxFid = nextFid
    // writableLogOffset is only written by write func, by read by Read func.
    // To avoid a race condition, all reads and updates to this variable must be
    // done via atomics.
    atomic.StoreUint32(&vlog.writableLogOffset, vlogHeaderSize)
    vlog.numEntriesWritten = 0
    vlog.filesLock.Unlock()

    return created, nil
}

Badger源码导读(一) – DB初始化最先出现在枫阿雨's blog。

Badger归档 - 枫阿雨's blog

Badger源码导读(二) – 读写事务

Badger源码导读(二)

源码分析入口基准案例

读写事务

oracle实例

核心方法-Update(func(*badger.Txn))

newTransaction(bool)

oracle.readTs()

WaterMark.Begin() 方法

WaterMark.WaitForMark(ctx, index) 方法

txn.set(k,v []byte) 方法

txn.modify(*Entry)方法

txn.get(k []byte) 方法

txn.Commit() 方法

txn.commitPrecheck() 方法

txn.commitAndSend() 方法

oracle.newCommitTs()方法

oracle.hasConflict() 方法

oracle.doneRead() 方法

oracle.cleanupCommittedTransactions() 方法

db.sendToWriteCh() 方法

watermark.process() 方法

总结

Badger源码导读(一) – DB初始化

Badger源码导读

源码分析入口基准案例

DB初始化过程

初始化参数

Open函数(核心)

创建Manifest文件

打开Memtable

创建Memtable

创建levelController

创建levelHandler

初始化tables

初始化vlog

打开vlog

创建vlog文件

`newTransaction(bool)`

`oracle.readTs()`

`WaterMark.Begin()` 方法

`WaterMark.WaitForMark(ctx, index)` 方法

`txn.set(k,v []byte)` 方法

`txn.modify(*Entry)`方法

`txn.get(k []byte)` 方法

`txn.Commit()` 方法

`txn.commitPrecheck()` 方法

`txn.commitAndSend()` 方法

`oracle.newCommitTs()`方法

`oracle.hasConflict()` 方法

`oracle.doneRead()` 方法

`oracle.cleanupCommittedTransactions()` 方法

`db.sendToWriteCh()` 方法

`watermark.process()` 方法