| 1 | + | // Copyright 2022 Dolthub, Inc. |
| 2 | + | // |
| 3 | + | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | + | // you may not use this file except in compliance with the License. |
| 5 | + | // You may obtain a copy of the License at |
| 6 | + | // |
| 7 | + | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | + | // |
| 9 | + | // Unless required by applicable law or agreed to in writing, software |
| 10 | + | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | + | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | + | // See the License for the specific language governing permissions and |
| 13 | + | // limitations under the License. |
| 14 | + | |
| 15 | + | package tree |
| 16 | + | |
| 17 | + | import ( |
| 18 | + | "bytes" |
| 19 | + | "context" |
| 20 | + | "encoding/json" |
| 21 | + | "errors" |
| 22 | + | "io" |
| 23 | + | |
| 24 | + | "github.com/dolthub/go-mysql-server/sql" |
| 25 | + | |
| 26 | + | "github.com/dolthub/dolt/go/store/hash" |
| 27 | + | "github.com/dolthub/dolt/go/store/prolly/message" |
| 28 | + | ) |
| 29 | + | |
| 30 | + | const DefaultFixedChunkLength = 4000 |
| 31 | + | |
| 32 | + | var ErrInvalidChunkSize = errors.New("invalid chunkSize; value must be a multiple of 20") |
| 33 | + | |
| 34 | + | func mustNewBlobBuilder(chunkSize int) *BlobBuilder { |
| 35 | + | b, _ := NewBlobBuilder(chunkSize) |
| 36 | + | return b |
| 37 | + | } |
| 38 | + | |
| 39 | + | // NewBlobBuilder writes the contents of |reader| as an append-only |
| 40 | + | // tree, returning the root node or an error if applicable. |chunkSize| |
| 41 | + | // fixes the split size of leaf and intermediate node chunks. |
| 42 | + | func NewBlobBuilder(chunkSize int) (*BlobBuilder, error) { |
| 43 | + | if chunkSize%hash.ByteLen != 0 { |
| 44 | + | return nil, ErrInvalidChunkSize |
| 45 | + | } |
| 46 | + | |
| 47 | + | keys := make([][]byte, chunkSize/hash.ByteLen) |
| 48 | + | for i := range keys { |
| 49 | + | keys[i] = zeroKey |
| 50 | + | } |
| 51 | + | return &BlobBuilder{ |
| 52 | + | chunkSize: chunkSize, |
| 53 | + | keys: keys, |
| 54 | + | }, nil |
| 55 | + | } |
| 56 | + | |
// blobNodeWriter writes blob content read from |r| as one or more tree
// nodes, returning the hash of the topmost node it wrote, the number of leaf
// chunks consumed, and any error (io.EOF signals the reader is exhausted).
type blobNodeWriter interface {
	Write(ctx context.Context, r io.Reader) (hash.Hash, uint64, error)
}
| 60 | + | |
// BlobBuilder chunks a byte stream into an append-only tree of fixed-size
// nodes. Call SetNodeStore once, then Init with the blob's size before each
// Chunk call; scratch buffers are reused across blobs.
type BlobBuilder struct {
	ns        NodeStore          // destination store for written nodes
	S         message.Serializer // serializes keys/values/subtrees into node messages
	chunkSize int                // fixed byte size of each chunk
	keys      [][]byte           // shared all-zero keys, one per address slot in a chunk
	wr        blobNodeWriter     // head of the writer chain (leaf writer at the bottom)
	lastN     Node               // most recently written top-level (root) node
	topLevel  int                // height of the tree above the leaf level

	// Scratch storage sized for levelCap interior levels; Init slices it up
	// among the level writers to avoid per-blob allocation.
	levelCap int
	buf      []byte
	vals     [][]byte
	subtrees []uint64
}
| 75 | + | |
// SetNodeStore sets the destination NodeStore and initializes the blob
// serializer from its buffer pool. Must be called before Chunk, which
// writes nodes through both.
func (b *BlobBuilder) SetNodeStore(ns NodeStore) {
	b.ns = ns
	b.S = message.NewBlobSerializer(ns.Pool())
}
| 80 | + | |
// Reset clears the BlobBuilder for re-use. The writer chain and tree height
// are discarded; the scratch buffers (buf/vals/subtrees) and levelCap are
// retained so a subsequent Init can reuse the allocations.
func (b *BlobBuilder) Reset() {
	b.wr = nil
	b.topLevel = 0
}
| 86 | + | |
// Init calculates tree dimensions for a blob of |dataSize| bytes and builds
// the chain of writers that Chunk will drive. A zero-size blob leaves the
// writer nil (Chunk then returns an empty Node); a blob that fits in one
// chunk gets only a leaf writer; larger blobs additionally get one
// blobLevelWriter per interior level, each feeding addresses upward.
func (b *BlobBuilder) Init(dataSize int) {
	b.Reset()

	if dataSize == 0 {
		return
	}

	// Single-chunk blob: one leaf node is the entire tree (topLevel stays 0).
	if dataSize <= b.chunkSize {
		b.wr = &blobLeafWriter{
			bb:  b,
			buf: make([]byte, dataSize),
		}
		return
	}

	b.wr = &blobLeafWriter{
		bb:  b,
		buf: make([]byte, b.chunkSize),
	}

	// Count interior levels: each level fans out by numAddrs children per
	// node, so divide the leaf-chunk count down until it reaches zero.
	numAddrs := b.chunkSize / hash.ByteLen
	dataSize = dataSize / b.chunkSize
	for dataSize > 0 {
		dataSize = dataSize / numAddrs
		b.topLevel += 1
	}

	// Allocate everything we need in batch, slice them up down below.
	if b.levelCap < b.topLevel {
		b.expand(numAddrs)
		b.levelCap = b.topLevel
	}

	// Chain the level writers bottom-up; each gets a disjoint window of the
	// shared scratch buffers. After the loop b.wr is the root-level writer.
	writers := make([]blobLevelWriter, b.topLevel)
	for i, addrs := 0, 0; i < b.topLevel; i, addrs = i+1, addrs+numAddrs {
		wr := &writers[i]
		wr.bb = b
		wr.child = b.wr
		wr.buf = b.buf[addrs*hash.ByteLen : (addrs+numAddrs)*hash.ByteLen]
		wr.vals = b.vals[addrs : addrs+numAddrs]
		wr.subtrees = b.subtrees[addrs : addrs+numAddrs]
		wr.level = i + 1
		wr.sz = numAddrs
		b.wr = wr
	}
}
| 134 | + | |
| 135 | + | func (b *BlobBuilder) expand(numAddrs int) { |
| 136 | + | b.buf = make([]byte, b.topLevel*numAddrs*hash.ByteLen) |
| 137 | + | b.vals = make([][]byte, numAddrs*b.topLevel) |
| 138 | + | b.subtrees = make([]uint64, numAddrs*b.topLevel) |
| 139 | + | } |
| 140 | + | |
| 141 | + | // Chunk builds the blob tree by passing the Reader to the chain of level |
| 142 | + | // writers, terminated in a leaf writer. The leaf writer reads chunks from the |
| 143 | + | // Reader and writes them, returning their hashes to its parent level writer. |
| 144 | + | // When the parent level writer fills up with addresses, it writes a chunk and |
| 145 | + | // returns that address to its parent. This continues until the Reader returns |
| 146 | + | // io.EOF, when every writer in the chain completes its chunk and we return the |
| 147 | + | // root node. |
| 148 | + | func (b *BlobBuilder) Chunk(ctx context.Context, r io.Reader) (Node, hash.Hash, error) { |
| 149 | + | if b.wr == nil { |
| 150 | + | return Node{}, hash.Hash{}, nil |
| 151 | + | } |
| 152 | + | h, _, err := b.wr.Write(ctx, r) |
| 153 | + | if err != nil && err != io.EOF { |
| 154 | + | return Node{}, hash.Hash{}, err |
| 155 | + | } |
| 156 | + | return b.lastN, h, nil |
| 157 | + | } |
| 158 | + | |
// blobLeafWriter writes leaf chunks of the blob, with max capacity len(buf),
// for every call to Write().
type blobLeafWriter struct {
	bb  *BlobBuilder // owning builder; performs the actual node write
	buf []byte       // reusable read buffer, sized to one leaf chunk
}
| 165 | + | |
// Shared slices reused for every leaf write: blob nodes carry a single
// zero-byte key per entry, and each leaf chunk counts as one subtree.
var zeroKey = []byte{0}
var zeroKeys = [][]byte{zeroKey}
var leafSubtrees = []uint64{1}
| 169 | + | |
| 170 | + | func (lw *blobLeafWriter) Write(ctx context.Context, r io.Reader) (hash.Hash, uint64, error) { |
| 171 | + | n, err := r.Read(lw.buf) |
| 172 | + | if err != nil { |
| 173 | + | return hash.Hash{}, 0, err |
| 174 | + | } |
| 175 | + | h, err := lw.bb.write(ctx, zeroKeys, [][]byte{lw.buf[:n]}, leafSubtrees, 0) |
| 176 | + | return h, 1, err |
| 177 | + | } |
| 178 | + | |
// blobLevelWriter writes internal chunks of a blob, using its |child| to
// write the level below it. On a call to |Write|, it repeatedly calls
// |child.Write|, accumulating addresses to its children, until it fills up
// or the Reader is exhausted. In either case, it then writes its node and
// returns.
type blobLevelWriter struct {
	bb       *BlobBuilder   // owning builder; performs the actual node write
	child    blobNodeWriter // writer for the level below (leaf or another level writer)
	buf      []byte         // raw child-address bytes, hash.ByteLen per slot
	vals     [][]byte       // per-child views into buf, passed as node values
	subtrees []uint64       // per-child leaf counts
	sz       int            // max children per node at this level
	level    int            // tree level of the nodes this writer emits (leaves are 0)
}
| 193 | + | |
// Write accumulates child addresses from |child.Write| until this node is
// full (sz children) or the reader is exhausted, then serializes and writes
// one internal node. It returns that node's hash, the total number of leaf
// chunks beneath it, and io.EOF when the reader ran dry so the parent can
// finalize as well.
func (lw *blobLevelWriter) Write(ctx context.Context, r io.Reader) (hash.Hash, uint64, error) {
	i, off, totalCount := 0, 0, uint64(0)
	for {
		// NOTE: the child's hash could be written directly into lw.buf via
		// unsafe pointer casting to elide the copy below; kept simple for now.
		h, n, err := lw.child.Write(ctx, r)
		if err != nil && err != io.EOF {
			return hash.Hash{}, 0, err
		}
		// n == 0 means the child produced no chunk (e.g. EOF with no data);
		// only record an address when something was actually written.
		if n != 0 {
			totalCount += n
			copy(lw.buf[off:], h[:])
			lw.subtrees[i] = n
			lw.vals[i] = lw.buf[off : off+hash.ByteLen]
			i += 1
			off += hash.ByteLen
		}
		// Flush this node when it is full or the stream is exhausted; the
		// reader's io.EOF (if any) is passed through to the caller.
		if i >= lw.sz || err == io.EOF {
			h, nerr := lw.bb.write(ctx, lw.bb.keys[:i], lw.vals[:i], lw.subtrees[:i], lw.level)
			if nerr != nil {
				return hash.Hash{}, 0, nerr
			}
			return h, totalCount, err
		}
	}
}
| 222 | + | |
// write serializes and stores one blob node built from |keys|, |vals|, and
// |subtrees| at |level|. Called by level and leaf writers. Will store lastN
// if the level corresponds to our root level, so Chunk can return the root
// node alongside its hash.
func (b *BlobBuilder) write(ctx context.Context, keys, vals [][]byte, subtrees []uint64, level int) (hash.Hash, error) {
	msg := b.S.Serialize(keys, vals, subtrees, level)
	node, err := NodeFromBytes(msg)
	if err != nil {
		return hash.Hash{}, err
	}
	h, err := b.ns.Write(ctx, node)
	if err != nil {
		return hash.Hash{}, err
	}
	// Remember the root node for Chunk's return value.
	if level == b.topLevel {
		b.lastN = node
	}
	return h, nil
}
| 240 | + | |
// bytePeekLength is the maximum number of bytes ToString previews from a
// blob-backed value.
const bytePeekLength = 128

// ByteArray is an immutable blob of bytes addressed by its tree's root hash.
type ByteArray struct {
	ImmutableTree
}

// NewByteArray returns a ByteArray rooted at |addr| within |ns|. Contents
// are loaded lazily on first read.
func NewByteArray(addr hash.Hash, ns NodeStore) *ByteArray {
	return &ByteArray{ImmutableTree{Addr: addr, ns: ns}}
}
| 250 | + | |
| 251 | + | func (b *ByteArray) ToBytes(ctx context.Context) ([]byte, error) { |
| 252 | + | return b.bytes(ctx) |
| 253 | + | } |
| 254 | + | |
| 255 | + | func (b *ByteArray) ToString(ctx context.Context) (string, error) { |
| 256 | + | buf, err := b.bytes(ctx) |
| 257 | + | if err != nil { |
| 258 | + | return "", err |
| 259 | + | } |
| 260 | + | toShow := bytePeekLength |
| 261 | + | if len(buf) < toShow { |
| 262 | + | toShow = len(buf) |
| 263 | + | } |
| 264 | + | return string(buf[:toShow]), nil |
| 265 | + | } |
| 266 | + | |
// JSONDoc is an immutable blob of JSON text addressed by its tree's root
// hash.
type JSONDoc struct {
	ImmutableTree
}

// NewJSONDoc returns a JSONDoc rooted at |addr| within |ns|. Contents are
// loaded lazily on first read.
func NewJSONDoc(addr hash.Hash, ns NodeStore) *JSONDoc {
	return &JSONDoc{ImmutableTree{Addr: addr, ns: ns}}
}
| 274 | + | |
| 275 | + | func (b *JSONDoc) ToJSONDocument(ctx context.Context) (sql.JSONDocument, error) { |
| 276 | + | buf, err := b.bytes(ctx) |
| 277 | + | if err != nil { |
| 278 | + | return sql.JSONDocument{}, err |
| 279 | + | } |
| 280 | + | var doc sql.JSONDocument |
| 281 | + | err = json.Unmarshal(buf, &doc.Val) |
| 282 | + | if err != nil { |
| 283 | + | return sql.JSONDocument{}, err |
| 284 | + | } |
| 285 | + | return doc, err |
| 286 | + | } |
| 287 | + | |
| 288 | + | func (b *JSONDoc) ToString(ctx context.Context) (string, error) { |
| 289 | + | buf, err := b.bytes(ctx) |
| 290 | + | if err != nil { |
| 291 | + | return "", err |
| 292 | + | } |
| 293 | + | toShow := bytePeekLength |
| 294 | + | if len(buf) < toShow { |
| 295 | + | toShow = len(buf) |
| 296 | + | } |
| 297 | + | return string(buf[:toShow]), nil |
| 298 | + | } |
| 299 | + | |
// TextStorage is an immutable blob of text addressed by its tree's root
// hash.
type TextStorage struct {
	ImmutableTree
}

// NewTextStorage returns a TextStorage rooted at |addr| within |ns|.
// Contents are loaded lazily on first read.
func NewTextStorage(addr hash.Hash, ns NodeStore) *TextStorage {
	return &TextStorage{ImmutableTree{Addr: addr, ns: ns}}
}

// ToBytes materializes and returns the full contents of the stored text.
func (b *TextStorage) ToBytes(ctx context.Context) ([]byte, error) {
	return b.bytes(ctx)
}
| 311 | + | |
| 312 | + | func (b *TextStorage) ToString(ctx context.Context) (string, error) { |
| 313 | + | buf, err := b.bytes(ctx) |
| 314 | + | if err != nil { |
| 315 | + | return "", err |
| 316 | + | } |
| 317 | + | return string(buf), nil |
| 318 | + | } |
| 319 | + | |
// ImmutableTree is a read-only blob tree rooted at Addr. Contents are read
// from ns and cached in buf on first access.
type ImmutableTree struct {
	Addr hash.Hash // root address of the blob tree; empty means zero-length content
	buf  []byte    // cached concatenation of all leaf values; nil until loaded
	ns   NodeStore
}
| 325 | + | |
| 326 | + | func (t *ImmutableTree) load(ctx context.Context) error { |
| 327 | + | if t.Addr.IsEmpty() { |
| 328 | + | t.buf = []byte{} |
| 329 | + | return nil |
| 330 | + | } |
| 331 | + | n, err := t.ns.Read(ctx, t.Addr) |
| 332 | + | if err != nil { |
| 333 | + | return err |
| 334 | + | } |
| 335 | + | |
| 336 | + | return WalkNodes(ctx, n, t.ns, func(ctx context.Context, n Node) error { |
| 337 | + | if n.IsLeaf() { |
| 338 | + | t.buf = append(t.buf, n.GetValue(0)...) |
| 339 | + | } |
| 340 | + | return nil |
| 341 | + | }) |
| 342 | + | } |
| 343 | + | |
| 344 | + | func (t *ImmutableTree) bytes(ctx context.Context) ([]byte, error) { |
| 345 | + | if t.buf == nil { |
| 346 | + | err := t.load(ctx) |
| 347 | + | if err != nil { |
| 348 | + | return nil, err |
| 349 | + | } |
| 350 | + | } |
| 351 | + | return t.buf[:], nil |
| 352 | + | } |
| 353 | + | |
// next is unimplemented; ImmutableTree does not support node iteration.
func (t *ImmutableTree) next() (Node, error) {
	panic("not implemented")
}

// close is unimplemented.
func (t *ImmutableTree) close() error {
	panic("not implemented")
}

// Read is unimplemented. NOTE(review): the parameter is a bytes.Buffer by
// value rather than the []byte of io.Reader — confirm the intended
// interface before implementing.
func (t *ImmutableTree) Read(_ bytes.Buffer) (int, error) {
	panic("not implemented")
}
| 365 | + | |