@@ -651,8 +651,12 @@ void toku_ftnode_clone_callback(void *value_data,
651651 // set new pair attr if necessary
652652 if (node->height == 0 ) {
653653 *new_attr = make_ftnode_pair_attr (node);
654- node->logical_rows_delta = 0 ;
655- cloned_node->logical_rows_delta = 0 ;
654+ for (int i = 0 ; i < node->n_children ; i++) {
655+ if (BP_STATE (node, i) == PT_AVAIL) {
656+ BLB_LRD (node, i) = 0 ;
657+ BLB_LRD (cloned_node, i) = 0 ;
658+ }
659+ }
656660 } else {
657661 new_attr->is_valid = false ;
658662 }
@@ -700,9 +704,26 @@ void toku_ftnode_flush_callback(CACHEFILE UU(cachefile),
700704 if (ftnode->height == 0 ) {
701705 FT_STATUS_INC (FT_FULL_EVICTIONS_LEAF, 1 );
702706 FT_STATUS_INC (FT_FULL_EVICTIONS_LEAF_BYTES, node_size);
703- if (!ftnode->dirty ) {
704- toku_ft_adjust_logical_row_count (
705- ft, -ftnode->logical_rows_delta );
707+
708+ // A leaf node (height == 0) is being evicted (!keep_me) and is
709+ // not a checkpoint clone (!is_clone). This leaf node may have
710+ // had messages applied to satisfy a query, but was never
711+ // actually dirtied (!ftnode->dirty && !write_me). **Note that
712+ // if (write_me) would persist the node and clear the dirty
713+ // flag **. This message application may have updated the tree's
714+ // logical row count. Since these message applications are not
715+ // persisted, we need to undo the logical row count adjustments as
716+ // they may occur again in the future if/when the node is
717+ // re-read from disk for another query or change.
718+ if (!ftnode->dirty && !write_me) {
719+ int64_t lrc_delta = 0 ;
720+ for (int i = 0 ; i < ftnode->n_children ; i++) {
721+ if (BP_STATE (ftnode, i) == PT_AVAIL) {
722+ lrc_delta -= BLB_LRD (ftnode, i);
723+ BLB_LRD (ftnode, i) = 0 ;
724+ }
725+ }
726+ toku_ft_adjust_logical_row_count (ft, lrc_delta);
706727 }
707728 } else {
708729 FT_STATUS_INC (FT_FULL_EVICTIONS_NONLEAF, 1 );
@@ -711,17 +732,18 @@ void toku_ftnode_flush_callback(CACHEFILE UU(cachefile),
711732 toku_free (*disk_data);
712733 } else {
713734 if (ftnode->height == 0 ) {
735+ // No need to adjust logical row counts when flushing a clone
736+ // as they should have been zeroed out anyway when cloned.
737+ // Clones are 'copies' of work already done so doing it again
738+ // (adjusting row counts) would be redundant and leads to
739+ // inaccurate counts.
714740 for (int i = 0 ; i < ftnode->n_children ; i++) {
715741 if (BP_STATE (ftnode, i) == PT_AVAIL) {
716742 BASEMENTNODE bn = BLB (ftnode, i);
717743 toku_ft_decrease_stats (&ft->in_memory_stats ,
718744 bn->stat64_delta );
719745 }
720746 }
721- if (!ftnode->dirty ) {
722- toku_ft_adjust_logical_row_count (
723- ft, -ftnode->logical_rows_delta );
724- }
725747 }
726748 }
727749 toku_ftnode_free (&ftnode);
@@ -748,24 +770,48 @@ toku_ft_status_update_pivot_fetch_reason(ftnode_fetch_extra *bfe)
748770 }
749771}
750772
751- int toku_ftnode_fetch_callback (CACHEFILE UU (cachefile), PAIR p, int fd, BLOCKNUM blocknum, uint32_t fullhash,
752- void **ftnode_pv, void** disk_data, PAIR_ATTR *sizep, int *dirtyp, void *extraargs) {
773+ int toku_ftnode_fetch_callback (CACHEFILE UU (cachefile),
774+ PAIR p,
775+ int fd,
776+ BLOCKNUM blocknum,
777+ uint32_t fullhash,
778+ void **ftnode_pv,
779+ void **disk_data,
780+ PAIR_ATTR *sizep,
781+ int *dirtyp,
782+ void *extraargs) {
753783 assert (extraargs);
754- assert (*ftnode_pv == NULL );
755- FTNODE_DISK_DATA* ndd = (FTNODE_DISK_DATA*)disk_data;
784+ assert (*ftnode_pv == nullptr );
785+ FTNODE_DISK_DATA * ndd = (FTNODE_DISK_DATA *)disk_data;
756786 ftnode_fetch_extra *bfe = (ftnode_fetch_extra *)extraargs;
757- FTNODE *node= (FTNODE*)ftnode_pv;
787+ FTNODE *node = (FTNODE *)ftnode_pv;
758788 // deserialize the node, must pass the bfe in because we cannot
759789 // evaluate what piece of the node is necessary until we get it at
760790 // least partially into memory
761- int r = toku_deserialize_ftnode_from (fd, blocknum, fullhash, node, ndd, bfe);
791+ int r =
792+ toku_deserialize_ftnode_from (fd, blocknum, fullhash, node, ndd, bfe);
762793 if (r != 0 ) {
763794 if (r == TOKUDB_BAD_CHECKSUM) {
764- fprintf (stderr,
765- " Checksum failure while reading node in file %s.\n " ,
766- toku_cachefile_fname_in_env (cachefile));
795+ fprintf (
796+ stderr,
797+ " %s:%d:toku_ftnode_fetch_callback - "
798+ " file[%s], blocknum[%ld], toku_deserialize_ftnode_from "
799+ " failed with a checksum error.\n " ,
800+ __FILE__,
801+ __LINE__,
802+ toku_cachefile_fname_in_env (cachefile),
803+ blocknum.b );
767804 } else {
768- fprintf (stderr, " Error deserializing node, errno = %d" , r);
805+ fprintf (
806+ stderr,
807+ " %s:%d:toku_ftnode_fetch_callback - "
808+ " file[%s], blocknum[%ld], toku_deserialize_ftnode_from "
809+ " failed with %d.\n " ,
810+ __FILE__,
811+ __LINE__,
812+ toku_cachefile_fname_in_env (cachefile),
813+ blocknum.b ,
814+ r);
769815 }
770816 // make absolutely sure we crash before doing anything else.
771817 abort ();
@@ -774,7 +820,8 @@ int toku_ftnode_fetch_callback (CACHEFILE UU(cachefile), PAIR p, int fd, BLOCKNU
774820 if (r == 0 ) {
775821 *sizep = make_ftnode_pair_attr (*node);
776822 (*node)->ct_pair = p;
777- *dirtyp = (*node)->dirty ; // deserialize could mark the node as dirty (presumably for upgrade)
823+ *dirtyp = (*node)->dirty ; // deserialize could mark the node as dirty
824+ // (presumably for upgrade)
778825 }
779826 return r;
780827}
@@ -947,6 +994,16 @@ int toku_ftnode_pe_callback(void *ftnode_pv,
947994 basements_to_destroy[num_basements_to_destroy++] = bn;
948995 toku_ft_decrease_stats (&ft->in_memory_stats ,
949996 bn->stat64_delta );
997+ // A basement node is being partially evicted.
998+ // This basement node may have had messages applied to it to
999+ // satisfy a query, but was never actually dirtied.
1000+ // This message application may have updated the tree's
1001+ // logical row count. Since these message applications are
1002+ // not being persisted, we need to undo the logical row count
1003+ // adjustments as they may occur again in the future if/when
1004+ // the node is re-read from disk for another query or change.
1005+ toku_ft_adjust_logical_row_count (ft,
1006+ -bn->logical_rows_delta );
9501007 set_BNULL (node, i);
9511008 BP_STATE (node, i) = PT_ON_DISK;
9521009 num_partial_evictions++;
0 commit comments