您好,登錄后才能下訂單哦!
本節(jié)簡單介紹了PostgreSQL中的HTAB如何動態(tài)擴展,這是第2部分,結合代碼進行解析.
/*
 * Top control structure for a hashtable --- in a shared table, each backend
 * has its own copy (OK since no fields change at runtime).
 * For a shared table each backend gets a private copy of this struct after
 * fork; that is safe because none of these fields change while running.
 */
struct HTAB
{
	HASHHDR *hctl; /* => shared control information */
	HASHSEGMENT *dir; /* directory of segment starts */
	HashValueFunc hash; /* hash function */
	HashCompareFunc match; /* key comparison function */
	HashCopyFunc keycopy; /* key copying function */
	HashAllocFunc alloc; /* memory allocator */
	MemoryContext hcxt; /* memory context if default allocator used */
	char *tabname; /* table name (for error messages) */
	bool isshared; /* true if table is in shared memory */
	bool isfixed; /* if true, don't enlarge */
	/* freezing a shared table isn't allowed, so we can keep state here */
	bool frozen; /* true = no more inserts allowed */
	/* We keep local copies of these fixed values to reduce contention */
	Size keysize; /* hash key length in bytes */
	long ssize; /* segment size --- must be power of 2 */
	int sshift; /* segment shift = log2(ssize) */
};
/*
 * Header structure for a hash table --- contains all changeable info
 *
 * In a shared-memory hash table, the HASHHDR is in shared memory, while
 * each backend has a local HTAB struct. For a non-shared table, there isn't
 * any functional difference between HASHHDR and HTAB, but we separate them
 * anyway to share code between shared and non-shared tables.
 */
struct HASHHDR
{
	/*
	 * The freelist can become a point of contention in high-concurrency hash
	 * tables, so we use an array of freelists, each with its own mutex and
	 * nentries count, instead of just a single one. Although the freelists
	 * normally operate independently, we will scavenge entries from freelists
	 * other than a hashcode's default freelist when necessary.
	 *
	 * If the hash table is not partitioned, only freeList[0] is used and its
	 * spinlock is not used at all; callers' locking is assumed sufficient.
	 */
	/* Number of freelists to be used for a partitioned hash table. */
	/* NUM_FREELISTS is defined elsewhere as 32 */
	FreeListData freeList[NUM_FREELISTS];
	/* These fields can change, but not in a partitioned table */
	/* Also, dsize can't change in a shared table, even if unpartitioned */
	long dsize; /* directory size */
	long nsegs; /* number of allocated segments (<= dsize) */
	uint32 max_bucket; /* ID of maximum bucket in use */
	uint32 high_mask; /* mask to modulo into entire table */
	uint32 low_mask; /* mask to modulo into lower half of table */
	/* These fields are fixed at hashtable creation */
	Size keysize; /* hash key length in bytes */
	Size entrysize; /* total user element size in bytes */
	long num_partitions; /* # partitions (must be power of 2), or 0 */
	long ffactor; /* target fill factor */
	long max_dsize; /* 'dsize' limit if directory is fixed size */
	long ssize; /* segment size --- must be power of 2 */
	int sshift; /* segment shift = log2(ssize) */
	int nelem_alloc; /* number of entries to allocate at once */
#ifdef HASH_STATISTICS
	/*
	 * Count statistics here. NB: stats code doesn't bother with mutex, so
	 * counts could be corrupted a bit in a partitioned table.
	 */
	long accesses;
	long collisions;
#endif
};
/*
 * Per-freelist data.
 *
 * In a partitioned hash table, each freelist is associated with a specific
 * set of hashcodes, as determined by the FREELIST_IDX() macro below.
 * nentries tracks the number of live hashtable entries having those hashcodes
 * (NOT the number of entries in the freelist, as you might expect).
 *
 * The coverage of a freelist might be more or less than one partition, so it
 * needs its own lock rather than relying on caller locking. Relying on that
 * wouldn't work even if the coverage was the same, because of the occasional
 * need to "borrow" entries from another freelist; see get_hash_entry().
 *
 * Using an array of FreeListData instead of separate arrays of mutexes,
 * nentries and freeLists helps to reduce sharing of cache lines between
 * different mutexes.
 */
typedef struct
{
	slock_t mutex; /* spinlock for this freelist */
	long nentries; /* number of entries in associated buckets */
	HASHELEMENT *freeList; /* chain of free elements */
} FreeListData;
/*
 * HASHELEMENT is the private part of a hashtable entry. The caller's data
 * follows the HASHELEMENT structure (on a MAXALIGN'd boundary). The hash key
 * is expected to be at the start of the caller's hash entry data structure.
 */
typedef struct HASHELEMENT
{
	struct HASHELEMENT *link; /* link to next entry in same bucket */
	uint32 hashvalue; /* hash function result for this entry */
} HASHELEMENT;
/* Hash table header struct is an opaque type known only within dynahash.c */
typedef struct HASHHDR HASHHDR;
/* Hash table control struct is an opaque type known only within dynahash.c */
typedef struct HTAB HTAB;
/* Parameter data structure for hash_create */
/* Only those fields indicated by hash_flags need be set */
typedef struct HASHCTL
{
	long num_partitions; /* # partitions (must be power of 2) */
	long ssize; /* segment size */
	long dsize; /* (initial) directory size */
	long max_dsize; /* limit to dsize if dir size is limited */
	long ffactor; /* fill factor */
	Size keysize; /* hash key length in bytes */
	Size entrysize; /* total user element size in bytes */
	HashValueFunc hash; /* hash function */
	HashCompareFunc match; /* key comparison function */
	HashCopyFunc keycopy; /* key copying function */
	HashAllocFunc alloc; /* memory allocator */
	MemoryContext hcxt; /* memory context to use for allocations */
	HASHHDR *hctl; /* location of header in shared mem */
} HASHCTL;
/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;
/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;
/*
 * Hash functions must have this signature.
 */
typedef uint32 (*HashValueFunc) (const void *key, Size keysize);
/*
 * Key comparison functions must have this signature. Comparison functions
 * return zero for match, nonzero for no match. (The comparison function
 * definition is designed to allow memcmp() and strncmp() to be used directly
 * as key comparison functions.)
 */
typedef int (*HashCompareFunc) (const void *key1, const void *key2,
Size keysize);
/*
 * Key copying functions must have this signature. The return value is not
 * used. (The definition is set up to allow memcpy() and strlcpy() to be
 * used directly.)
 */
typedef void *(*HashCopyFunc) (void *dest, const void *src, Size keysize);
/*
 * Space allocation function for a hashtable --- designed to match malloc().
 * Note: there is no free function API; can't destroy a hashtable unless you
 * use the default allocator.
 */
typedef void *(*HashAllocFunc) (Size request);
其結構如下圖所示:
擴展后的結構如下圖所示:
主要的函數(shù)是expand_table
1.分配新桶,HTAB的最大桶數(shù)max_bucket+1
2.根據(jù)新桶號計算段號和段內(nèi)編號
3.如需擴展段,則擴展(*2)
4.獲取新桶號對應的原桶號(原桶號 = 新桶號 & low_mask),目的是為了把原桶號中的數(shù)據(jù)遷移到新桶中.新桶號與原桶號相差low_mask+1
5.掃描舊桶,重建舊桶元素鏈表,構造新桶元素鏈表
/*
 * Expand the table by adding one more hash bucket.
 * (Exactly one bucket is added per call; the segment directory is doubled
 * on the way if the new bucket does not fit in the allocated segments.)
 */
static bool
expand_table(HTAB *hashp)
{
	HASHHDR *hctl = hashp->hctl;/* shared control structure */
	HASHSEGMENT old_seg,/* segment holding the bucket being split */
	new_seg;/* segment receiving the new bucket */
	long old_bucket,/* bucket whose entries may be relocated */
	new_bucket;/* newly added bucket */
	long new_segnum,/* segment number of the new bucket */
	new_segndx;/* slot of the new bucket within its segment */
	long old_segnum,/* segment number of the old bucket */
	old_segndx;/* slot of the old bucket within its segment */
	HASHBUCKET *oldlink,/* tail pointer of the rebuilt old chain */
	*newlink;/* tail pointer of the new bucket's chain */
	HASHBUCKET currElement,/* element being examined */
	nextElement;/* saved next pointer (link is rewritten below) */
	/* IS_PARTITIONED(hctl) is ((hctl)->num_partitions != 0) */
	Assert(!IS_PARTITIONED(hctl));
#ifdef HASH_STATISTICS
	hash_expansions++;
#endif
	new_bucket = hctl->max_bucket + 1;/* the new bucket is one past the max */
	new_segnum = new_bucket >> hashp->sshift;/* quotient: segment number */
	new_segndx = MOD(new_bucket, hashp->ssize);/* remainder: slot in segment */
	if (new_segnum >= hctl->nsegs)
	{
		/* Allocate new segment if necessary -- could fail if dir full */
		/* dir_realloc doubles the directory size each time */
		if (new_segnum >= hctl->dsize)
			if (!dir_realloc(hashp))
				return false;
		if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))/* bucket array for the new segment */
			return false;
		hctl->nsegs++;
	}
	/* OK, we created a new bucket */
	hctl->max_bucket++;
	/*
	 * *Before* changing masks, find old bucket corresponding to same hash
	 * values; values in that bucket may need to be relocated to new bucket.
	 * Note that new_bucket is certainly larger than low_mask at this point,
	 * so we can skip the first step of the regular hash mask calc.
	 */
	old_bucket = (new_bucket & hctl->low_mask);
	/*
	 * If we crossed a power of 2, readjust masks.
	 */
	if ((uint32) new_bucket > hctl->high_mask)
	{
		hctl->low_mask = hctl->high_mask;/* e.g. 15 -> 31 */
		hctl->high_mask = (uint32) new_bucket | hctl->low_mask;/* e.g. 31 -> 63 */
	}
	/*
	 * Relocate records to the new bucket. NOTE: because of the way the hash
	 * masking is done in calc_bucket, only one old bucket can need to be
	 * split at this point. With a different way of reducing the hash value,
	 * that might not be true!
	 */
	old_segnum = old_bucket >> hashp->sshift;/* segment number of old bucket */
	old_segndx = MOD(old_bucket, hashp->ssize);/* slot of old bucket in its segment */
	old_seg = hashp->dir[old_segnum];
	new_seg = hashp->dir[new_segnum];
	oldlink = &old_seg[old_segndx];/* head of the old bucket's chain */
	newlink = &new_seg[new_segndx];/* head of the new bucket's chain */
	/* Walk the old chain, re-threading each element onto one of the two chains. */
	for (currElement = *oldlink;
	currElement != NULL;
	currElement = nextElement)
	{
		nextElement = currElement->link;
		if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
		{
			*oldlink = currElement;/* element stays in the old bucket */
			oldlink = &currElement->link;
		}
		else
		{
			*newlink = currElement;/* element moves to the new bucket */
			newlink = &currElement->link;
		}
	}
	/* don't forget to terminate the rebuilt hash chains... */
	*oldlink = NULL;
	*newlink = NULL;
	return true;
}
/*
 * dir_realloc -- double the segment directory of a hash table.
 *
 * Copies the existing directory entries into a freshly allocated directory
 * twice the size, zeroes the new upper half, and frees the old directory.
 * Returns true on success; returns false if the directory has a fixed
 * maximum size (max_dsize set) or if allocation fails.
 */
static bool
dir_realloc(HTAB *hashp)
{
	HASHSEGMENT *new_dir;
	HASHSEGMENT *old_dir;
	long		new_dsize;
	long		old_bytes;
	long		new_bytes;

	/* A bounded directory cannot be grown. */
	if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
		return false;

	/* Reallocate directory at double the current size. */
	new_dsize = hashp->hctl->dsize << 1;
	old_bytes = hashp->hctl->dsize * sizeof(HASHSEGMENT);
	new_bytes = new_dsize * sizeof(HASHSEGMENT);

	old_dir = hashp->dir;
	CurrentDynaHashCxt = hashp->hcxt;
	new_dir = (HASHSEGMENT *) hashp->alloc((Size) new_bytes);
	if (new_dir == NULL)
		return false;

	/* Keep the existing entries; the fresh upper half starts out empty. */
	memcpy(new_dir, old_dir, old_bytes);
	MemSet(((char *) new_dir) + old_bytes, 0, new_bytes - old_bytes);
	hashp->dir = new_dir;
	hashp->hctl->dsize = new_dsize;

	/* XXX assume the allocator is palloc, so we know how to free */
	Assert(hashp->alloc == DynaHashAlloc);
	pfree(old_dir);

	return true;
}
/*
 * seg_alloc -- allocate one hash segment (an array of hashp->ssize bucket
 * headers) from the table's allocator and zero it, so every bucket chain
 * starts out empty. Returns NULL if the allocation fails.
 */
static HASHSEGMENT
seg_alloc(HTAB *hashp)
{
	HASHSEGMENT new_seg;
	Size		nbytes = sizeof(HASHBUCKET) * hashp->ssize;

	/* make sure the table's allocator draws from the right context */
	CurrentDynaHashCxt = hashp->hcxt;
	new_seg = (HASHSEGMENT) hashp->alloc(nbytes);
	if (new_seg == NULL)
		return NULL;

	MemSet(new_seg, 0, nbytes);
	return new_seg;
}
測試腳本
[local:/data/run/pg12]:5120 pg12@testdb=# \d t_expand;
Table "public.t_expand"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
id | integer | | |
[local:/data/run/pg12]:5120 pg12@testdb=# select count(*) from t_expand;
count
---------
2000000
(1 row)
[local:/data/run/pg12]:5120 pg12@testdb=# select * from t_expand;
...
啟動gdb跟蹤
(gdb) b hash_search_with_hash_value
Breakpoint 2 at 0xa790f2: file dynahash.c, line 925.
(gdb) c
Continuing.
Breakpoint 2, hash_search_with_hash_value (hashp=0x224eac8, keyPtr=0x7fffed717700, hashvalue=2252448879, action=HASH_ENTER, foundPtr=0x7fffed7176ff) at dynahash.c:925
925 HASHHDR *hctl = hashp->hctl; --> hash控制結構體
(gdb) n
926 int freelist_idx = FREELIST_IDX(hctl, hashvalue);--> 空閑鏈表
(gdb) p *hctl
$1 = {freeList = {{mutex = 0 '\000', nentries = 0, freeList = 0x22504d0}, {mutex = 0 '\000', nentries = 0, freeList = 0x0} <repeats 31 times>}, dsize = 256, nsegs = 1, max_bucket = 15,
high_mask = 31, low_mask = 15, keysize = 20, entrysize = 72, num_partitions = 0, ffactor = 1, max_dsize = -1, ssize = 256, sshift = 8, nelem_alloc = 46}
(gdb) n
949 if (action == HASH_ENTER || action == HASH_ENTER_NULL)
(gdb)
956 if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
(gdb)
957 hctl->freeList[0].nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor && --> 判斷是否需要擴展
(gdb)
956 if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
(gdb)
965 bucket = calc_bucket(hctl, hashvalue);-->計算hash桶
(gdb) step
calc_bucket (hctl=0x224eb60, hash_val=2252448879) at dynahash.c:871
871 bucket = hash_val & hctl->high_mask;-->先行與high_mask(31)進行掩碼運算
(gdb) n
872 if (bucket > hctl->max_bucket)-->得到的結果如何比max_bucket還大,那要跟low_mask(15)進行掩碼運算
(gdb) p bucket
$2 = 15
(gdb) n
875 return bucket;
(gdb) l
870
871 bucket = hash_val & hctl->high_mask;
872 if (bucket > hctl->max_bucket)
873 bucket = bucket & hctl->low_mask;
874
875 return bucket;
876 }
877
878 /*
879 * hash_search -- look up key in table and perform action
(gdb) n
876 }
(gdb)
hash_search_with_hash_value (hashp=0x224eac8, keyPtr=0x7fffed717700, hashvalue=2252448879, action=HASH_ENTER, foundPtr=0x7fffed7176ff) at dynahash.c:967
967 segment_num = bucket >> hashp->sshift;-->seg號,相當于15/256,結果為0
(gdb)
968 segment_ndx = MOD(bucket, hashp->ssize);-->seg內(nèi)編號,相當于15/256取模,結果為15
(gdb)
970 segp = hashp->dir[segment_num];
(gdb)
972 if (segp == NULL)
(gdb) p segment_num
$3 = 0
(gdb) p segment_ndx
$4 = 15
(gdb) n
975 prevBucketPtr = &segp[segment_ndx];
(gdb)
976 currBucket = *prevBucketPtr;
(gdb)
981 match = hashp->match; /* save one fetch in inner loop */
(gdb)
982 keysize = hashp->keysize; /* ditto */
(gdb)
984 while (currBucket != NULL)
(gdb)
997 if (foundPtr)
(gdb)
998 *foundPtr = (bool) (currBucket != NULL);
(gdb)
1003 switch (action)
(gdb)
1047 if (currBucket != NULL)
(gdb)
1051 if (hashp->frozen)
(gdb)
1055 currBucket = get_hash_entry(hashp, freelist_idx);
(gdb)
1056 if (currBucket == NULL)
(gdb)
1073 *prevBucketPtr = currBucket;
(gdb)
1074 currBucket->link = NULL;
(gdb)
1077 currBucket->hashvalue = hashvalue;
(gdb)
1078 hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
(gdb)
1087 return (void *) ELEMENTKEY(currBucket);
(gdb)
1093 }
(gdb)
hash_search (hashp=0x224eac8, keyPtr=0x7fffed717700, action=HASH_ENTER, foundPtr=0x7fffed7176ff) at dynahash.c:916
916 }
(gdb)
N/A
免責聲明:本站發(fā)布的內(nèi)容(圖片、視頻和文字)以原創(chuàng)、轉載和分享為主,文章觀點不代表本網(wǎng)站立場,如果涉及侵權請聯(lián)系站長郵箱:is@yisu.com進行舉報,并提供相關證據(jù),一經(jīng)查實,將立刻刪除涉嫌侵權內(nèi)容。