您好,登錄后才能下訂單哦!
本節簡單解釋了PostgreSQL NOT IN在執行時寫入臨時表空間的實現。
測試數據如下:
[local]:5432 pg12@testdb=# select count(*) from tbl;
count
-------
1
(1 row)
Time: 6.009 ms
[local]:5432 pg12@testdb=# select count(*) from t_big_null;
count
----------
10000001
(1 row)
[local]:5432 pg12@testdb=#
Tuplestorestate
Tuplestore相關操作的私有狀態。
/*
 * Possible states of a Tuplestore object. These denote the states that
 * persist between calls of Tuplestore routines.
 *
 * tuplestore_puttuple_common() moves a store from TSS_INMEM to
 * TSS_WRITEFILE once the in-memory array or the memory budget is used up,
 * and from TSS_READFILE back to TSS_WRITEFILE when a tuple is appended.
 */
typedef enum
{
TSS_INMEM, /* Tuples still fit in memory */
TSS_WRITEFILE, /* Writing to temp file */
TSS_READFILE /* Reading from temp file */
} TupStoreStatus;
/*
 * Private state of a Tuplestore operation.
 */
struct Tuplestorestate
{
TupStoreStatus status; /* current state; enumerated value as shown above */
int eflags; /* capability flags (OR of pointers' flags) */
bool backward; /* store extra length words in file? */
bool interXact; /* keep open through transactions? */
bool truncated; /* tuplestore_trim has removed tuples? */
int64 availMem; /* remaining memory available, in bytes */
int64 allowedMem; /* total memory allowed, in bytes */
int64 tuples; /* number of tuples added */
BufFile *myfile; /* underlying file, or NULL if none */
MemoryContext context; /* memory context for holding tuples */
ResourceOwner resowner; /* resowner for holding temp files */
/*
 * These function pointers decouple the routines that must know what kind
 * of tuple we are handling from the routines that don't need to know it.
 * They are set up by the tuplestore_begin_xxx routines.
 *
 * (Although tuplestore.c currently only supports heap tuples, I've copied
 * this part of tuplesort.c so that extension to other kinds of objects
 * will be easy if it's ever needed.)
 *
 * Function to copy a supplied input tuple into palloc'd space. (NB: we
 * assume that a single pfree() is enough to release the tuple later, so
 * the representation must be "flat" in one palloc chunk.) state->availMem
 * must be decreased by the amount of space used.
 */
void *(*copytup) (Tuplestorestate *state, void *tup);
/*
 * Function to write a stored tuple onto tape. The representation of the
 * tuple on tape need not be the same as it is in memory; requirements on
 * the tape representation are given below. After writing the tuple,
 * pfree() it, and increase state->availMem by the amount of memory space
 * thereby released.
 */
void (*writetup) (Tuplestorestate *state, void *tup);
/*
 * Function to read a stored tuple from tape back into memory. 'len' is
 * the already-read length of the stored tuple. Create and return a
 * palloc'd copy, and decrease state->availMem by the amount of memory
 * space consumed.
 */
void *(*readtup) (Tuplestorestate *state, unsigned int len);
/*
 * This array holds pointers to tuples in memory if we are in state INMEM.
 * In states WRITEFILE and READFILE it's not used.
 *
 * When memtupdeleted > 0, the first memtupdeleted pointers are already
 * released due to a tuplestore_trim() operation, but we haven't expended
 * the effort to slide the remaining pointers down. These unused pointers
 * are set to NULL to catch any invalid accesses. Note that memtupcount
 * includes the deleted pointers.
 */
void **memtuples; /* array of pointers to palloc'd tuples */
int memtupdeleted; /* the first N slots are currently unused */
int memtupcount; /* number of tuples currently present */
int memtupsize; /* allocated length of memtuples array */
bool growmemtuples; /* memtuples' growth still underway? */
/*
 * These variables are used to keep track of the current positions.
 *
 * In state WRITEFILE, the current file seek position is the write point;
 * in state READFILE, the write position is remembered in writepos_xxx.
 * (The write position is the same as EOF, but since BufFileSeek doesn't
 * currently implement SEEK_END, we have to remember it explicitly.)
 */
TSReadPointer *readptrs; /* array of read pointers */
int activeptr; /* index of the active read pointer */
int readptrcount; /* number of pointers currently valid */
int readptrsize; /* allocated length of readptrs array */
int writepos_file; /* file# (valid if READFILE state) */
off_t writepos_offset; /* offset (valid if READFILE state) */
};
/*
 * Convenience wrappers around a Tuplestorestate.
 *
 * COPYTUP / WRITETUP / READTUP dispatch through the store's copytup,
 * writetup and readtup function pointers, passing the store itself as the
 * first argument.
 *
 * LACKMEM is true once availMem has gone negative; USEMEM and FREEMEM
 * subtract from / add to availMem respectively.
 */
#define COPYTUP(state,tup) ((*(state)->copytup) (state, tup))
#define WRITETUP(state,tup) ((*(state)->writetup) (state, tup))
#define READTUP(state,len) ((*(state)->readtup) (state, len))
#define LACKMEM(state) ((state)->availMem < 0)
#define USEMEM(state,amt) ((state)->availMem -= (amt))
#define FREEMEM(state,amt) ((state)->availMem += (amt))
TSReadPointer
tuplestore讀指針
/*
 * Possible states of a Tuplestore object. These denote the states that
 * persist between calls of Tuplestore routines.
 *
 * NOTE(review): this listing repeats the TupStoreStatus definition already
 * shown with struct Tuplestorestate; it is reproduced here only as context
 * for TSReadPointer and must appear just once in an actual source file.
 */
typedef enum
{
TSS_INMEM, /* Tuples still fit in memory */
TSS_WRITEFILE, /* Writing to temp file */
TSS_READFILE /* Reading from temp file */
} TupStoreStatus;
/*
 * State for a single read pointer. If we are in state INMEM then all the
 * read pointers' "current" fields denote the read positions. In state
 * WRITEFILE, the file/offset fields denote the read positions. In state
 * READFILE, inactive read pointers have valid file/offset, but the active
 * read pointer implicitly has position equal to the temp file's seek position.
 *
 * Special case: if eof_reached is true, then the pointer's read position is
 * implicitly equal to the write position, and current/file/offset aren't
 * maintained. This way we need not update all the read pointers each time
 * we write.
 */
typedef struct
{
int eflags; /* capability flags */
bool eof_reached; /* read has reached EOF */
int current; /* next array index to read (meaningful in state INMEM) */
int file; /* temp file# (meaningful once data is in a temp file) */
off_t offset; /* byte offset in file (paired with 'file') */
} TSReadPointer;
BufFile
該數據結構表示包含一個或多個物理文件的buffered file(每一個通過fd.c管理的虛擬文件描述符進行訪問)
/*
 * We break BufFiles into gigabyte-sized segments, regardless of RELSEG_SIZE.
 * The reason is that we'd like large BufFiles to be spread across multiple
 * tablespaces when available.
 *
 * MAX_PHYSICAL_FILESIZE is 0x40000000 bytes (1 GiB) per segment file;
 * BUFFILE_SEG_SIZE is that same limit expressed as a count of BLCKSZ-sized
 * blocks.
 */
#define MAX_PHYSICAL_FILESIZE 0x40000000
#define BUFFILE_SEG_SIZE (MAX_PHYSICAL_FILESIZE / BLCKSZ)
/*
 * This data structure represents a buffered file that consists of one or
 * more physical files (each accessed through a virtual file descriptor
 * managed by fd.c).
 */
struct BufFile
{
// how many physical segment files make up this BufFile
int numFiles; /* number of physical files in set */
/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
// array sized by numFiles
File *files; /* palloc'd array with numFiles entries */
// survives across transactions?
bool isInterXact; /* keep open over transactions? */
// buffer holds unwritten data?
bool dirty; /* does buffer need to be written? */
// read-only?
bool readOnly; /* has the file been set to read only? */
// where segment files live, if this BufFile is shared
SharedFileSet *fileset; /* space for segment files if shared */
// name of this BufFile, if shared
const char *name; /* name of this BufFile if shared */
/*
 * resowner is the ResourceOwner to use for underlying temp files. (We
 * don't need to remember the memory context we're using explicitly,
 * because after creation we only repalloc our arrays larger.)
 */
ResourceOwner resowner;
/*
 * "current pos" is position of start of buffer within the logical file.
 * Position as seen by user of BufFile is (curFile, curOffset + pos).
 * BufFileTell() reports exactly that pair.
 */
// segment-file index component of the current position
int curFile; /* file index (0..n) part of current pos */
// offset component of the current position
off_t curOffset; /* offset part of current pos */
// next read/write position inside the buffer
int pos; /* next read/write position in buffer */
// count of valid bytes in the buffer
int nbytes; /* total # of valid bytes in buffer */
PGAlignedBlock buffer; /* the buffer that pos and nbytes refer to */
};
tuplestore_puttupleslot
把接收到的tuple放到tuplestore中
/*
 * Accept one tuple and append it to the tuplestore.
 *
 * Note that the input tuple is always copied; the caller need not save it.
 *
 * If the active read pointer is currently "at EOF", it remains so (the read
 * pointer implicitly advances along with the write pointer); otherwise the
 * read pointer is unchanged. Non-active read pointers do not move, which
 * means they are certain to not be "at EOF" immediately after puttuple.
 * This curious-seeming behavior is for the convenience of nodeMaterial.c and
 * nodeCtescan.c, which would otherwise need to do extra pointer repositioning
 * steps.
 *
 * tuplestore_puttupleslot() is a convenience routine to collect data from
 * a TupleTableSlot without an extra copy operation.
 */
void
tuplestore_puttupleslot(Tuplestorestate *state,
                        TupleTableSlot *slot)
{
    MemoryContext callercxt;
    MinimalTuple mtup;

    /* All allocations below must land in the tuplestore's own context. */
    callercxt = MemoryContextSwitchTo(state->context);

    /* Build the MinimalTuple copy of the slot and charge it to the budget. */
    mtup = ExecCopySlotMinimalTuple(slot);
    USEMEM(state, GetMemoryChunkSpace(mtup));

    /* Hand the copy over to the shared append logic. */
    tuplestore_puttuple_common(state, (void *) mtup);

    /* Restore the caller's memory context. */
    MemoryContextSwitchTo(callercxt);
}
tuplestore_puttuple_common
tuplestore_puttupleslot函數的實現
/*
 * Append one already-copied tuple to the tuplestore; this is the worker
 * called by tuplestore_puttupleslot().
 *
 * What happens depends on the store's current status:
 *   TSS_INMEM     - stash the pointer in memtuples[]; if the array or the
 *                   memory budget is then exhausted, create a temp file,
 *                   switch to TSS_WRITEFILE and dump the in-memory tuples.
 *   TSS_WRITEFILE - write the tuple at the file's current position.
 *   TSS_READFILE  - remember the active reader's position, seek back to the
 *                   saved write position, switch to TSS_WRITEFILE, write.
 *
 * In every case, non-active read pointers that were "at EOF" are moved off
 * EOF and pinned to the position the new tuple is about to occupy.
 */
static void
tuplestore_puttuple_common(Tuplestorestate *state, void *tuple)
{
    ResourceOwner saved_owner;
    int         idx;

    state->tuples += 1;

    switch (state->status)
    {
        case TSS_INMEM:

            /*
             * Non-active pointers parked at EOF now point at the array slot
             * the incoming tuple will take.
             */
            for (idx = 0; idx < state->readptrcount; idx++)
            {
                TSReadPointer *rp = &state->readptrs[idx];

                if (idx == state->activeptr || !rp->eof_reached)
                    continue;

                rp->eof_reached = false;
                rp->current = state->memtupcount;
            }

            /*
             * Try to enlarge the array while one free slot is still left:
             * if that fails there is still room for this tuple, and we will
             * go to tape-based operation right after storing it.
             */
            if (state->memtupcount >= state->memtupsize - 1)
            {
                (void) grow_memtuples(state);
                Assert(state->memtupcount < state->memtupsize);
            }

            /* Record the tuple in the in-memory array. */
            state->memtuples[state->memtupcount] = tuple;
            state->memtupcount++;

            /*
             * Only when we ran out of array slots or memory do we have to
             * spill; otherwise there is nothing more to do.
             */
            if (state->memtupcount >= state->memtupsize || LACKMEM(state))
            {
                /* Make sure temp files go to suitable temp tablespaces. */
                PrepareTempTablespaces();

                /* Create the temp file under the store's resource owner. */
                saved_owner = CurrentResourceOwner;
                CurrentResourceOwner = state->resowner;
                state->myfile = BufFileCreateTemp(state->interXact);
                CurrentResourceOwner = saved_owner;

                /*
                 * Freeze the decision about trailing length words: it
                 * cannot change once data is on tape, even if callers later
                 * drop the requirement.
                 */
                state->backward = (state->eflags & EXEC_FLAG_BACKWARD) != 0;
                state->status = TSS_WRITEFILE;
                dumptuples(state);
            }
            break;

        case TSS_WRITEFILE:

            /*
             * Non-active pointers parked at EOF get the current file
             * position.  BufFileTell is cheap, so calling it per pointer
             * is fine.
             */
            for (idx = 0; idx < state->readptrcount; idx++)
            {
                TSReadPointer *rp = &state->readptrs[idx];

                if (idx == state->activeptr || !rp->eof_reached)
                    continue;

                rp->eof_reached = false;
                BufFileTell(state->myfile, &rp->file, &rp->offset);
            }

            WRITETUP(state, tuple);
            break;

        case TSS_READFILE:

            /*
             * Switching from reading to writing: save where the active
             * reader stands (unless it is at EOF), then reposition the file
             * at the remembered write position.
             */
            {
                TSReadPointer *active = &state->readptrs[state->activeptr];

                if (!active->eof_reached)
                    BufFileTell(state->myfile,
                                &active->file,
                                &active->offset);
            }

            if (BufFileSeek(state->myfile,
                            state->writepos_file, state->writepos_offset,
                            SEEK_SET) != 0)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not seek in tuplestore temporary file: %m")));
            state->status = TSS_WRITEFILE;

            /*
             * Non-active pointers parked at EOF get the saved write
             * position.
             */
            for (idx = 0; idx < state->readptrcount; idx++)
            {
                TSReadPointer *rp = &state->readptrs[idx];

                if (idx == state->activeptr || !rp->eof_reached)
                    continue;

                rp->eof_reached = false;
                rp->file = state->writepos_file;
                rp->offset = state->writepos_offset;
            }

            WRITETUP(state, tuple);
            break;

        default:
            elog(ERROR, "invalid tuplestore state");
            break;
    }
}
/*
 * BufFileTell --- report the current logical position of a BufFile.
 *
 * Stores the segment-file index in *fileno and the byte offset within that
 * segment in *offset.  The offset is the buffer's starting offset
 * (curOffset) plus the position inside the buffer (pos).
 */
void
BufFileTell(BufFile *file, int *fileno, off_t *offset)
{
    int         segment = file->curFile;

    *fileno = segment;
    *offset = file->curOffset + file->pos;
}
執行SQL:
[local]:5432 pg12@testdb=# select * from tbl a where a.id not in (select b.id from t_big_null b);
啟動gdb,進入斷點
(gdb) b tuplestore_puttupleslot
Breakpoint 1 at 0xab9134: file tuplestore.c, line 712.
(gdb) c
Continuing.
Breakpoint 1, tuplestore_puttupleslot (state=0x1efec78, slot=0x1efd4e0) at tuplestore.c:712
712 MemoryContext oldcxt = MemoryContextSwitchTo(state->context);
(gdb)
輸入參數
(gdb) n
717 tuple = ExecCopySlotMinimalTuple(slot);
(gdb)
718 USEMEM(state, GetMemoryChunkSpace(tuple));
(gdb)
720 tuplestore_puttuple_common(state, (void *) tuple);
(gdb) p *state
$1 = {status = TSS_INMEM, eflags = 2, backward = false, interXact = false, truncated = false,
availMem = 4177840, allowedMem = 4194304, tuples = 0, myfile = 0x0, context = 0x1efce00, resowner = 0x1e5d308,
copytup = 0xaba7bd <copytup_heap>, writetup = 0xaba811 <writetup_heap>, readtup = 0xaba9d9 <readtup_heap>,
memtuples = 0x1f18ed0, memtupdeleted = 0, memtupcount = 0, memtupsize = 2048, growmemtuples = true,
readptrs = 0x1f056a0, activeptr = 0, readptrcount = 1, readptrsize = 8, writepos_file = 0, writepos_offset = 0}
(gdb) p *slot
$2 = {type = T_TupleTableSlot, tts_flags = 16, tts_nvalid = 0, tts_ops = 0xc3e780 <TTSOpsBufferHeapTuple>,
tts_tupleDescriptor = 0x7f16f33f5378, tts_values = 0x1efd550, tts_isnull = 0x1efd558, tts_mcxt = 0x1efce00,
tts_tid = {ip_blkid = {bi_hi = 0, bi_lo = 0}, ip_posid = 1}, tts_tableOid = 49155}
(gdb) p slot->tts_values[0]
$3 = 0
(gdb)
進入tuplestore_puttuple_common
(gdb) step
tuplestore_puttuple_common (state=0x1efec78, tuple=0x1f05ce8) at tuplestore.c:771
771 state->tuples++;
(gdb)
當前狀態TSS_INMEM
(gdb) p state->status
$4 = TSS_INMEM
(gdb)
如需要,更新讀指針(無需更新)
(gdb) n
773 switch (state->status)
(gdb)
780 readptr = state->readptrs;
(gdb)
781 for (i = 0; i < state->readptrcount; readptr++, i++)
(gdb) p *readptr
$5 = {eflags = 2, eof_reached = true, current = 0, file = 2139062143, offset = 9187201950435737471}
(gdb) n
783 if (readptr->eof_reached && i != state->activeptr)
(gdb) p state->readptrcount
$6 = 1
(gdb) p state->activeptr
$7 = 0
(gdb) n
781 for (i = 0; i < state->readptrcount; readptr++, i++)
(gdb)
如需要,擴展數組(實際不需要)
(gdb)
796 if (state->memtupcount >= state->memtupsize - 1)
(gdb) p state->memtupcount
$8 = 0
(gdb) p state->memtupsize - 1
$9 = 2047
(gdb) n
803 state->memtuples[state->memtupcount++] = tuple;
(gdb)
放入到內存中,返回
(gdb) n
808 if (state->memtupcount < state->memtupsize && !LACKMEM(state))
(gdb)
809 return;
(gdb)
退出函數
(gdb)
892 }
(gdb)
tuplestore_puttupleslot (state=0x1efec78, slot=0x1efd4e0) at tuplestore.c:722
722 MemoryContextSwitchTo(oldcxt);
(gdb)
723 }
(gdb)
ExecMaterial (pstate=0x1efd1b8) at nodeMaterial.c:149
149 ExecCopySlot(slot, outerslot);
(gdb)
使用ignore N遍後,state->status狀態變為TSS_WRITEFILE
(gdb) ignore 4 4194303
Will ignore next 4194303 crossings of breakpoint 4.
(gdb) c
Continuing.
Breakpoint 3, tuplestore_puttuple_common (state=0x160ba38, tuple=0x7f2cd90cc0b0) at tuplestore.c:771
771 state->tuples++;
(gdb)
...
tuplestore_puttupleslot (state=0x160ba38, slot=0x160a2a0) at tuplestore.c:722
722 MemoryContextSwitchTo(oldcxt);
(gdb) c
Continuing.
Breakpoint 3, tuplestore_puttuple_common (state=0x160ba38, tuple=0x7f2cd90cc0e8) at tuplestore.c:771
771 state->tuples++;
(gdb) p *state
$9 = {status = TSS_WRITEFILE, eflags = 2, backward = false, interXact = false, truncated = false,
availMem = 3669944, allowedMem = 4194304, tuples = 4192545, myfile = 0x162ad80, context = 0x1609bc0,
resowner = 0x1579170, copytup = 0xaba7bd <copytup_heap>, writetup = 0xaba811 <writetup_heap>,
readtup = 0xaba9d9 <readtup_heap>, memtuples = 0x7f2cd914a050, memtupdeleted = 0, memtupcount = 0,
memtupsize = 65535, growmemtuples = false, readptrs = 0x1627590, activeptr = 0, readptrcount = 1,
readptrsize = 8, writepos_file = 0, writepos_offset = 0}
(gdb) n
773 switch (state->status)
(gdb)
841 readptr = state->readptrs;
(gdb)
842 for (i = 0; i < state->readptrcount; readptr++, i++)
(gdb)
844 if (readptr->eof_reached && i != state->activeptr)
(gdb)
842 for (i = 0; i < state->readptrcount; readptr++, i++)
(gdb)
853 WRITETUP(state, tuple);
(gdb)
854 break;
(gdb) p *state->myfile
$10 = {numFiles = 1, files = 0x7f2cd934c008, isInterXact = false, dirty = true, readOnly = false, fileset = 0x0,
name = 0x0, resowner = 0x1579170, curFile = 0, curOffset = 58687488, pos = 8156, nbytes = 8156, buffer = {
data = "\000\t\030\000\335\366?\000\016\000\000\000\001\000\000\t\030\000\336\366?\000\016\000\000\000\001\000\000\t\030\000\337\366?\000\016\000\000\000\001\000\000\t\030\000\340\366?\000\016\000\000\000\001\000\000\t\030\000\341\366?\000\016\000\000\000\001\000\000\t\030\000\342\366?\000\016\000\000\000\001\000\000\t\030\000\343\366?\000\016\000\000\000\001\000\000\t\030\000\344\366?\000\016\000\000\000\001\000\000\t\030\000\345\366?\000\016\000\000\000\001\000\000\t\030\000\346\366?\000\016\000\000\000\001\000\000\t\030\000\347\366?\000\016\000\000\000\001\000\000\t\030\000\350\366?\000\016\000\000\000\001\000\000\t\030\000\351\366?\000\016\000\000\000\001\000\000\t\030\000\352\366?\000\016\000\000\000\001\000\000\t\030\000"..., force_align_d = 1.7780737478550286e-307,
force_align_i64 = 18004352582551808}}
...
DONE
N/A
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,並提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。