Sphinx源碼分析——Indexer

發布時間：2020-07-05 20:47:34 來源：網絡 閱讀：3608 作者：frankiewb 欄目：移動開發

Sphinx作為一款優秀的全文檢索開源軟件確實是很不錯，最近工作需要，要求在其上進行二次開發，第一次接觸這樣一款開源軟件，興奮和緊張心情難免，作為一個剛畢業的應屆生，看了一周的源代碼，現在奉上一篇博文來對其Indexer部分代碼進行分析，供各位及自己做一個參考，其中只代表個人的一些粗淺看法，如果不對，請各位大神一定要指正，這樣才能提高，謝謝！

Indexer作為Sphinx的重要組成部分之一，其主要作用是將數據源進行索引化操作，其中涉及到了索引結構的問題，我會單獨開一篇博文去詳細講解這個問題，下面我們開始進行代碼分析。

//Indexer.cpp —— int main ( int argc, char ** argv )

這一部分主要是命令行下參數的讀入，我們這里采用的是Indexer.exe -c d:/csft_mysql.conf --all命令，這里，argc = 4，即有3個參數，argv[1]：-c就是讀取指定config文檔，argv[2]：d:/csft_mysql.conf就是配置文件config的位置，argv[3]：--all就代表對config文件中指定的所有數據源進行索引構建（注意代碼中匹配的是"--all"，寫成"-All"會被當作未知選項而報錯）。

// Parse the command line: walk argv[1..argc-1] and fill in the option state below.
// The loop breaks early on anything it cannot handle, leaving i<argc; that is
// what the "i!=argc" check after the loop uses to report a malformed option.
const char * sOptConfig = NULL;
    bool bMerge = false;
    CSphVector<CSphFilterSettings> dMergeDstFilters;
    CSphVector<const char *> dIndexes;
    bool bIndexAll = false;
    bool bMergeKillLists = false;
    int i;
    for ( i=1; i<argc; i++ )// handle each command-line argument in turn
    {
        if ( ( !strcmp ( argv[i], "--config" ) || !strcmp ( argv[i], "-c" ) ) && (i+1)<argc )
        {
            // explicit config file; must be readable or we die right away
            sOptConfig = argv[++i];
            if ( !sphIsReadable ( sOptConfig ) )
                sphDie ( "config file '%s' does not exist or is not readable", sOptConfig );
        }
        else if ( strcasecmp ( argv[i], "--merge" )==0 && (i+2)<argc )
        {
            // --merge <dst-index> <src-index>: both names are queued in dIndexes
            bMerge = true;
            dIndexes.Add ( argv[i+1] );
            dIndexes.Add ( argv[i+2] );
            i += 2;
        }
        else if ( bMerge && strcasecmp ( argv[i], "--merge-dst-range" )==0 && (i+3)<argc )
        {
            // only accepted after --merge; adds one range filter <attr> <min> <max>
            dMergeDstFilters.Resize ( dMergeDstFilters.GetLength()+1 );
            dMergeDstFilters.Last().m_eType = SPH_FILTER_RANGE;
            dMergeDstFilters.Last().m_sAttrName = argv[i+1];
            dMergeDstFilters.Last().m_uMinValue = (SphAttr_t) strtoull ( argv[i+2], NULL, 10 );
            dMergeDstFilters.Last().m_uMaxValue = (SphAttr_t) strtoull ( argv[i+3], NULL, 10 );
            i += 3;
        }
        else if ( strcasecmp ( argv[i], "--buildstops" )==0 && (i+2)<argc )
        {
            // --buildstops <output.txt> <N>; a non-positive N stops parsing here,
            // which is then reported as a malformed option below
            g_sBuildStops = argv[i+1];
            g_iTopStops = atoi ( argv[i+2] );
            if ( g_iTopStops<=0 )
                break;
            i += 2;
        } else if ( strcasecmp ( argv[i], "--rotate" )==0 )
        {
            g_bRotate = true;
        } else if ( strcasecmp ( argv[i], "--buildfreqs" )==0 )
        {
            g_bBuildFreqs = true;
        } else if ( strcasecmp ( argv[i], "--quiet" )==0 )
        {
            g_bQuiet = true;// quiet mode: only errors are printed, all other output is suppressed
            sphSetQuiet ( true );
        } else if ( strcasecmp ( argv[i], "--noprogress" )==0 )
        {
            g_bProgress = false;
        } else if ( strcasecmp ( argv[i], "--all" )==0 )
        {
            bIndexAll = true;// every index defined in the config is to be (re)built
        } else if ( strcasecmp ( argv[i], "--merge-killlists" )==0 )
        {
            bMergeKillLists = true;
        } else if ( sphIsAlpha(argv[i][0]) )
        {
            // anything starting with an alpha character is taken as an index name
            dIndexes.Add ( argv[i] );
        } else
        {
            // unknown option: stop here so that i<argc flags the error below
            break;
        }
    }
    if ( !g_bQuiet )
        fprintf ( stdout, SPHINX_BANNER );// print the banner unless running in quiet mode
    // progress display is switched off when stdout is not a terminal
    if ( !isatty ( fileno(stdout) ) )
        g_bProgress = false;
    // either parsing stopped early (bad option) or no arguments were given at all
    if ( i!=argc || argc<2 )
    {
        if ( argc>1 )
        {
            fprintf ( stdout, "ERROR: malformed or unknown option near '%s'.\n", argv[i] );
        } else
        {
            fprintf ( stdout,
                "Usage: indexer [OPTIONS] [indexname1 [indexname2 [...]]]\n"
                "\n"
                "Options are:\n"
                "--config <file>\t\tread configuration from specified file\n"
                "\t\t\t(default is csft.conf)\n"
                "--all\t\t\treindex all configured indexes\n"
                "--quiet\t\t\tbe quiet, only print errors\n"
                "--noprogress\t\tdo not display progress\n"
                "\t\t\t(automatically on if output is not to a tty)\n"
#if !USE_WINDOWS
                "--rotate\t\tsend SIGHUP to searchd when indexing is over\n"
                "\t\t\tto rotate updated indexes automatically\n"
#endif
                "--buildstops <output.txt> <N>\n"
                "\t\t\tbuild top N stopwords and write them to given file\n"
                "--buildfreqs\t\tstore words frequencies to output.txt\n"
                "\t\t\t(used with --buildstops only)\n"
                "--merge <dst-index> <src-index>\n"
                "\t\t\tmerge 'src-index' into 'dst-index'\n"
                "\t\t\t'dst-index' will receive merge result\n"
                "\t\t\t'src-index' will not be modified\n"
                "--merge-dst-range <attr> <min> <max>\n"
                "\t\t\tfilter 'dst-index' on merge, keep only those documents\n"
                "\t\t\twhere 'attr' is between 'min' and 'max' (inclusive)\n"
                "--merge-killlists"
                "\t\t\tmerge src and dst killlists instead of applying src killlist to dst"
                "\n"
                "Examples:\n"
                "indexer --quiet myidx1\treindex 'myidx1' defined in 'csft.conf'\n"
                "indexer --all\t\treindex all indexes defined in 'csft.conf'\n" );
        }
        return 1;
    }

下一步則是進行Config文件的讀取工作，這涉及到兩個重要的類，在后面的索引及查詢過程中會多次使用它們，一個是CSphConfigParser，一個是CSphConfig。首先看下CSphConfigParser的結構：

//Sphinxutils.h

/// Config file parser.
/// Callers run Parse() and then read the parsed sections from m_tConf
/// (this is how sphLoadConfig() uses it).
class CSphConfigParser
{
public:
    CSphConfig      m_tConf;            ///< parsed config, looked up by section type

public:
                    CSphConfigParser ();
    /// parse the given config file; returns false on failure
    /// NOTE(review): pBuffer looks like optional pre-loaded file content - confirm in the implementation
    bool            Parse ( const char * sFileName, const char * pBuffer = NULL );

protected:
    // parser state; NOTE(review): meanings below are inferred from the member names - confirm
    CSphString      m_sFileName;        ///< presumably the file being parsed
    int             m_iLine;            ///< presumably the current line number
    CSphString      m_sSectionType;     ///< presumably the type of the section being parsed
    CSphString      m_sSectionName;     ///< presumably the name of the section being parsed
    char            m_sError [ 1024 ];  ///< fixed-size buffer, presumably for the last error message
    int                 m_iWarnings;    ///< presumably the number of warnings so far
    static const int    WARNS_THRESH    = 5;    ///< presumably the cap on reported warnings

protected:
    bool            IsPlainSection ( const char * sKey );
    bool            IsNamedSection ( const char * sKey );
    bool            AddSection ( const char * sType, const char * sSection );
    void            AddKey ( const char * sKey, char * sValue );
    bool            ValidateKey ( const char * sKey );
#if !USE_WINDOWS
    // only declared on non-Windows builds
    bool            TryToExec ( char * pBuffer, char * pEnd, const char * szFilename, CSphVector<char> & dResult );
#endif
    char *          GetBufferString ( char * szDest, int iMax, const char * & szSource );
};

它是解析Config文檔的主要的數據結構，其中存儲Config文檔信息的就是我們這里提到的第二個數據結構，它同樣也是CSphConfigParser中的一個成員變量，即CSphConfig，我們可以看到CSphConfig實際上是一個哈希表，通過代碼發現它是一個擁有256個鍵值對的哈希表，后面我們會講到，通過CSphConfigParser類函數，將Config文件解析，讀取到某一Config名字插入CSphConfig哈希表中的Key值，讀取到該Config對應的值插入到Value中，方便后面構建索引時使用。

//Sphinxutils.h

/// config (hash of section types)
/// keyed by section type name, e.g. "source", "index", "indexer";
/// each value is a CSphConfigType that is in turn looked up by section name
typedef SmallStringHash_T < CSphConfigType >  CSphConfig;

說完了這兩個數據結構我們來看下Indexer是如何讀取Config信息的，其中主要是通過一個sphLoadConfig函數完成讀取操作，將相關Config信息以鍵值對的形式存入cp.m_tConf中，然后檢查重要的參數是否讀入且存在，例如Source相關信息，數據源是否被讀入，Sphinx中Mysql默認Source對應值為mysql，Indexer，即全局Index定義中是否定義了mem_limit的值，即索引過程中最大緩存限制，等等。

//Indexer.cpp —— int main ( int argc, char ** argv )

///////////////
// load config
///////////////
// sphLoadConfig() resolves the config path (falling back to defaults when
// sOptConfig is NULL), parses it into cp.m_tConf and returns the path used.
CSphConfigParser cp;
CSphConfig & hConf = cp.m_tConf;
sOptConfig = sphLoadConfig ( sOptConfig, g_bQuiet, cp );
// sphLoadConfig() already rejects a config without "index" sections;
// here we additionally require at least one "source" section
if ( !hConf ( "source" ) )
    sphDie ( "no sources found in config file '%s'", sOptConfig );
// global indexer settings come from the optional "indexer" section;
// when it is absent only g_iMemLimit is reset here, the other globals keep their current values
g_iMemLimit = 0;
if ( hConf("indexer") && hConf["indexer"]("indexer") )
{
    CSphConfigSection & hIndexer = hConf["indexer"]["indexer"];
    g_iMemLimit = hIndexer.GetSize ( "mem_limit", 0 );
    g_iMaxXmlpipe2Field = hIndexer.GetSize ( "max_xmlpipe2_field", 2*1024*1024 );
    g_iWriteBuffer = hIndexer.GetSize ( "write_buffer", 1024*1024 );
    sphSetThrottling ( hIndexer.GetInt ( "max_iops", 0 ), hIndexer.GetSize ( "max_iosize", 0 ) );
}

這其中，主要解析函數為CSphConfigParser中的Parse函數，其里面比較復雜，大意就是按照字符流讀取Config文檔，遇到配置信息及其值就存儲到CSphConfig這個哈希表中

//Sphinxutils.h

/// Locate, announce and parse the config file.
/// @param sOptConfig  explicit config path, or NULL to probe the default locations
/// @param bQuiet      when false, the chosen path is printed to stdout
/// @param cp          parser that receives the parsed config (in cp.m_tConf)
/// @return the path of the config file that was parsed
/// Dies via sphDie() when no readable config is found, when parsing fails,
/// or when the parsed config has no "index" section.
const char * sphLoadConfig ( const char * sOptConfig, bool bQuiet, CSphConfigParser & cp )
{
    // no explicit path: probe the default locations in priority order
    if ( !sOptConfig )
    {
#ifdef SYSCONFDIR
        if ( sphIsReadable ( SYSCONFDIR "/csft.conf" ) )
            sOptConfig = SYSCONFDIR "/csft.conf";
#endif
        if ( !sOptConfig && sphIsReadable ( "./csft.conf" ) )
            sOptConfig = "./csft.conf";
    }

    // none of the candidates was readable
    if ( !sOptConfig )
        sphDie ( "no readable config file (looked in "
#ifdef SYSCONFDIR
        SYSCONFDIR "/csft.conf, "
#endif
        "./csft.conf)" );

    if ( !bQuiet )
        fprintf ( stdout, "using config file '%s'...\n", sOptConfig );

    // the actual parsing is done by CSphConfigParser::Parse()
    const bool bParsed = cp.Parse ( sOptConfig );
    if ( !bParsed )
        sphDie ( "failed to parse config file '%s'", sOptConfig );

    // a config without any "index" section is of no use to the caller
    if ( !cp.m_tConf ( "index" ) )
        sphDie ( "no indexes found in config file '%s'", sOptConfig );

    return sOptConfig;
}

當我們順利的讀取完Config信息后，我們進入構建索引階段，前面我們提到了第三個參數，我們選用的是--all即為所有的數據源構建索引，故bMerge（合并索引）為false，bIndexAll為true，我們開始為每一數據源構建索引，程序會開始在類型為CSphConfig的hConf哈希表中搜索Key為index的值，即需要構建的索引，然后取出該索引的名稱，結合數據源Source信息構建索引，執行DoIndex函數。

//Indexer.cpp —— int main ( int argc, char ** argv )

/////////////////////
// index each index
////////////////////
// Three mutually exclusive modes, chosen by the command-line flags:
// merge two indexes, index everything in the config, or index the named ones.
sphStartIOStats ();
bool bIndexedOk = false; // if any of the indexes are ok
if ( bMerge )
{
    // merge mode: exactly two index names (destination first, then source),
    // and both must be defined in the config
    if ( dIndexes.GetLength()!=2 )
        sphDie ( "there must be 2 indexes to merge specified" );
    if ( !hConf["index"](dIndexes[0]) )
        sphDie ( "no merge destination index '%s'", dIndexes[0] );
    if ( !hConf["index"](dIndexes[1]) )
        sphDie ( "no merge source index '%s'", dIndexes[1] );
    bIndexedOk = DoMerge (
        hConf["index"][dIndexes[0]], dIndexes[0],
        hConf["index"][dIndexes[1]], dIndexes[1], dMergeDstFilters, g_bRotate, bMergeKillLists );
} else if ( bIndexAll )
{
    // --all: iterate over every "index" section in the config
    hConf["index"].IterateStart ();
    while ( hConf["index"].IterateNext() )
        bIndexedOk |= DoIndex ( hConf["index"].IterateGet (), hConf["index"].IterateGetKey().cstr(), hConf["source"] );// the index is built here; DoIndex is the core function
} else
{
    // explicit list of index names; unknown names are skipped with a warning
    ARRAY_FOREACH ( i, dIndexes )
    {
        if ( !hConf["index"](dIndexes[i]) )
            fprintf ( stdout, "WARNING: no such index '%s', skipping.\n", dIndexes[i] );
        else
            bIndexedOk |= DoIndex ( hConf["index"][dIndexes[i]], dIndexes[i], hConf["source"] );
    }
}
sphShutdownWordforms ();
// I/O statistics are reported unless running in quiet mode
const CSphIOStats & tStats = sphStopIOStats ();
if ( !g_bQuiet )
{
    ReportIOStats ( "reads", tStats.m_iReadOps, tStats.m_iReadTime, tStats.m_iReadBytes );
    ReportIOStats ( "writes", tStats.m_iWriteOps, tStats.m_iWriteTime, tStats.m_iWriteBytes );
}

DoIndex為整個Indexer中最核心的函數，下面我們來詳細分析下。

//Indexer.cpp ——DoIndex(const CSphConfigSection & hIndex, const char * sIndexName, const CSphConfigType & hSources)

首先判斷索引類型是否為分布式（distributed），是否采用quiet模式只輸出error信息

// A distributed index cannot be built by indexer: report it (unless quiet) and bail out.
if ( hIndex("type") && hIndex["type"]=="distributed" )
    {
        if ( !g_bQuiet )
        {
            fprintf ( stdout, "distributed index '%s' can not be directly indexed; skipping.\n", sIndexName );
            fflush ( stdout );
        }
        return false;
    }
    // announce which index is being built (suppressed in quiet mode)
    if ( !g_bQuiet )
    {
        fprintf ( stdout, "indexing index '%s'...\n", sIndexName );
        fflush ( stdout );
    }

然后檢查hIndex信息中的配置信息是否齊全正確

// check config
    // 'path' is mandatory for every index
    if ( !hIndex("path") )
    {
        fprintf ( stdout, "ERROR: index '%s': key 'path' not found.\n", sIndexName );
        return false;
    }
    // prefix/infix indexing together with morphology is rejected when enable_star is 0
    if ( ( hIndex.GetInt ( "min_prefix_len", 0 ) > 0 || hIndex.GetInt ( "min_infix_len", 0 ) > 0 )
        && hIndex.GetInt ( "enable_star" ) == 0 )
    {
        const char * szMorph = hIndex.GetStr ( "morphology", "" );
        // an empty value or the literal "none" both count as "no morphology"
        if ( szMorph && *szMorph && strcmp ( szMorph, "none" ) )
        {
            fprintf ( stdout, "ERROR: index '%s': infixes and morphology are enabled, enable_star=0\n", sIndexName );
            return false;
        }
    }

接著開始準備分詞器，其中主要就是初始化一些參數，例如在sphConfTokenizer中會根據Config配置文件中設置的charsetType類型選擇合適的編碼解析字符，以及采用何種中文分詞器來對中文進行分詞操作。然后就是初始化參數后創建分詞器實例，指定分詞所用的詞典的地址位置等。

///////////////////
// spawn tokenizer
///////////////////
// tokenizer settings are read from the index section; any failure here is fatal
CSphString sError;
CSphTokenizerSettings tTokSettings;
if ( !sphConfTokenizer ( hIndex, tTokSettings, sError ) )
    sphDie ( "index '%s': %s", sIndexName, sError.cstr() );
ISphTokenizer * pTokenizer = ISphTokenizer::Create ( tTokSettings, sError );
if ( !pTokenizer )
    sphDie ( "index '%s': %s", sIndexName, sError.cstr() );
// the dictionary is only created when g_sBuildStops is not set
// (g_sBuildStops is assigned by the --buildstops option); otherwise pDict stays NULL
CSphDict * pDict = NULL;
CSphDictSettings tDictSettings;
if ( !g_sBuildStops )
{
    ISphTokenizer * pTokenFilter = NULL;
    sphConfDictionary ( hIndex, tDictSettings );
    pDict = sphCreateDictionaryCRC ( tDictSettings, pTokenizer, sError );
    if ( !pDict )
        sphDie ( "index '%s': %s", sIndexName, sError.cstr() );
    // a non-empty sError with a valid dictionary is reported as a warning only
    if ( !sError.IsEmpty () )
        fprintf ( stdout, "WARNING: index '%s': %s\n", sIndexName, sError.cstr() );
    // if a token filter was created for the multi-wordforms, it replaces the plain tokenizer
    pTokenFilter = ISphTokenizer::CreateTokenFilter ( pTokenizer, pDict->GetMultiWordforms () );
    pTokenizer = pTokenFilter ? pTokenFilter : pTokenizer;
}

然后是前綴/中綴（prefix/infix）索引設置（這個地方研究的不細致，先把代碼貼出來，待補充）

// prefix/infix indexing
    // minimum lengths default to 0 when the key is absent; negative values are clamped to 0
    int iPrefix = hIndex("min_prefix_len") ? hIndex["min_prefix_len"].intval() : 0;
    int iInfix = hIndex("min_infix_len") ? hIndex["min_infix_len"].intval() : 0;
    iPrefix = Max ( iPrefix, 0 );
    iInfix = Max ( iInfix, 0 );
    // optional field lists from the prefix_fields / infix_fields keys
    CSphString sPrefixFields, sInfixFields;
    if ( hIndex.Exists ( "prefix_fields" ) )
        sPrefixFields = hIndex ["prefix_fields"].cstr ();
    if ( hIndex.Exists ( "infix_fields" ) )
        sInfixFields = hIndex ["infix_fields"].cstr ();
    // a field list without the matching minimum length gets a warning
    if ( iPrefix == 0 && !sPrefixFields.IsEmpty () )
        fprintf ( stdout, "WARNING: min_prefix_len = 0. prefix_fields are ignored\n" );
    if ( iInfix == 0 && !sInfixFields.IsEmpty () )
        fprintf ( stdout, "WARNING: min_infix_len = 0. infix_fields are ignored\n" );

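然后設置boundary信息（詞組邊界符列表，此列表控制哪些字符被視作分隔不同詞組的邊界，每到一個這樣的邊界，其后面的詞的“位置”值都會被加入一個額外的增量，可以借此用近似搜索符來模擬詞組搜索。）需要注意的是，下面貼出的這段代碼實際讀取并校驗的是inplace_enable、inplace_hit_gap、inplace_docinfo_gap、inplace_reloc_factor、inplace_write_factor這幾個inplace相關的配置項。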

// inplace_* settings
// All five values are read unconditionally; the two factors are only
// validated (and adjusted) when inplace_enable is non-zero.
bool bInplaceEnable = hIndex.GetInt ( "inplace_enable", 0 ) != 0;
int iHitGap         = hIndex.GetSize ( "inplace_hit_gap", 0 );
int iDocinfoGap     = hIndex.GetSize ( "inplace_docinfo_gap", 0 );
float fRelocFactor  = hIndex.GetFloat ( "inplace_reloc_factor", 0.1f );
float fWriteFactor  = hIndex.GetFloat ( "inplace_write_factor", 0.1f );
if ( bInplaceEnable )
{
    // each factor is individually clamped into [0.01, 0.9]
    if ( fRelocFactor < 0.01f || fRelocFactor > 0.9f )
    {
        fprintf ( stdout, "WARNING: inplace_reloc_factor must be 0.01 to 0.9, clamped\n" );
        fRelocFactor = Min ( Max ( fRelocFactor, 0.01f ), 0.9f );
    }
    if ( fWriteFactor < 0.01f || fWriteFactor > 0.9f )
    {
        fprintf ( stdout, "WARNING: inplace_write_factor must be 0.01 to 0.9, clamped\n" );
        fWriteFactor = Min ( Max ( fWriteFactor, 0.01f ), 0.9f );
    }
    // if the sum exceeds 1.0, both are scaled proportionally so that the sum becomes 0.9;
    // the warning text states the threshold that is actually tested
    if ( fWriteFactor+fRelocFactor > 1.0f )
    {
        fprintf ( stdout, "WARNING: inplace_write_factor+inplace_reloc_factor must not exceed 1.0, scaled to 0.9\n" );
        float fScale = 0.9f/(fWriteFactor+fRelocFactor);
        fRelocFactor *= fScale;
        fWriteFactor *= fScale;
    }
}

接下來準備數據源，其實發現Indexer在準備這些工作時很繁瑣，一遍又一遍的檢查相關配置信息是否完全，前面檢查了后面還查，可能是出于嚴謹的考慮吧，這里提一下dSources是一個CSphSource指針的數組，每一個CSphSource類型的pSource對應一個數據源，因為配置信息中可能會存在多個數據源，所以會有多個pSource。程序會在hIndex中搜索Key值為source的鍵值對，提取出對應的值作為pSourceName，在本例中，我們只有配置文件中的一個Source即mysql。我們看一下CSphSource類型結構。其中包含有三個大部分，第一大部分存儲文本分詞后的word信息，每一個word（也許是字也許是詞）對應一個WordHit，這個WordHit描述該word的相關信息，唯一標示該word。其中WordHit中又包含三部分，分別為word的文檔ID，表示該word屬于哪一篇文檔；word的ID，表示該word在字典中的對應ID；Word的位置，表示該word在文檔中的偏移量。第二大部分存儲Source中文檔的相關信息，其中亦包含了三部分，分別為文檔ID；文檔中列的數目，以及列對應的指針。第三大部分存儲的就是doc中的屬性字段信息。

/// generic data source
class CSphSource : public CSphSourceSettings
{
public:
    CSphVector<CSphWordHit>               m_dHits;    ///< current document split into words
    CSphDocInfo                         m_tDocInfo; ///< current document info
    CSphVector<CSphString>                m_dStrAttrs;///< current document string attrs

123

// parse all sources
    // hIndex("source") yields a linked list of source names (walked via m_pNext);
    // one CSphSource is spawned per name and collected in dSources
    CSphVector<CSphSource*> dSources;
    bool bGotAttrs = false;
    bool bSpawnFailed = false;
    for ( CSphVariant * pSourceName = hIndex("source"); pSourceName; pSourceName = pSourceName->m_pNext )
    {
        // a source name that is not defined in the config is reported and skipped
        if ( !hSources ( pSourceName->cstr() ) )
        {
            fprintf ( stdout, "ERROR: index '%s': source '%s' not found.\n", sIndexName, pSourceName->cstr() );
            continue;
        }
        const CSphConfigSection & hSource = hSources [ pSourceName->cstr() ];
        CSphSource * pSource = SpawnSource ( hSource, pSourceName->cstr(), pTokenizer->IsUtf8 () );// SpawnSource creates the source object from its config section
        if ( !pSource )
        {
            bSpawnFailed = true;
            continue;
        }
        if ( pSource->HasAttrsConfigured() )
            bGotAttrs = true;// remember that at least one source has attributes configured
        pSource->SetupFieldMatch ( sPrefixFields.cstr (), sInfixFields.cstr () );
        pSource->SetTokenizer ( pTokenizer );// every source is handed the same tokenizer pointer
        dSources.Add ( pSource );// collect the source; an index may have several
}

Source信息準備好后，開始準備Index的構建工作，首先檢測該Index是否被使用，即是否被上鎖，其次通過CSphIndexSettings類型的tSettings對創建好的pIndex進行初始化，主要是一些索引構建的信息，例如緩存大小，Boundary大小，停用詞初始化，分詞器初始化等等。準備完相關信息后，重要的就是Build函數，這是索引構建的核心函數，我們來仔細分析

// do index
        CSphIndex * pIndex = sphCreateIndexPhrase ( sIndexPath.cstr() );
        assert ( pIndex );
        // check lock file
        // failing to take the lock is fatal for the whole process
        if ( !pIndex->Lock() )
        {
            fprintf ( stdout, "FATAL: %s, will not index. Try --rotate option.\n", pIndex->GetLastError().cstr() );
            exit ( 1 );
        }
        CSphIndexSettings tSettings;
        sphConfIndex ( hIndex, tSettings );
        // index_exact_words is switched off (with a warning) when no morphology is configured
        if ( tSettings.m_bIndexExactWords && !tDictSettings.HasMorphology () )
        {
            tSettings.m_bIndexExactWords = false;
            fprintf ( stdout, "WARNING: index '%s': no morphology, index_exact_words=1 has no effect, ignoring\n", sIndexName );
        }
        // attributes configured on a source while docinfo is 'none' is a fatal config error
        if ( bGotAttrs && tSettings.m_eDocinfo==SPH_DOCINFO_NONE )
        {
            fprintf ( stdout, "FATAL: index '%s': got attributes, but docinfo is 'none' (fix your config file).\n", sIndexName );
            exit ( 1 );
        }
        pIndex->SetProgressCallback ( ShowProgress );
        if ( bInplaceEnable )
            pIndex->SetInplaceSettings ( iHitGap, iDocinfoGap, fRelocFactor, fWriteFactor );
        pIndex->SetTokenizer ( pTokenizer );
        pIndex->SetDictionary ( pDict );
        pIndex->Setup ( tSettings );
        bOK = pIndex->Build ( dSources, g_iMemLimit, g_iWriteBuffer )!=0;// Build() does the actual index construction
        // with --rotate, a successfully built index is renamed to "<path>.new"
        if ( bOK && g_bRotate )
        {
            sIndexPath.SetSprintf ( "%s.new", hIndex["path"].cstr() );
            bOK = pIndex->Rename ( sIndexPath.cstr() );
        }
        if ( !bOK )
            fprintf ( stdout, "ERROR: index '%s': %s.\n", sIndexName, pIndex->GetLastError().cstr() );
        pIndex->Unlock ();

對于Build函數而言，它是單次處理一個數據源并為此構建索引信息，

//sphinx.cpp Build ( const CSphVector<CSphSource*> & dSources, int iMemoryLimit, int iWriteBuffer )

首先是準備Source，還是把dSources中的每一個pSource檢查下是否都存在，詞典是否都準備好，各種初始化是否都齊備

// setup sources
// every source receives the index's dictionary and settings before any data is read
ARRAY_FOREACH ( iSource, dSources )
{
    CSphSource * pSource = dSources[iSource];
    assert ( pSource );
    pSource->SetDict ( m_pDict );
    pSource->Setup ( m_tSettings );
}

連接第一個數據源，獲取數據源的Schema信息，就是數據源的Doc中哪些是屬性，哪些列是要構建索引的信息

// connect 1st source and fetch its schema
    // the three steps short-circuit: the first one that fails aborts the build
    // with 0, and each call is passed m_sLastError to fill in
    if ( !dSources[0]->Connect ( m_sLastError )
        || !dSources[0]->IterateHitsStart ( m_sLastError )
        || !dSources[0]->UpdateSchema ( &m_tSchema, m_sLastError ) )
    {
        return 0;
    }

后面就是初始化一些存儲結構，其中重點說下緩存出來的幾個臨時文件分別的作用。結尾時tmp0的存儲的是被上鎖的Index，有些Index正在被查詢使用故上鎖。tmp1，即對應將來生成的spp文件，存儲詞匯的位置信息，包含該詞所在的文檔ID，該詞所在詞典對應的ID，以及該詞在本文檔中的位置信息。tmp2，即對應將來生成的spa文件存儲的是文檔信息，包含了DocID以及DocInfo信息。tmp7對應的是多值查詢（MVA，見代碼中的fdTmpFieldMVAs），感興趣的可以度娘，這是一種查詢方式，這里不做過多解釋

// create temp files
    // with in-place settings enabled, hits and docinfos are written straight to
    // the "spp"/"spa" files instead of "tmp1"/"tmp2"
    // NOTE(review): the last CSphAutofile argument looks like a "temporary file" flag - confirm
    CSphAutofile fdLock ( GetIndexFileName("tmp0"), SPH_O_NEW, m_sLastError, true );
    CSphAutofile fdHits ( GetIndexFileName ( m_bInplaceSettings ? "spp" : "tmp1" ), SPH_O_NEW, m_sLastError, !m_bInplaceSettings );
    CSphAutofile fdDocinfos ( GetIndexFileName ( m_bInplaceSettings ? "spa" : "tmp2" ), SPH_O_NEW, m_sLastError, !m_bInplaceSettings );
    CSphAutofile fdTmpFieldMVAs ( GetIndexFileName("tmp7"), SPH_O_NEW, m_sLastError, true );
    CSphWriter tOrdWriter;
    CSphString sRawOrdinalsFile = GetIndexFileName("tmp4");

下面具體處理每一個Source取出的每一個文檔，主要是通過這個IterateHitsNext實現的

// fetch documents
        for ( ;; )
        {
            // get next doc, and handle errors
            if ( !pSource->IterateHitsNext ( m_sLastError ) )
                return 0;

具體到該函數可以看到，該函數主要是由兩部分組成，即提取索引列(NextDocument)，針對該索引列構建索引(BuildHits)

/// Fetch the next document and turn its fields into hits (m_dHits).
/// @param sError  receives the error message from NextDocument() on failure
/// @return true when a document was processed or when the source is exhausted
///         (document id 0 after the fetch); false when no fields were
///         returned and the document id is non-zero.
bool CSphSource_Document::IterateHitsNext ( CSphString & sError )
{
    assert ( m_pTokenizer );
    PROFILE ( src_document );

    // pull the fields of the next document out of the source
    BYTE ** ppFields = NextDocument ( sError );

    // id 0 takes precedence over a NULL field array: it is reported as success
    const bool bNoMoreDocs = ( m_tDocInfo.m_iDocID==0 );
    if ( bNoMoreDocs || !ppFields )
        return bNoMoreDocs;

    ++m_tStats.m_iTotalDocuments;

    // start this document with an empty hit list (capacity reserved up front)
    m_dHits.Reserve ( 1024 );
    m_dHits.Resize ( 0 );

    // build the hits for the fetched fields
    BuildHits ( ppFields, -1, 0 );
    return true;
}

具體看一下NextDocument的操作，通過Sql.h中的API——SqlFetchRow，取出一條記錄，驗證該記錄是否合法

// get next non-zero-id row
// Loops until a row with a non-zero document id is fetched. Returns NULL on
// error (document id set to 1) and on a legal end of data (document id set to 0).
do
{
    // try to get next row
    bool bGotRow = SqlFetchRow ();
    // when the party's over...
    // no row came back: work out whether that is an error, the end of one
    // query step, or the end of all data
    while ( !bGotRow )
    {
        // is that an error?
        if ( SqlIsError() )
        {
            sError.SetSprintf ( "sql_fetch_row: %s", SqlError() );
            m_tDocInfo.m_iDocID = 1; // 0 means legal eof
            return NULL;
        }
        // maybe we can do next step yet?
        if ( !RunQueryStep ( m_tParams.m_sQuery.cstr(), sError ) )
        {
            // if there's a message, there's an error
            // otherwise, we're just over
            if ( !sError.IsEmpty() )
            {
                m_tDocInfo.m_iDocID = 1; // 0 means legal eof
                return NULL;
            }
        } else
        {
            // step went fine; try to fetch
            bGotRow = SqlFetchRow ();
            continue;
        }
        SqlDismissResult ();
        // ok, we're over
        // run the post-queries in order; the first failing one is logged and stops the rest
        ARRAY_FOREACH ( i, m_tParams.m_dQueryPost )
        {
            if ( !SqlQuery ( m_tParams.m_dQueryPost[i].cstr() ) )
            {
                sphWarn ( "sql_query_post[%d]: error=%s, query=%s",
                    i, SqlError(), m_tParams.m_dQueryPost[i].cstr() );
                break;
            }
            SqlDismissResult ();
        }
        m_tDocInfo.m_iDocID = 0; // 0 means legal eof
        return NULL;
    }
    // get him!
    // column 0 carries the document id; a result of 0 from VerifyID() makes the loop fetch another row
    m_tDocInfo.m_iDocID = VerifyID ( sphToDocid ( SqlColumn(0) ) );
    m_uMaxFetchedID = Max ( m_uMaxFetchedID, m_tDocInfo.m_iDocID );
} while ( !m_tDocInfo.m_iDocID );

將該條記錄按照Schema分成Field部分，即需要構建索引的部分，以及Attribute部分，即排序需要用到的屬性部分

// Split the fetched row according to the schema: field pointers go to m_dFields,
// attribute values go to m_tDocInfo (plus m_dStrAttrs / m_dFieldMVAs).
ARRAY_FOREACH ( i, m_tSchema.m_dFields )
{
    #if USE_ZLIB
    // columns marked for unpacking are unpacked instead of being used as-is
    if ( m_dUnpack[i] != SPH_UNPACK_NONE )
    {
        m_dFields[i] = (BYTE*) SqlUnpackColumn ( i, m_dUnpack[i] );
        continue;
    }
    #endif
    m_dFields[i] = (BYTE*) SqlColumn ( m_tSchema.m_dFields[i].m_iIndex );
}
int iFieldMVA = 0;
for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
{
    const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i); // shortcut
    // multi-valued attributes: the docinfo slot is zeroed here; values that come
    // from a field are parsed into m_dFieldMVAs
    if ( tAttr.m_eAttrType & SPH_ATTR_MULTI )
    {
        m_tDocInfo.SetAttr ( tAttr.m_tLocator, 0 );
        if ( tAttr.m_eSrc == SPH_ATTRSRC_FIELD )
            ParseFieldMVA ( m_dFieldMVAs, iFieldMVA++, SqlColumn ( tAttr.m_iIndex ) );
        continue;
    }
    switch ( tAttr.m_eAttrType )
    {
        case SPH_ATTR_ORDINAL:
            // memorize string, fixup NULLs
            m_dStrAttrs[i] = SqlColumn ( tAttr.m_iIndex );
            if ( !m_dStrAttrs[i].cstr() )
                m_dStrAttrs[i] = "";
            m_tDocInfo.SetAttr ( tAttr.m_tLocator, 0 );
            break;
        case SPH_ATTR_FLOAT:
            m_tDocInfo.SetAttrFloat ( tAttr.m_tLocator, sphToFloat ( SqlColumn ( tAttr.m_iIndex ) ) ); // FIXME? report conversion errors maybe?
            break;
        case SPH_ATTR_BIGINT:
            m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToInt64 ( SqlColumn ( tAttr.m_iIndex ) ) ); // FIXME? report conversion errors maybe?
            break;
        default:
            // just store as uint by default
            m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToDword ( SqlColumn ( tAttr.m_iIndex ) ) ); // FIXME? report conversion errors maybe?
            break;
    }
}
return m_dFields;

提取出相關數據后，針對每一條需要索引的item開始構建索引，進入BuildHits函數，首先先初始化相關參數，準備分詞器緩存

ARRAY_FOREACH ( iField, m_tSchema.m_dFields )
    {
        //BYTE * sField = dFields[iField];
        BYTE * sField = GetField(dFields, iField);//取出索引字段
        if ( !sField )
            continue;
        if ( m_bStripHTML )
            m_pStripper->Strip ( sField );
        int iFieldBytes = (int) strlen ( (char*)sField );
        m_tStats.m_iTotalBytes += iFieldBytes;
        m_pTokenizer->SetBuffer ( sField, iFieldBytes );//設(shè)置分詞器緩存，實(shí)際上就是索引字段大小，準(zhǔn)備針對(duì)索引字段進(jìn)行分詞
        BYTE * sWord;
        int iPos = HIT_PACK(iField,0);
        int iLastStep = 1;
        bool bPrefixField = m_tSchema.m_dFields[iField].m_eWordpart == SPH_WORDPART_PREFIX;
        bool bInfixMode = m_iMinInfixLen > 0;
        BYTE sBuf [ 16+3*SPH_MAX_WORD_LEN ];

然后開始分詞，分詞的過程在這里不具體講了，這不屬于Sphinx的主要涉足領域，當我們把iField即要索引的字段放入分詞器中依次解析，然后將分出的詞賦值給sWord，將sWord的位置計算后賦值給iPos

// index words only
            while ( ( sWord = m_pTokenizer->GetToken() )!=NULL )
            {
                iPos += iLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep;
                if ( m_pTokenizer->GetBoundary() )
                    iPos = Max ( iPos+m_iBoundaryStep, 1 );
                iLastStep = 1;
                if ( bGlobalPartialMatch )
                {
                    int iBytes = strlen ( (const char*)sWord );
                    memcpy ( sBuf + 1, sWord, iBytes );
                    sBuf [0]            = MAGIC_WORD_HEAD;
                    sBuf [iBytes + 1]   = '\0';
                    SphWordID_t iWord = m_pDict->GetWordIDWithMarkers ( sBuf );
                    if ( iWord )
                    {
                        CSphWordHit & tHit = m_dHits.Add ();
                        tHit.m_iDocID = m_tDocInfo.m_iDocID;
                        tHit.m_iWordID = iWord;
                        tHit.m_iWordPos = iPos;
                    }
                }

將分詞后的sWord去詞典中查找它對應的詞ID，這樣我們就收集全了這個詞的所有詳細信息，創建一個類型為CSphWordHit的tHit，其中存儲了該sWord所在的DocID，在詞典中對應的詞ID，以及在文檔中詞的位置信息Pos

// look the token up in the dictionary; a zero id produces no hit
SphWordID_t iWord = m_pDict->GetWordID ( sWord );
                if ( iWord )
                {
                    CSphWordHit & tHit = m_dHits.Add ();// append a new hit to m_dHits and fill it in
                    tHit.m_iDocID = m_tDocInfo.m_iDocID;
                    tHit.m_iWordID = iWord;
                    tHit.m_iWordPos = iPos;
                } else
                {
                    // no hit for this token: the next position advances by m_iStopwordStep
                    iLastStep = m_iStopwordStep;
                }

處理完該詞后，如果是中文的話還會進一步去判斷其是否有近義詞出現，其主要的函數為GetThesaurus，這里要簡單說明下采用的MMSEG分詞法，比如我們分詞得到了中華，那么它還會繼續從詞典中去找是否存在其擴展詞段（這里姑且翻譯成近義詞）如中華人民，×××，然后也會把它也存入進去（對于MMSEG的中文分詞方法還有待進一步研究，這我只能照著代碼念了），最后將所有的sWord的信息tHit都放入到m_dHits中去，形成我們的詞索引spp索引

// zh_cn only GetThesaurus
                {
                    // GetThesaurus() result is walked as a run of NUL-terminated
                    // strings that ends at an empty string; each entry found in
                    // the dictionary adds a hit at the same position as the token
                    const int iTokenLen = strlen ( (const char*)sWord );
                    const BYTE * pEntry = m_pTokenizer->GetThesaurus ( sWord, iTokenLen );
                    if ( pEntry )
                    {
                        for ( size_t uEntryLen = 0; *pEntry; pEntry += uEntryLen + 1 )
                        {
                            uEntryLen = strlen ( (const char*)pEntry );
                            SphWordID_t uExtraID = m_pDict->GetWordID ( pEntry, uEntryLen, true );
                            if ( !uExtraID )
                                continue; // not in the dictionary, move on to the next entry

                            CSphWordHit & tExtraHit = m_dHits.Add ();
                            tExtraHit.m_iDocID = m_tDocInfo.m_iDocID;
                            tExtraHit.m_iWordID = uExtraID;
                            tExtraHit.m_iWordPos = iPos;
                        }
                    }
                }//end GetThesaurus

當該iField索引字段全部都索引完成后，在m_dHits中添加結束標記

// mark trailing hit
        // sets the HIT_FIELD_END bit in the position of the last collected hit, if there is one
        if ( m_dHits.GetLength() )
            m_dHits.Last().m_iWordPos |= HIT_FIELD_END;

向AI問一下細(xì)節(jié)

Sphinx源碼分析——Indexer

猜你喜歡

最新資訊

相關(guān)推薦

相關(guān)標(biāo)簽