数据结构随笔3(哈希表)

写了一个哈希表模板类，用于统计一篇文章中不同单词出现的次数。哈希表使用 char* 作为 key，用一个指针数组作为桶（数组的每个元素指向一条结点链表）来索引；字符串哈希函数是在网上搜来的。模板参数有两个：一个是值类型，另一个 NBARREL 是指针数组（桶）的大小。通常 NBARREL 越大，哈希值冲突就越少，结点链表也就越短，查找自然就越快。

为了方便统计，给哈希表增加了一个 IncValue 函数：单词不存在时插入并计为 1，已存在时计数加一，只需一次查找，比先 Exists 再 GetValue/SetValue 效率更高。输出统计结果时使用了快速排序；为了提高效率，
把上次的快排模板稍微修改了一下，直接对结点指针数组进行排序，这样只需交换指针，不用交换结点的值。

/********************************************************************
    created:    2007/12/30
    filename:   hashtable.h
    author:     dj
    purpose:    哈希表模板类
*********************************************************************/

#ifndef __HASHTABLE_H__
#define __HASHTABLE_H__

#include <cassert>
#include <cstdlib>
#include <cstring>
#include <fstream>

// Frees an array that was allocated with new[] and resets the pointer to NULL.
// Note that this is an *array* delete; do not use it on memory from plain new.
// delete[] on a null pointer is a no-op, so no explicit null test is needed.
// The do/while(0) wrapper makes the macro behave as a single statement, so
// "if (c) SAFE_DELETE(p); else ..." parses correctly.
#define SAFE_DELETE(p) do { delete [] (p); (p) = NULL; } while (0)

// Exchanges elements i and j of the array v.
// Declared ahead of quicksort so the call inside quicksort resolves for every
// element type, including ones (such as int*) for which argument-dependent
// lookup finds nothing.
template<typename T>
void swap(T* v, int i, int j)
{
    T tmp = v[i];
    v[i] = v[j];
    v[j] = tmp;
}

// Sorts an array of pointers (or pointer-like values) in place, in DESCENDING
// order of the objects they point to.
//
//   v - array of n elements; each element is dereferenced with unary * and the
//       results are compared with operator> only.
//   n - number of elements; n <= 1 is a no-op.
//
// Only the pointers are moved, never the pointed-to objects. The relative
// order of elements that compare equal is unspecified. The pivot is chosen
// with rand(), so the caller controls seeding.
//
// A three-way partition is used so that runs of equal keys (very common when
// sorting word counts) are settled in a single pass instead of degrading to
// quadratic time and linear recursion depth.
template<typename T>
void quicksort(T* v, int n)
{
    if (n <= 1)
        return;

    // Copy the pivot element: the array slots are shuffled while partitioning.
    T pivot = v[rand() % n];

    int greaterEnd = 0;   // v[0, greaterEnd)       : strictly greater than pivot
    int cur = 0;          // v[greaterEnd, cur)     : equal to pivot
    int lessBegin = n;    // v[lessBegin, n)        : strictly less than pivot

    while (cur < lessBegin)
    {
        if ((*v[cur]) > (*pivot))
        {
            swap(v, greaterEnd, cur);
            ++greaterEnd;
            ++cur;
        }
        else if ((*pivot) > (*v[cur]))
        {
            --lessBegin;
            swap(v, cur, lessBegin);   // unseen element arrives at cur; re-test it
        }
        else
        {
            ++cur;
        }
    }

    // The middle section holds at least the pivot, so both sides are smaller than n.
    quicksort(v, greaterEnd);
    quicksort(v + lessBegin, n - lessBegin);
}

// Chained hash table that maps C-string keys to values of type T.
//
//   T       - value type. Must be copy-constructible, assignable and
//             value-initialisable (T()). IncValue additionally needs T to be
//             constructible from int and to support postfix ++. Dump needs
//             operator> and stream output for T.
//   NBARREL - number of buckets (must be positive). Each bucket is a singly
//             linked list of nodes; more buckets means shorter chains.
//
// Keys are copied into the table, so callers keep ownership of the strings
// they pass in. Keys must be non-null, NUL-terminated strings.
// The table owns raw node pointers, so copying it is disabled.
template<typename T, int NBARREL = 100>
class HashTable
{
public:
    // Creates an empty table with every bucket set to NULL.
    HashTable() : m_nCount(0)
    {
        for (int i = 0; i < NBARREL; i++)
            m_pNodes[i] = NULL;
    }

    // Releases every node and its key copy.
    ~HashTable()
    {
        FreeTable();
    }

    // Returns true when sName is present in the table.
    bool Exists(const char* sName) const
    {
        return FindNode(sName, Hash(sName)) != NULL;
    }

    // Inserts (sName, tValue). Returns false and leaves the table untouched
    // when the key is already present.
    bool AddNode(const char* sName, const T& tValue)
    {
        const int h = Hash(sName);   // hash once, reuse for lookup and insert
        if (FindNode(sName, h) != NULL)
            return false;
        PushFront(h, sName, tValue);
        return true;
    }

    // Returns a copy of the value stored under sName, or a value-initialised
    // T (0 for arithmetic types) when the key is absent.
    T GetValue(const char* sName) const
    {
        const HashNode* p = FindNode(sName, Hash(sName));
        if (p != NULL)
            return p->value;
        return T();
    }

    // Overwrites the value stored under sName. Returns false when the key is
    // absent; no node is created in that case.
    bool SetValue(const char* sName, const T& tValue)
    {
        HashNode* p = FindNode(sName, Hash(sName));
        if (p == NULL)
            return false;
        p->value = tValue;
        return true;
    }

    // Increments the value stored under sName, or inserts the key with the
    // value 1 when it is absent. Needs only one hash and one chain walk.
    void IncValue(const char* sName)
    {
        const int h = Hash(sName);
        HashNode* p = FindNode(sName, h);
        if (p != NULL)
        {
            p->value++;
            return;
        }
        PushFront(h, sName, 1);
    }

    // Writes one line per entry, "<value>  <name>", to the file sFile, sorted
    // by descending value. Asserts in debug builds if the file cannot be
    // opened; otherwise returns without writing anything.
    void Dump(const char* sFile)
    {
        std::ofstream file(sFile);
        assert(file.is_open());
        if (!file)
            return;

        // Flatten all chains into one array of node pointers so that sorting
        // only moves pointers, never the nodes themselves.
        HashNode** pNodes = new HashNode*[m_nCount];
        int counter = 0;
        for (int i = 0; i < NBARREL; i++)
        {
            for (HashNode* p = m_pNodes[i]; p != NULL; p = p->next)
                pNodes[counter++] = p;
        }

        quicksort(pNodes, m_nCount);
        for (int i = 0; i < m_nCount; i++)
            file << pNodes[i]->value << "  " << pNodes[i]->name << "\n";

        delete [] pNodes;
        file.close();
    }

private:
    // One key/value entry in a bucket chain. Owns a private copy of the key.
    struct HashNode
    {
        HashNode(const char* c, const T& v) : name(NULL), value(v), next(NULL)
        {
            name = new char[strlen(c) + 1];
            strcpy(name, c);
        }
        ~HashNode()
        {
            delete [] name;
        }
        // Nodes are ordered by value only; the key does not take part.
        bool operator > (const HashNode& node) const
        {
            return (this->value > node.value);
        }
        bool operator < (const HashNode& node) const
        {
            return (this->value < node.value);
        }
        bool operator == (const HashNode& node) const
        {
            return (this->value == node.value);
        }
        char* name;
        T value;
        HashNode* next;

    private:
        // The node owns `name`; copying would free it twice.
        HashNode(const HashNode&);
        HashNode& operator=(const HashNode&);
    };

    // Copying is disabled (declared, never defined): the table owns its nodes.
    HashTable(const HashTable&);
    HashTable& operator=(const HashTable&);

    // Walks bucket h and returns the node whose key equals sName, or NULL.
    HashNode* FindNode(const char* sName, int h) const
    {
        for (HashNode* p = m_pNodes[h]; p != NULL; p = p->next)
        {
            if (strcmp(p->name, sName) == 0)
                return p;
        }
        return NULL;
    }

    // Creates a node for (sName, tValue) and links it at the head of bucket h.
    void PushFront(int h, const char* sName, const T& tValue)
    {
        HashNode* node = new HashNode(sName, tValue);
        node->next = m_pNodes[h];
        m_pNodes[h] = node;
        m_nCount++;
    }

    // Maps a string to a bucket index in [0, NBARREL). NULL and "" map to 0.
    //
    // NOTE(review): the published listing had "ret=(ret!=(32-r));" at the
    // mixing step, which collapses ret to 0 or 1 on every character. It looks
    // like a mangled 32-bit left-rotate, "(ret<<r)|(ret>>(32-r))", as used by
    // the well-known lh_strhash-style function; that rotate is restored here.
    // All arithmetic is unsigned and masked to 32 bits, so long keys and bytes
    // >= 0x80 can no longer produce a negative index.
    static int Hash(const char* c)
    {
        unsigned long ret = 0;
        if ((c == NULL) || (*c == '\0'))
            return 0;

        unsigned long n = 0x100;   // position-dependent high bits, +0x100 per char
        for (; *c; ++c)
        {
            const unsigned long v = n | static_cast<unsigned char>(*c);
            n += 0x100;
            const int r = static_cast<int>(((v >> 2) ^ v) & 0x0f);
            if (r != 0)   // a shift by 32 - 0 would be undefined; rotate by 0 is a no-op
                ret = (ret << r) | (ret >> (32 - r));
            ret &= 0xFFFFFFFFUL;
            ret ^= v * v;
            ret &= 0xFFFFFFFFUL;
        }
        return static_cast<int>(((ret >> 16) ^ ret) % NBARREL);
    }

    // Deletes every node in every bucket and leaves the table empty.
    void FreeTable()
    {
        for (int i = 0; i < NBARREL; i++)
        {
            HashNode* p = m_pNodes[i];
            while (p != NULL)
            {
                HashNode* pnext = p->next;
                delete p;
                p = pnext;
            }
            m_pNodes[i] = NULL;
        }
        m_nCount = 0;
    }

    HashNode* m_pNodes[NBARREL];   // bucket heads
    int m_nCount;                  // total number of stored entries
};

#endif //__HASHTABLE_H__

测试程序如下

// Word-frequency driver: reads whitespace-separated words from a text file,
// counts them in a HashTable and dumps the counts to a report file.
//
//   argv[1] - optional input path  (default "c:\\test.txt")
//   argv[2] - optional output path (default "c:\\stat.txt")
//
// Returns 0 on success, 1 when the input file cannot be opened (in which case
// no report is written).
int main(int argc, char* argv[])
{
    const char* sInput = (argc > 1) ? argv[1] : "c:\\test.txt";
    const char* sOutput = (argc > 2) ? argv[2] : "c:\\stat.txt";

    ifstream f(sInput);
    if (!f)
        return 1;

    HashTable<int, 500> h;
    string s;
    while (f >> s)
    {
        // IncValue inserts the word with count 1 or bumps its existing count.
        // The longer equivalent, which hashes the word several times, is:
        //     if (h.Exists(s.c_str()))
        //         h.SetValue(s.c_str(), h.GetValue(s.c_str()) + 1);
        //     else
        //         h.AddNode(s.c_str(), 1);
        h.IncValue(s.c_str());
    }

    h.Dump(sOutput);
    return 0;
}

随便在google上找了几个英文网页来统计，
发现排第一位的单词是"2007",第二位的居然是"die"

《程序设计实践》一书中说，用素数作为数组的大小是明智的，因为这样能保证在数组大小、散列的乘数和可能的数据值之间不存在公因子。我觉得他特指的是 Java 那样的乘法散列函数：

// Multiplier applied to the running hash for each character.
enum { MULTIPLIER = 37 };

// Multiplicative string hash: h = MULTIPLIER * h + c for every byte of str,
// reduced modulo NHASH.
//
//   str - NUL-terminated string to hash (must not be null).
//
// Returns a value in [0, NHASH). NHASH is not defined in this snippet;
// presumably it is the hash-table size, declared elsewhere.
// Bytes are read as unsigned char so that characters >= 0x80 contribute a
// positive value regardless of whether plain char is signed.
unsigned int hash(const char* str)
{
    unsigned int h = 0;
    const unsigned char* p = reinterpret_cast<const unsigned char*>(str);

    for (; *p != 0; p++)
        h = MULTIPLIER * h + *p;

    return h % NHASH;
}

最后附几个经典字符串哈希函数来自
http://www.oioj.net/blog/user3/28679/archives/2005/166870.shtml

posted on 2007-12-30 15:13 小四 阅读(542) 评论(0) 编辑 收藏 引用 所属分类: 算法与数据结构

只有注册用户登录后才能发表评论。
【推荐】100%开源！大型工业跨平台软件C++源码提供，建模，组态！

相关文章: 关于mp3转ogg DLL窗体中PreTranslateMessage的解决方案 数据结构随笔6(表达式求值) 数据结构随笔5(二叉排序树) 数据结构随笔4(折半查找) 数据结构随笔3(哈希表) 数据结构随笔2(快速排序) 数据结构随笔1(堆栈)

网站导航: 博客园 IT新闻 BlogJava 博问 Chat2DB 管理

常用链接

留言簿(14)

随笔分类

随笔档案

相册

搜索

最新评论

阅读排行榜

评论排行榜