This article looks at one corner of analytical database internals: how a hash join is implemented. It walks through both the in-memory variant and the on-disk (partitioned) variant, with a small C program you can run yourself.

Hash join, in-memory case: if memory is sufficient, first scan the inner table and build a hash table from it. Then scan the outer table, computing the hash code of each join key; when that code hits a bucket, walk the bucket's chain of entries sharing the same hash code and return every entry whose value actually equals the key.
If memory is insufficient, scan both tables and use the same hash function to split each of them into N hash "partitions" spilled to disk. Then, partition by partition, scan each inner partition against the corresponding outer partition and return the rows whose join keys match; this partitioned scheme is the classic Grace hash join. The in-memory flow is sketched just below; the partitioned flow appears in the full listing after it.
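Before files get involved, the build/probe idea is easiest to see on plain arrays. Here is a minimal, self-contained sketch of the in-memory case; the bucket count NBUCKETS, the chain capacity MAX_CHAIN, and the sample data are illustrative assumptions, not taken from the program below.

#include <stdio.h>

#define NBUCKETS  10   /* illustrative bucket count */
#define MAX_CHAIN 16   /* illustrative per-bucket capacity */

static int hashcode(int key) { return key % NBUCKETS; }

int main(void)
{
    int inner[] = {1, 3, 5, 20, 234};
    int outer[] = {1, 2, 3, 4, 20, 234, 999};
    int ht[NBUCKETS][MAX_CHAIN];
    int len[NBUCKETS] = {0};

    /* Build phase: insert every inner key into its hash bucket. */
    for (size_t i = 0; i < sizeof(inner) / sizeof(inner[0]); i++) {
        int h = hashcode(inner[i]);
        if (len[h] < MAX_CHAIN)
            ht[h][len[h]++] = inner[i];
    }

    /* Probe phase: hash each outer key, then compare it against
     * every entry chained in that bucket. */
    for (size_t i = 0; i < sizeof(outer) / sizeof(outer[0]); i++) {
        int h = hashcode(outer[i]);
        for (int j = 0; j < len[h]; j++)
            if (ht[h][j] == outer[i])
                printf("match: bucket %d, value %d\n", h, outer[i]);
    }
    return 0;
}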

The details are in the code comments below.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>   /* for strcmp */
#include "hash_join.h"

#define MAX_ELEMENTS 1024

/* Compute the hash code of a key. */
static int generate_hashcode(int n)
{
    return n % HASH_BUCKET;
}

/* Generate the hash buckets for one table; each bucket is simulated by
 * a file on disk, and the tag distinguishes inner from outer buckets. */
static int generate_bucket(FILE *file, char *tag)
{
    printf("-----------generate_bucket----------\n");
    char buf[MAX_BYTES];
    FILE *fd = NULL;
    while (!feof(file)) {
        int x = read_int(file, buf);
        if (x == 0)
            break;
        int hashcode = generate_hashcode(x);
        char filename[MAX_BYTES];  /* [30] in the original overflows on this path */
        sprintf(filename, "/cygdrive/d/tmp/hash/%s_%d.csv", tag, hashcode);
        // printf("Hashcode is %d, bucket filename is %s.\n", hashcode, filename);
        fd = fopen(filename, "a");
        if (fd == NULL) {
            printf("Cannot open file %s.\n", filename);
            return 0;
        }
        /* Append the value to its bucket file. */
        write_int(fd, x);
        fclose(fd);
    }
    return 1;
}

/* Load the inner table's buckets into memory; used when memory is
 * sufficient. A 2-D array simulates the hash table: dimension 1 is the
 * bucket number, dimension 2 the data inside that bucket. */
static int load_hashtable(int ht[][MAX_ELEMENTS])
{
    printf("-----------load_hashtable----------\n");
    for (int i = 0; i < HASH_BUCKET; i++) {  /* loop over bucket numbers */
        char filename[MAX_BYTES];
        /* Read the bucket file for this number. */
        sprintf(filename, "/cygdrive/d/tmp/hash/inner_%d.csv", i);
        FILE *fd = fopen(filename, "r");
        if (fd == NULL) {
            // printf("Cannot open file: %s\n", filename);
            continue;
        }
        int j = 0;
        char buf[MAX_BYTES];
        while (!feof(fd) && j < MAX_ELEMENTS) {
            /* Copy the file contents into the array. */
            int x = read_int(fd, buf);
            ht[i][j++] = x;
        }
        fclose(fd);
    }
    return 1;
}

/* Hash join with the hash table built in memory. */
static void hash_join_onmemory(FILE *outerfile, FILE *innerfile)
{
    printf("-----------hash_join_onmemory----------\n");
    int ht[HASH_BUCKET][MAX_ELEMENTS];
    char buffer[MAX_BYTES];
    int flag = 0;
    memset(ht, 0, sizeof(ht));  /* the original probed uninitialized slots */
    /* Create the hash bucket files. */
    flag = generate_bucket(innerfile, "inner");
    if (!flag) {
        printf("Cannot generate bucket file!\n");
        return;
    }
    /* Load them into the hash table (the 2-D array). */
    flag = load_hashtable(ht);
    if (!flag) {
        printf("Cannot load hashtable!\n");
        return;
    }
    /* Scan the second file and perform the JOIN. */
    while (!feof(outerfile)) {
        int outer = read_int(outerfile, buffer);
        if (outer == 0)  /* 0 marks end of data, as in the disk variant */
            continue;
        /* Compute the hash code, then walk that bucket for matches. */
        int hashcode = generate_hashcode(outer);
        for (int i = 0; i < MAX_ELEMENTS; i++) {
            if (ht[hashcode][i] == outer)
                printf("Found one, hash bucket is %d, value is: %d.\n", hashcode, outer);
        }
    }
}

/* Hash join using disk files as the cache. */
static void hash_join_ondisk(FILE *outerfile, FILE *innerfile)
{
    printf("-----------hash_join_ondisk----------\n");
    char buffer[MAX_BYTES];
    int flag = 0;
    /* Create the hash "bucket" files for both tables. */
    flag = generate_bucket(innerfile, "inner");
    if (!flag) {
        printf("Cannot generate inner bucket file!\n");
        return;
    }
    flag = generate_bucket(outerfile, "outer");
    if (!flag) {
        printf("Cannot generate outer bucket file!\n");
        return;
    }
    /* Join each pair of files sharing a hash value, starting at bucket 0. */
    for (int i = 0; i < HASH_BUCKET; i++) {
        char innerfname[MAX_BYTES];
        char outerfname[MAX_BYTES];
        /* Open the two partition files for this bucket. */
        sprintf(innerfname, "/cygdrive/d/tmp/hash/%s_%d.csv", "inner", i);
        sprintf(outerfname, "/cygdrive/d/tmp/hash/%s_%d.csv", "outer", i);
        FILE *fd_inner = fopen(innerfname, "r");
        if (fd_inner == NULL) {
            // printf("Cannot open file: %s\n", innerfname);
            continue;
        }
        FILE *fd_outer = fopen(outerfname, "r");
        if (fd_outer == NULL) {
            fclose(fd_inner);  /* the original leaked this handle */
            continue;
        }
        while (!feof(fd_outer)) {
            int v_out = read_int(fd_outer, buffer);
            if (v_out == 0)
                continue;
            while (!feof(fd_inner)) {
                int v_in = read_int(fd_inner, buffer);
                if (v_in == 0)
                    continue;
                if (v_out == v_in)
                    printf("Found one, hash bucket is %d, value is: %d.\n", i, v_out);
            }
            /* Rescan the inner partition for the next outer value. */
            rewind(fd_inner);
        }
        fclose(fd_inner);
        fclose(fd_outer);
    }
}

/* Perform the hash join. */
void hash_join(char *file1, char *file2, char *flag)
{
    printf("-----------hash join----------\n");
    FILE *outerfile = fopen(file1, "r");
    if (outerfile == NULL) {
        printf("Cannot open file %s.\n", file1);
        return;
    }
    /* Open the second file. */
    FILE *innerfile = fopen(file2, "r");
    if (innerfile == NULL) {
        printf("Cannot open file %s.\n", file2);
        fclose(outerfile);
        return;
    }
    /* Perform the JOIN. */
    if (strcmp(flag, "memory") == 0)
        hash_join_onmemory(outerfile, innerfile);
    else
        hash_join_ondisk(outerfile, innerfile);
    /* Close the files. */
    fclose(outerfile);
    fclose(innerfile);
}
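The listing includes hash_join.h, which the original article does not show. A plausible minimal version is sketched below. HASH_BUCKET = 128 can be inferred from the sample output (value 234 lands in bucket 106, and 234 % 128 == 106); the MAX_BYTES value and the read_int/write_int bodies, which assume one integer per line of text, are assumptions rather than the author's code.

/* hash_join.h -- reconstructed sketch, not the original header. */
#ifndef HASH_JOIN_H
#define HASH_JOIN_H

#include <stdio.h>
#include <stdlib.h>

#define HASH_BUCKET 128   /* inferred from the output: 234 % 128 == 106 */
#define MAX_BYTES   1024  /* generic buffer size (assumed) */

void hash_join(char *file1, char *file2, char *flag);

/* Read one integer per line; 0 doubles as the end-of-data marker. */
static inline int read_int(FILE *file, char *buf)
{
    if (fgets(buf, MAX_BYTES, file) == NULL)
        return 0;
    return atoi(buf);
}

/* Append one integer as a line of text. */
static inline void write_int(FILE *file, int x)
{
    fprintf(file, "%d\n", x);
}

#endif /* HASH_JOIN_H */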

Run output

$ cat file1.csv
1
2
3
4
5
1
234
29
39900
2
20
$ cat file2.csv
11
20
340
5550
234
3
390
1
$ /cygdrive/d/tmp/test.exe file1.csv file2.csv
-------------use memory----------
-----------hash join----------
-----------hash_join_onmemory----------
-----------generate_bucket----------
-----------load_hashtable----------
Found one, hash bucket is 1, value is: 1.
Found one, hash bucket is 3, value is: 3.
Found one, hash bucket is 1, value is: 1.
Found one, hash bucket is 106, value is: 234.
Found one, hash bucket is 20, value is: 20.
-------------use disk----------
-----------hash join----------
-----------hash_join_ondisk----------
-----------generate_bucket----------
-----------generate_bucket----------
Found one, hash bucket is 1, value is: 1.
Found one, hash bucket is 1, value is: 1.
Found one, hash bucket is 3, value is: 3.
Found one, hash bucket is 20, value is: 20.
Found one, hash bucket is 106, value is: 234.
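The original does not show the main() behind test.exe. A sketch consistent with the "use memory" / "use disk" banners above might look like the following; the banner strings, the argument handling, and especially the directory cleanup are assumptions.

#include <stdio.h>
#include <stdlib.h>
#include "hash_join.h"

/* Hypothetical driver: run the join once in memory, once on disk. */
int main(int argc, char *argv[])
{
    if (argc < 3) {
        printf("Usage: %s <outer.csv> <inner.csv>\n", argv[0]);
        return 1;
    }
    /* Assumed cleanup: generate_bucket appends to its bucket files, so
     * stale partitions from a previous pass would duplicate matches. */
    system("rm -f /cygdrive/d/tmp/hash/*.csv");
    printf("-------------use memory----------\n");
    hash_join(argv[1], argv[2], "memory");
    system("rm -f /cygdrive/d/tmp/hash/*.csv");
    printf("-------------use disk----------\n");
    hash_join(argv[1], argv[2], "disk");
    return 0;
}

Note that the cleanup between the two passes matters: because the bucket files are opened in append mode, running the disk pass on top of the memory pass's leftover inner buckets would report every match twice.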

That concludes this look at how a hash join is implemented: build and probe when the hash table fits in memory, partition both sides to disk when it does not. Thanks for reading.