C++ Coder

HCP高性能计算架构,实现,编译器指令优化,算法优化, LLVM CLANG OpenCL CUDA OpenACC C++AMP OpenMP MPI

C++博客 首页 新随笔 联系 聚合 管理
  98 Posts :: 0 Stories :: 0 Comments :: 0 Trackbacks

以下代码在win7 home basic , ati hd 5450平台测试通过,处理速度为每秒100万次。

 

程序很简单,只有一个main.cpp程序。Device端只有一个md5.cl文件。

下面我把代码贴出来,因为不能上传附件,我把完整工程包放到了242337476的群共享里面。。。。

 

main.cpp

复制代码
#include "CL\cl.h"

#include <stdio.h>
#include <iostream>

#include <Windows.h>

void main()
{
    cl_int err_code;

    printf("\nLet's rock!!!!!\n");

    //setup platform
    cl_platform_id platIDs[128];
    cl_uint numPlatform;

    if( clGetPlatformIDs(128, platIDs, &numPlatform) != CL_SUCCESS )
    {
        printf("error: clGetPlatformIDs\n");
        return;
    }

    printf("number platforms we found :%d\n", numPlatform);

    for(int i=0; i<numPlatform; i++)
    {
        char buf[500];
        size_t s;
        if(clGetPlatformInfo(platIDs[i],CL_PLATFORM_NAME, 500, buf, &s) != CL_SUCCESS)
        {
            printf("error: clGetPlatformInfo\n");
            return;
        }

        printf("%d:%s\n", i, buf);
    }

    //get device info
    cl_device_id deviceIDs[128];
    cl_uint numDevice;
    if(clGetDeviceIDs(platIDs[0], CL_DEVICE_TYPE_GPU, 128, deviceIDs, &numDevice) != CL_SUCCESS)
    {
        printf("error: clGetDeviceIDs\n");
        return;
    }

    printf("number device we found :%d\n", numDevice);

    for(int i=0; i<numDevice; i++)
    {
        char buf[500]; 
        size_t s;
        if(clGetDeviceInfo(deviceIDs[i], CL_DEVICE_NAME, 500, buf, &s) != CL_SUCCESS)
        {
            printf("error: clGetDeviceInfo\n");
            return;
        }

        printf("Device %d, %s\n", deviceIDs[i], buf);
    }

    //user platform 0//////////////////////
    //create context

    cl_context theContext;

    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platIDs[0], 0 };

    theContext = clCreateContext(cps, 1, deviceIDs, NULL, NULL, &err_code);
    if(err_code != CL_SUCCESS)
    {
        printf("error: clCreateContext\n");
        return;
    }


    //command queue
    cl_command_queue commandQ;

    commandQ = clCreateCommandQueue(theContext, deviceIDs[0], 0, &err_code);
    if(err_code != CL_SUCCESS)
    {
        printf("error: clCreateCommandQueue\n");
        return;
    }


    //program
    //read from file to buf
    FILE *fp;
    fp = fopen("md5.cl", "rb");

    static char source[1024*1024];//1MByte to big to the default stack
    size_t numRead = fread(source, 1, 1024*1024, fp);
    printf("read %d bytes from md5.cl\n", numRead);

    fclose(fp);

    //create program 
    cl_program Prog;

    char* srcarr[1];
    srcarr[0] = source;
    size_t srclen[1];
    srclen[0] = numRead;

    Prog = clCreateProgramWithSource(theContext, 1, (const char **)srcarr, srclen, &err_code);
    if(err_code != CL_SUCCESS)
    {
        printf("error: clCreateProgramWithSource\n");
        return;
    }

    //build program
    if(clBuildProgram(Prog, 1, deviceIDs, "", NULL, NULL) != CL_SUCCESS)
    {

        // Shows the log
        char* build_log;
        size_t log_size;
        // First call to know the proper size
        clGetProgramBuildInfo(Prog, deviceIDs[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        build_log = new char[log_size+1];
        // Second call to get the log
        clGetProgramBuildInfo(Prog, deviceIDs[0], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
        build_log[log_size] = '\0';
        printf("%s\n", build_log);
        delete[] build_log;


        printf("error: clBuildProgram\n");
        return;
    }

    //Create kernel
    cl_kernel K;
    K = clCreateKernel(Prog, "md5_kernel", &err_code);
    if(err_code != CL_SUCCESS)
    {
        printf("error: clCreateKernel\n");
        return;
    }

    //alloc memory and set kernel param
    cl_mem res_d = clCreateBuffer(theContext, CL_MEM_WRITE_ONLY, 1000*16, NULL, &err_code);
    if(err_code != CL_SUCCESS)
    {
        printf("error: clCreateBuffer\n");
        return;
    }

    err_code = clSetKernelArg(K, 0, sizeof(cl_mem), &res_d);
    if(err_code != CL_SUCCESS)
    {
        printf("error: clSetKernelArg\n");
        return;
    }

    const size_t global_ws = 1000;

    printf("Start\n");
    DWORD sTime = GetTickCount();
    for(int i=0; i<10000; i++)
    {

        if(clEnqueueNDRangeKernel(commandQ, K, 1, NULL, &global_ws, 0, 0, NULL, NULL) != CL_SUCCESS)
        {
            printf("error: clEnqueueNDRangeKernel\n");
        }

        clFinish(commandQ);
        printf("%d\r", i);
    }

    DWORD dTime = GetTickCount() - sTime;

    printf("\nFinished in %f second\n", dTime/1000.0f);


    //display result
    char* check = new char[1000*16];
    clEnqueueReadBuffer(commandQ, res_d, CL_TRUE, 0, 1000*16, check, 0, NULL, NULL);

    //*(check + 32) = '\0';
    //printf("Result is : %s\n", check);

    for(int j=0; j<10; j++)
    {
        for (int i = 0; i < 16; i++)
        {
            printf("%02x",(unsigned char)check[j*16 + i]);
        }
        printf("\n");
    }

    printf("\n");


    delete[] check;

    //release
    clReleaseKernel(K);
    clReleaseMemObject(res_d);
    clReleaseCommandQueue(commandQ);
    clReleaseProgram(Prog);
    clReleaseContext(theContext);
    

}
复制代码

 

 

md5.cl

复制代码
// Enter your kernel in this window
/* MD5lib.h - md5 library
 */

/* Copyright (C) 1990-2, RSA Data Security, Inc. Created 1990. All
rights reserved.

RSA Data Security, Inc. makes no representations concerning either
the merchantability of this software or the suitability of this
software for any particular purpose. It is provided "as is"
without express or implied warranty of any kind.

These notices must be retained in any copies of any part of this
documentation and/or software.
 */

/* The following makes MD default to MD5 if it has not already been
  defined with C compiler flags.
 */

#define MD 5

/* GLOBAL.H - RSAREF types and constants
 */

/* PROTOTYPES should be set to one if and only if the compiler supports
  function argument prototyping.
  The following makes PROTOTYPES default to 0 if it has not already
  been defined with C compiler flags.
 */
#ifndef PROTOTYPES
#define PROTOTYPES 0
#endif

/* POINTER defines a generic pointer type */
typedef unsigned char *POINTER;

/* UINT2 defines a two byte word */
typedef unsigned short UINT2;

/* UINT4 defines a four byte word */
typedef unsigned int UINT4;

/* PROTO_LIST is defined depending on how PROTOTYPES is defined above.
If using PROTOTYPES, then PROTO_LIST returns the list, otherwise it
  returns an empty list.
 */
#if PROTOTYPES
#define PROTO_LIST(list) list
#else
#define PROTO_LIST(list) ()
#endif


 /* Length of test block, number of test blocks.
 */
#define TEST_BLOCK_LEN 1000
#define TEST_BLOCK_COUNT 1000

  

/* Constants for MD5Transform routine.
 */
#define S11 7
#define S12 12
#define S13 17
#define S14 22
#define S21 5
#define S22 9
#define S23 14
#define S24 20
#define S31 4
#define S32 11
#define S33 16
#define S34 23
#define S41 6
#define S42 10
#define S43 15
#define S44 21

char* MDString PROTO_LIST ((char *));
char* MDFile PROTO_LIST ((char *));
char* hmac_md5(char* text, char* key);

typedef struct {
  UINT4 state[4];                                   /* state (ABCD) */
  UINT4 count[2];        /* number of bits, modulo 2^64 (lsb first) */
  unsigned char buffer[64];                         /* input buffer */
} MD5_CTX;

/*void MD5Init PROTO_LIST ((MD5_CTX *));
void MD5Update PROTO_LIST
  ((MD5_CTX *, unsigned char *, unsigned int));
void MD5Final PROTO_LIST ((unsigned char [16], MD5_CT X *));

void MD5Transform PROTO_LIST ((UINT4 [4], unsigned char [64]));
void Encode PROTO_LIST
  ((unsigned char *, UINT4 *, unsigned int));
void Decode PROTO_LIST
  ((UINT4 *, unsigned char *, unsigned int));
void MD5_memcpy PROTO_LIST ((POINTER, POINTER, unsigned int));
void MD5_memset PROTO_LIST ((POINTER, int, unsigned int));
*/
/*__global unsigned char PADDING[64] = {
  0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};*/

/* F, G, H and I are basic MD5 functions.
 */
#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
#define H(x, y, z) ((x) ^ (y) ^ (z))
#define I(x, y, z) ((y) ^ ((x) | (~z)))

/* ROTATE_LEFT rotates x left n bits.
 */
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))

/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
Rotation is separate from addition to prevent recomputation.
 */
#define FF(a, b, c, d, x, s, ac) {(a)+=F((b), (c), (d)) + (x) + (UINT4)(ac);(a)= ROTATE_LEFT((a),(s)); (a)+=(b);}
#define GG(a, b, c, d, x, s, ac) {(a) += G ((b), (c), (d)) + (x) + (UINT4)(ac);(a) = ROTATE_LEFT ((a), (s)); (a) += (b);}
#define HH(a, b, c, d, x, s, ac) {(a) += H ((b), (c), (d)) + (x) + (UINT4)(ac);(a) = ROTATE_LEFT ((a), (s)); (a) += (b);}
#define II(a, b, c, d, x, s, ac) {(a) += I ((b), (c), (d)) + (x) + (UINT4)(ac);(a) = ROTATE_LEFT ((a), (s)); (a) += (b);}
void MD5Init (MD5_CTX *context);
void MD5Update(MD5_CTX *context, unsigned char *input,unsigned int inputLen);
  
void MD5Final (unsigned char digest[16], MD5_CTX *context);
void MD5Transform  (UINT4 [4], unsigned char [64]) ;
void Encode(unsigned char *, UINT4 *, unsigned int);
void Decode (UINT4 *, unsigned char *, unsigned int);
void MD5_memcpy(POINTER, POINTER, unsigned int);
void MD5_memset(POINTER, int, unsigned int);

/* MD5 initialization. Begins an MD5 operation, writing a new context.
 */
void MD5Init (MD5_CTX *context)
                                        /* context */
{
  context->count[0] = context->count[1] = 0;
  /* Load magic initialization constants.
*/
  context->state[0] = 0x67452301;
  context->state[1] = 0xefcdab89;
  context->state[2] = 0x98badcfe;
  context->state[3] = 0x10325476;
}

/* MD5 block update operation. Continues an MD5 message-digest
  operation, processing another message block, and updating the
  context.
 */
void MD5Update (MD5_CTX *context, unsigned char *input,unsigned int inputLen  )
                                         /* context */
                              /* input block */
                     /* length of input block */
{
  unsigned int i, index, partLen;

  /* Compute number of bytes mod 64 */
  index = (unsigned int)((context->count[0] >> 3) & 0x3F);

  /* Update number of bits */
  if ((context->count[0] += ((UINT4)inputLen << 3))
  < ((UINT4)inputLen << 3))
 context->count[1]++;
  context->count[1] += ((UINT4)inputLen >> 29);

  partLen = 64 - index;

  /* Transform as many times as possible.
*/
  if (inputLen >= partLen) {
 MD5_memcpy
   ((POINTER)&context->buffer[index], (POINTER)input, partLen);
 MD5Transform (context->state, context->buffer);

 for (i = partLen; i + 63 < inputLen; i += 64)
   MD5Transform (context->state, &input[i]);

 index = 0;
  }
  else
 i = 0;

  /* Buffer remaining input */
  MD5_memcpy
 ((POINTER)&context->buffer[index], (POINTER)&input[i],
  inputLen-i);
}

/* MD5 finalization. Ends an MD5 message-digest operation, writing the
  the message digest and zeroizing the context.
 */
void MD5Final (unsigned char digest[16], MD5_CTX *context)
                       /* message digest */
                                        /* context */
{
  unsigned char bits[8];
  unsigned int index, padLen;

  /* Save number of bits */
  Encode (bits, context->count, 8);

  /* Pad out to 56 mod 64.
*/
  index = (unsigned int)((context->count[0] >> 3) & 0x3f);
  padLen = (index < 56) ? (56 - index) : (120 - index);




  unsigned char PADDING[64] = {
  0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };

  MD5Update (context,(unsigned char*) PADDING, padLen);

  /* Append length (before padding) */
  MD5Update (context, bits, 8);
  /* Store state in digest */
  Encode (digest, context->state, 16);

  /* Zeroize sensitive information.
*/
  MD5_memset ((POINTER)context, 0, sizeof (*context));
}

/* MD5 basic transformation. Transforms state based on block.
 */
static void MD5Transform (UINT4 state[4],
unsigned char block[64])
 
{
 
 UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];

  Decode (x, block, 64);

  /* Round 1 */
  FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
  FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
  FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
  FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
  FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
  FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
  FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
  FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
  FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
  FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
  FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
  FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
  FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
  FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
  FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
  FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */

 /* Round 2 */
  GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
  GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
  GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
  GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
  GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
  GG (d, a, b, c, x[10], S22,  0x2441453); /* 22 */
  GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
  GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
  GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
  GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
  GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
  GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
  GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
  GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
  GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
  GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */

  /* Round 3 */
  HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
  HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
  HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
  HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
  HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
  HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
  HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
  HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
  HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
  HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
  HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
  HH (b, c, d, a, x[ 6], S34,  0x4881d05); /* 44 */
  HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
  HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
  HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
  HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */

  /* Round 4 */
  II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
  II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
  II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
  II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
  II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
  II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
  II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
  II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
  II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
  II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
  II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
  II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
  II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
  II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
  II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
  II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */

  state[0] += a;
  state[1] += b;
  state[2] += c;
  state[3] += d;

  /* Zeroize sensitive information.
  */
  MD5_memset ((POINTER)x, 0, sizeof (x));
}

/* Encodes input (UINT4) into output (unsigned char). Assumes len is
  a multiple of 4.
 */
void Encode (unsigned char *output,
UINT4 *input,
unsigned int len)
 
{
  unsigned int i, j;

  for (i = 0, j = 0; j < len; i++, j += 4) {
 output[j] = (unsigned char)(input[i] & 0xff);
 output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
 output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
 output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
  }
}

/* Decodes input (unsigned char) into output (UINT4). Assumes len is
  a multiple of 4.
 */
void Decode (UINT4 *output,
unsigned char *input,
unsigned int len)
 
{
  unsigned int i, j;

  for (i = 0, j = 0; j < len; i++, j += 4)
 output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) |
   (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24);
}

/* Note: Replace "for loop" with standard memcpy if possible.
 */

void MD5_memcpy (POINTER output,
POINTER input,
unsigned int len)
 
{
  unsigned int i;

  for (i = 0; i < len; i++)
  output[i] = input[i];
}

/* Note: Replace "for loop" with standard memset if possible.
 */
void MD5_memset (POINTER output,
int value,
unsigned int len)
 
{
  unsigned int i;

  for (i = 0; i < len; i++)
 ((char *)output)[i] = (char)value;
}


void myitoa(int i, char* string)
{
        int power, j;
        j=i; 
        for (power=1;j>=10;j/=10) 
            power*=10; 
        for (;power>0;power/=10)
        {
            *string++='0'+i/power; i%=power; 
        }
        *string='\0';
}



__kernel void md5_kernel (__global char* res)
{
    int idx = get_global_id(0);
    idx = idx%1000;
    char src[32];

    for(int i=0; i<32; i++)
    {
        src[i] = 0;
    }

    myitoa(idx, src);

    int len;
    for(int i=0; i<32; i++)
    {
        if(src[i] == 0)
        {
            len = i;
            break;
        }
    }

    unsigned char digest[16];

    MD5_CTX context;
    MD5Init (&context);
    MD5Update (&context, (unsigned char*)src, len);
    MD5Final (digest, &context);

    for(int i=0; i<16; i++)
   {
        *(res + idx*16 + i) = digest[i];
   }
}


http://www.cnblogs.com/leiben/archive/2012/06/07/2540330.html
posted on 2012-12-27 10:46 jackdong 阅读(412) 评论(0)  编辑 收藏 引用 所属分类: OpenCL

只有注册用户登录后才能发表评论。
网站导航: 博客园   IT新闻   BlogJava   知识库   博问   管理