网络爬虫需要从指定的URL通过HTTP协议来获得HTML文件信息,以此从一个URL爬到另一个URL。在Windows平台上,这往往通过WinINet接口实现。
但是,如果对HTTP协议熟悉的话,也可以通过Winsock接口实现。代码如下。
1 #pragma warning (disable:4996)
2
3 #define DEFAULT_URL "http://www.google.com"
4
5 BOOL WinsockStartup(BYTE highVer, BYTE lowVer)
6 {
7 WSADATA wsaData;
8 return WSAStartup(MAKEWORD(highVer, lowVer), &wsaData) == 0;
9 }
10
11 int SendData(SOCKET s, char * data)
12 {
13 return send(s, data, strlen(data), 0);
14 }
15
// Extracts the host part of a URL into pszHostName.
//
// A leading "http://" is skipped if present; the host is everything from
// there up to (not including) the first '/', or to the end of the string
// when there is no '/'.
//
//   pszURL       NUL-terminated URL; not modified.
//   pszHostName  receives the host, always NUL-terminated.
//   cchHostName  capacity of pszHostName in chars, including the terminator.
//                Defaults to 256, the size of the buffer the caller in this
//                file passes. A host that does not fit is truncated.
void ParseTheURL(const char * pszURL, char * pszHostName, size_t cchHostName = 256)
{
    if (pszHostName == NULL || cchHostName == 0)
    {
        return;
    }
    pszHostName[0] = '\0';
    if (pszURL == NULL)
    {
        return;
    }

    static const char szScheme[] = "http://";
    const size_t cchScheme = sizeof(szScheme) - 1;

    // Only a scheme at the very start of the URL counts.
    const char * pHostStart = pszURL;
    if (strncmp(pszURL, szScheme, cchScheme) == 0)
    {
        pHostStart += cchScheme;
    }

    const char * pHostEnd = strchr(pHostStart, '/');
    size_t cchHost = pHostEnd ? static_cast<size_t>(pHostEnd - pHostStart)
                              : strlen(pHostStart);

    // Clamp to the destination and terminate explicitly; do not rely on the
    // caller having zero-filled the buffer.
    if (cchHost > cchHostName - 1)
    {
        cchHost = cchHostName - 1;
    }
    memcpy(pszHostName, pHostStart, cchHost);
    pszHostName[cchHost] = '\0';
}
39
// Minimal HTTP/1.1 client over raw Winsock.
//
// Reads a URL from stdin (falling back to DEFAULT_URL on empty input),
// resolves the host, connects on port 80, sends a GET request with
// "Connection:Close", then prints the response header and body to stdout
// followed by the body size in bytes.
//
// Returns 0 on success, -1 if Winsock start-up, name resolution, connecting
// or sending the request fails. Every exit path waits for one line of input
// so a console window stays open.
int _tmain()
{
    char szURL[256] = { 0 };       // URL typed by the user
    char szHostName[256] = { 0 };  // host part of the URL (zero-filled, so always terminated)
    char szPortName[] = "80";      // port number

    if (!WinsockStartup(2, 2))
    {
        _tcprintf(TEXT("初始化Windows Sockets失败!"));
        cin.getline(szURL, 255);
        return -1;
    }

    // Restrict name resolution to IPv4 / TCP stream sockets.
    addrinfo aiHints = { 0 };
    addrinfo * aiList = NULL;

    aiHints.ai_family = AF_INET;
    aiHints.ai_socktype = SOCK_STREAM;
    aiHints.ai_protocol = IPPROTO_TCP;

    cout<<"输入URL:";
    cin.getline(szURL, 255);

    if (strcmp(szURL, "") == 0)
    {
        strcpy(szURL, DEFAULT_URL);
        cout<<DEFAULT_URL<<endl;
    }

    ParseTheURL(szURL, szHostName);

    // The request line carries only the path (origin-form). Sending the raw
    // input would be invalid when the user typed a bare host name without a
    // scheme.
    const char * pszAfterScheme = (strncmp(szURL, "http://", 7) == 0) ? szURL + 7 : szURL;
    const char * pszPath = strchr(pszAfterScheme, '/');
    if (pszPath == NULL)
    {
        pszPath = "/";
    }

    // Pass the hints; they were previously filled in but never used.
    if (getaddrinfo(szHostName, szPortName, &aiHints, &aiList) != 0)
    {
        _tcprintf_s(TEXT("getaddrinfo失败:%d"), WSAGetLastError());
        WSACleanup();
        cin.getline(szURL, 255);
        return -1;
    }

    // Try each resolved address until one connects.
    SOCKET s = INVALID_SOCKET;
    for (addrinfo * aiPtr = aiList; aiPtr != NULL; aiPtr = aiPtr->ai_next)
    {
        // Use the current entry, not the head of the list.
        s = socket(aiPtr->ai_family, aiPtr->ai_socktype, aiPtr->ai_protocol);
        if (s == INVALID_SOCKET)
        {
            _tcprintf_s(TEXT("socket创建失败:%d"), WSAGetLastError());
            continue;
        }

        if (connect(s, aiPtr->ai_addr, static_cast<int>(aiPtr->ai_addrlen)) == SOCKET_ERROR)
        {
            // Capture the error before closesocket() can overwrite it.
            const int connectError = WSAGetLastError();
            closesocket(s);
            s = INVALID_SOCKET;
            _tcprintf_s(TEXT("connect失败:%d"), connectError);
            continue;
        }
        break;
    }

    freeaddrinfo(aiList);

    if (s == INVALID_SOCKET)
    {
        WSACleanup();
        cin.getline(szURL, 255);
        return -1;
    }

    // Send the request header line by line; stop at the first failure.
    char requestData[512] = { 0 };
    sprintf_s(requestData, sizeof(requestData), "GET %s HTTP/1.1\r\n", pszPath);
    bool bSendOk = SendData(s, requestData) != SOCKET_ERROR;
    sprintf_s(requestData, sizeof(requestData), "Host:%s\r\n", szHostName);
    bSendOk = bSendOk && SendData(s, requestData) != SOCKET_ERROR;
    bSendOk = bSendOk && SendData(s, "Accept: */*\r\n") != SOCKET_ERROR;
    bSendOk = bSendOk && SendData(s, "User-Agent: Mozilla/4.0(compatible; MSIE 5.00; Windows NT)\r\n") != SOCKET_ERROR;
    bSendOk = bSendOk && SendData(s, "Connection:Close\r\n") != SOCKET_ERROR;
    // Exactly one empty line terminates the header section.
    bSendOk = bSendOk && SendData(s, "\r\n") != SOCKET_ERROR;

    if (!bSendOk)
    {
        _tcprintf_s(TEXT("send失败:%d"), WSAGetLastError());
        closesocket(s);
        WSACleanup();
        cin.getline(szURL, 255);
        return -1;
    }

    char buffer[1024] = { 0 };
    int l = 0;
    int chars = 0;      // characters seen so far on the current header line
    BOOL done = FALSE;

    // Print the HTTP response header, one byte at a time, until the empty
    // line that ends it.
    while (!done)
    {
        l = recv(s, buffer, 1, 0);
        if (l <= 0)
        {
            break;  // closed or failed: nothing new in buffer to process
        }
        switch (*buffer)
        {
        case '\r':
            break;
        case '\n':
            if (chars == 0)
                done = TRUE;   // empty line: end of header
            chars = 0;         // start of a new line
            break;
        default:
            ++chars;
            break;
        }
        printf("%c", *buffer);
    }

    // Receive and print the body until the peer closes the connection.
    // fwrite is used so received bytes are never interpreted as a format
    // string and embedded NULs do not cut the output short.
    int sum = 0;
    while ((l = recv(s, buffer, sizeof(buffer), 0)) > 0)
    {
        sum += l;
        fwrite(buffer, 1, static_cast<size_t>(l), stdout);
    }

    // Original author's note: the body size printed here was observed to equal
    // the Content-Length value in the response header, so it can be used to
    // check that the whole body arrived.
    // NOTE(review): that would not hold for a chunked response - confirm.
    printf("\n\n大小 = %d字节\n", sum);

    closesocket(s);
    WSACleanup();

    cin.getline(szURL, 255);
    return 0;
}