larbin源代码分析<三> url类分析
一 分析utils包中的url类
该类代表实际中的一个url,成员变量主要有 ,char * file ,char * host , uint16_t port , int8_t depth, char * cookie
还有一个public 属性的 in_addr 表示一个ipv4的地址。
成员函数中主要有一些,比如构造函数,返回url,添加cookie等操作。
二 实例代码如下
// Larbin
// Sebastien Ailleret
// 15-11-99 -> 14-03-02
data:image/s3,"s3://crabby-images/13de6/13de6130588e8a001331bf125b484ea2f97d951e" alt=""
data:image/s3,"s3://crabby-images/f86b7/f86b7e502a0580d5e24db72fe38f81dda2bc052d" alt=""
/**//* This class describes an URL */
data:image/s3,"s3://crabby-images/13de6/13de6130588e8a001331bf125b484ea2f97d951e" alt=""
#ifndef URL_H
#define URL_H
data:image/s3,"s3://crabby-images/13de6/13de6130588e8a001331bf125b484ea2f97d951e" alt=""
#include <netinet/in.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <stdlib.h>
data:image/s3,"s3://crabby-images/13de6/13de6130588e8a001331bf125b484ea2f97d951e" alt=""
#include "types.h"
data:image/s3,"s3://crabby-images/13de6/13de6130588e8a001331bf125b484ea2f97d951e" alt=""
bool fileNormalize (char *file);
data:image/s3,"s3://crabby-images/13de6/13de6130588e8a001331bf125b484ea2f97d951e" alt=""
data:image/s3,"s3://crabby-images/f86b7/f86b7e502a0580d5e24db72fe38f81dda2bc052d" alt=""
class url
{
private:
char *host;
char *file;
uint16_t port; // the order of variables is important for physical size
int8_t depth;
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* parse the url */
void parse (char *s);
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//** parse a file with base */
void parseWithBase (char *u, url *base);
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* normalize file name */
bool normalize (char *file);
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* Does this url starts with a protocol name */
bool isProtocol (char *s);
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* constructor used by giveBase */
url (char *host, uint port, char *file);
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
public:
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* Constructor : Parses an url (u is deleted) */
url (char *u, int8_t depth, url *base);
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* constructor used by input */
url (char *line, int8_t depth);
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* Constructor : read the url from a file (cf serialize) */
url (char *line);
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* Destructor */
~url ();
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* inet addr (once calculated) */
struct in_addr addr;
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* Is it a valid url ? */
bool isValid ();
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* print an URL */
void print ();
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* return the host */
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
inline char *getHost ()
{ return host; }
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* return the port */
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
inline uint getPort ()
{ return port; }
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* return the file */
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
inline char *getFile ()
{ return file; }
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//** Depth in the Site */
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
inline int8_t getDepth ()
{ return depth; }
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* Set depth to max if we are at an entry point in the site
* try to find the ip addr
* answer false if forbidden by robots.txt, true otherwise */
bool initOK (url *from);
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//** return the base of the url
* give means that you have to delete the string yourself
*/
url *giveBase ();
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//** return a char * representation of the url
* give means that you have to delete the string yourself
*/
char *giveUrl ();
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//** write the url in a buffer
* buf must be at least of size maxUrlSize
* returns the size of what has been written (not including '\0')
*/
int writeUrl (char *buf);
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* serialize the url for the Persistent Fifo */
char *serialize ();
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* very thread unsafe serialisation in a static buffer */
char *getUrl();
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* return a hashcode for the host of this url */
uint hostHashCode ();
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* return a hashcode for this url */
uint hashCode ();
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
#ifdef URL_TAGS
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* tag associated to this url */
uint tag;
#endif // URL_TAGS
data:image/s3,"s3://crabby-images/6c6b8/6c6b84e662455f8092d9c42e3a86036cd3a28be1" alt=""
#ifdef COOKIES
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
/**//* cookies associated with this page */
char *cookie;
void addCookie(char *header);
#else // COOKIES
data:image/s3,"s3://crabby-images/db282/db282e9ea79ad6a7617774c9b676a45b33d46480" alt=""
inline void addCookie(char *header)
{}
#endif // COOKIES
};
data:image/s3,"s3://crabby-images/13de6/13de6130588e8a001331bf125b484ea2f97d951e" alt=""
#endif // URL_H
三 代码分析
url中的实现类主要是,创建url,其中创建规则如下:
http://www.hach.com/r/0343/ttt.html
则host为www.hach.com , file 为/r/0343/ttt.html
url的构造函数即根据 上述规则构建 url类。 若是含有base url 则新的url的file为 base->file + 新url 的file。
(2)另外url的hash函数即是 利用了 file 字符串和 host字符串。
/* return a hashcode for this url */
uint url::hashCode () {
unsigned int h=port;
unsigned int i=0;
while (host[i] != 0) {
h = 31*h + host[i];
i++;
}
i=0;
while (file[i] != 0) {
h = 31*h + file[i];
i++;
}
return h % hashSize;
}
(3) cookie的处理函数如下
若addCookie(char * head) 中的head字符串是以 set-cookie: 开始的,则将head之后的12个字符
添加到cookie变量中。
四综上:
url 类中的成员变量,char * file ,char * host , port , cookie 能够表示一个url。
并且url类中提供了解析函数,使用户可以根据从网页中爬取的url构造url类对象。