Skip to content

Commit

Permalink
不再支持持久化存储,这个功能在 client 端实现比较合适
Browse files Browse the repository at this point in the history
  • Loading branch information
huichen committed Feb 21, 2016
1 parent 6ff8ddf commit 0a23aa3
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 178 deletions.
9 changes: 3 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,9 @@
* 服务自动发现和注册(基于 etcd 和 registrator)
* 负载均衡 + 一致性哈希
* 服务端客户端通信基于 gRPC,支持多种编程语言的客户端
* 可设置抓取超时、页面重新抓取频率等
* 支持 GET、HEAD、POST、POSTFORM 四种方法
* 可设置抓取超时
* 支持 GET、HEAD、POST 方法
* 支持自定义 header
* 基于 docker volume 和文件系统的持久化存储

### 如何部署

Expand Down Expand Up @@ -44,13 +43,11 @@ docker run -d --name=registrator --net=host --volume=/var/run/docker.sock:/tmp/d
这会生成 unmerged/zerg 镜像。然后在集群的每台服务器上启动容器:

```
docker run -d -P -v /opt/zerg_cache:/cache unmerged/zerg
docker run -d -P unmerged/zerg
```

registrator 会自动注册这些服务到 etcd。如果单机有多个 IP,你可以单机启动多个容器,并用 -p 参数(例如 -p <IP>::5000)为每个容器分别指定 IP。

抓取的页面内容会通过 docker volume 存储在 /opt/zerg_cache 目录下。

#### 第四步:调用样例代码

进入 example 目录,运行
Expand Down
10 changes: 4 additions & 6 deletions example/single_machine_crawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@ import (
var (
address = flag.String("address", ":50051", "服务器地址")
url = flag.String("url", "", "URL")
ttl = flag.Int64("ttl", 0, "重新抓取TTL")
method = flag.String("method", "GET", "HTTP 请求类型:GET HEAD POST POSTFORM")
method = flag.String("method", "GET", "HTTP 请求类型:GET HEAD POST")
)

func main() {
Expand All @@ -27,10 +26,9 @@ func main() {

log.Printf("开始抓取")
request := pb.CrawlRequest{
Url: *url,
Timeout: 10000,
RecrawlTtl: *ttl,
Method: pb.Method(pb.Method_value[*method]),
Url: *url,
Timeout: 10000,
Method: pb.Method(pb.Method_value[*method]),
}
response, err := client.Crawl(context.Background(), &request)
if err != nil {
Expand Down
12 changes: 5 additions & 7 deletions example/zerg_crawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,10 @@ import (
)

var (
url = flag.String("url", "", "URL")
ttl = flag.Int64("ttl", 0, "重新抓取TTL")
endPoints = flag.String("endpoints", "", "半角逗号分隔的 etcd 接入点列表,每个接入点地址以 http:// 开始")
serviceName = flag.String("service_name", "/services/zerg", "zerg 服务名")
method = flag.String("method", "GET", "HTTP 请求类型:GET HEAD POST POSTFORM")
method = flag.String("method", "GET", "HTTP 请求类型:GET HEAD POST")
url = flag.String("url", "", "URL")
)

func main() {
Expand All @@ -27,10 +26,9 @@ func main() {

// 调用 zerg 服务
request := pb.CrawlRequest{
Url: *url,
Timeout: 10000, // 超时 10 秒
RecrawlTtl: *ttl,
Method: pb.Method(pb.Method_value[*method]),
Url: *url,
Timeout: 10000, // 超时 10 秒
Method: pb.Method(pb.Method_value[*method]),
}
response, err := zc.Crawl(&request)
if err != nil {
Expand Down
103 changes: 39 additions & 64 deletions protos/crawl.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 0 additions & 12 deletions protos/crawl.proto
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,6 @@ message CrawlRequest {
// 抓取超时限制,单位毫秒,设为 0 时无超时
int64 timeout = 2;

bool use_cache = 3;

// 单位毫秒,当页面抓取时间比当前时间早超过 recrawl_ttl(含)时重新抓取,当该值为 0 时始终重新抓取
// 仅对 GET 和 HEAD 两类请求有效,POST 和 POSTFORM 始终重新抓取
int64 recrawl_ttl = 4;

// 是否仅返回 metadata 而忽略 content
bool only_return_metadata = 5;

Expand All @@ -33,16 +27,12 @@ message CrawlRequest {
// POST body,仅当请求类型为 POST 时有效
string post_body = 8;
string body_type = 9;

// POST form,仅当请求类型为 POSTFORM 时有效
repeated KV form_values = 10;
}

enum Method {
GET = 0;
HEAD = 1;
POST = 2;
POSTFORM = 3;
}

message KV {
Expand All @@ -53,12 +43,10 @@ message KV {
message CrawlResponse {
Metadata metadata = 1;
string content = 2;
bool is_fresh_crawl = 3; // 是否是在这次请求中抓取的
}

message Metadata {
uint32 length = 1;
int64 last_crawl_timestamp = 2;
repeated KV header = 3;
string status = 4;
int32 status_code = 5;
Expand Down
2 changes: 1 addition & 1 deletion service_container/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM busybox
EXPOSE 5000
ADD / /
CMD ./service --page_cache_dir /cache --address :5000
CMD ./service --address :5000
Loading

0 comments on commit 0a23aa3

Please sign in to comment.