步骤1:准备好 Eclipse、Eclipse 的 SVN 插件以及 MySQL,其中 MySQL 使用 UTF-8 编码
步骤2:mysql建库,建表:
-- Create the schema for the crawl data. utf8mb4 is set as the database
-- default so it agrees with the charset declared on the webpage table.
CREATE DATABASE IF NOT EXISTS nutch DEFAULT CHARACTER SET utf8mb4;

-- Select the new schema first; otherwise the table below is created in
-- whatever database the session currently uses, or the statement fails
-- with "No database selected".
USE nutch;

-- Table `webpage`: text-like columns use the table default charset
-- (utf8mb4) except `typ`, which is explicitly latin1; binary payloads are
-- stored in BLOB-family columns.
-- NOTE(review): presumably the column names must match the field mapping
-- used by the crawler's SQL storage layer -- confirm before renaming any.
--
-- `id` is the primary key. Under utf8mb4 a varchar(767) key can need up to
-- 767 * 4 = 3068 bytes, which is above InnoDB's classic 767-byte index key
-- prefix limit. It is only accepted when large index prefixes are available
-- (innodb_large_prefix with the Barracuda file format, which
-- ROW_FORMAT=COMPRESSED also requires, together with innodb_file_per_table).
-- If MySQL rejects the statement with a "key was too long" error, enable
-- those settings or shorten `id`.
CREATE TABLE `webpage` (
`id` varchar(767) NOT NULL,
`headers` blob,
`text` mediumtext DEFAULT NULL,
`status` int(11) DEFAULT NULL,
`markers` blob,
`parseStatus` blob,
`modifiedTime` bigint(20) DEFAULT NULL,
`score` float DEFAULT NULL,
`typ` varchar(32) CHARACTER SET latin1 DEFAULT NULL,
`baseUrl` varchar(767) DEFAULT NULL,
`content` longblob,
`title` varchar(2048) DEFAULT NULL,
`reprUrl` varchar(767) DEFAULT NULL,
`fetchInterval` int(11) DEFAULT NULL,
`prevFetchTime` bigint(20) DEFAULT NULL,
`inlinks` mediumblob,
`prevSignature` blob,
`outlinks` mediumblob,
`fetchTime` bigint(20) DEFAULT NULL,
`retriesSinceFetch` int(11) DEFAULT NULL,
`protocolStatus` blob,
`signature` blob,
`metadata` blob,
PRIMARY KEY (`id`)
) ENGINE=InnoDB
ROW_FORMAT=COMPRESSED
DEFAULT CHARSET=utf8mb4;
注意:`id` varchar(767) NOT NULL 这一列在我本机上无法创建成功(很可能是主键索引长度超出了 InnoDB 的限制),实测最大只能设置为 100,所以改为:`id` varchar(100) NOT NULL
步骤3:从 https://svn.apache.org/repos/asf/nutch/tags/release-2.1 拉下代码,在本地创建Java project。本人因为试验过很多次,所以在此取项目名称为test。
步骤4:加src文件
在 Project Explorer 中右击项目,选择 Properties,进入 Java Build Path。在 Source 选项卡中先删除默认的 src 文件夹,再点击“Add Folder”,把 conf、src/bin、src/java、src/test、src/testresources,以及 src/plugin 文件夹下各个插件的 src/java 和 src/test 文件夹也加入进来。最终可以看到如下界面(test 为项目名称):
每个 Eclipse 项目的根目录下都有一个 .classpath 文件,打开该文件可以看到,其内容大致如下:
<!--
  Source-folder entries (kind="src") of the Eclipse .classpath file.
  Paths are relative to the project root: conf, src/java, src/test, src/bin
  and src/testresources, plus the src/java and/or src/test folder of the
  individual plugins under src/plugin.
  Only the source entries are listed in this excerpt; the enclosing root
  element and any other entry kinds are not shown here.
-->
<classpathentry kind="src" path="conf"/>
<classpathentry kind="src" path="src/java"/>
<classpathentry kind="src" path="src/test"/>
<classpathentry kind="src" path="src/plugin/protocol-file/src/test"/>
<classpathentry kind="src" path="src/plugin/protocol-httpclient/src/test"/>
<classpathentry kind="src" path="src/plugin/subcollection/src/test"/>
<classpathentry kind="src" path="src/plugin/parse-html/src/test"/>
<classpathentry kind="src" path="src/plugin/urlfilter-automaton/src/test"/>
<classpathentry kind="src" path="src/plugin/parse-html/src/java"/>
<classpathentry kind="src" path="src/plugin/parse-tika/src/test"/>
<classpathentry kind="src" path="src/plugin/lib-http/src/test"/>
<classpathentry kind="src" path="src/plugin/parse-tika/src/java"/>
<classpathentry kind="src" path="src/plugin/urlfilter-regex/src/java"/>
<classpathentry kind="src" path="src/plugin/urlfilter-domain/src/java"/>
<classpathentry kind="src" path="src/plugin/scoring-link/src/java"/>
<classpathentry kind="src" path="src/plugin/index-anchor/src/test"/>
<classpathentry kind="src" path="src/plugin/protocol-http/src/java"/>
<classpathentry kind="src" path="src/plugin/urlnormalizer-regex/src/test"/>
<classpathentry kind="src" path="src/plugin/urlfilter-prefix/src/java"/>
<classpathentry kind="src" path="src/plugin/scoring-opic/src/java"/>
<classpathentry kind="src" path="src/plugin/urlfilter-domain/src/test"/>
<classpathentry kind="src" path="src/plugin/protocol-file/src/java"/>
<classpathentry kind="src" path="src/plugin/urlnormalizer-regex/src/java"/>
<classpathentry kind="src" path="src/plugin/urlfilter-suffix/src/java"/>
<classpathentry kind="src" path="src/plugin/language-identifier/src/java"/>
<classpathentry kind="src" path="src/plugin/lib-regex-filter/src/test"/>
<classpathentry kind="src" path="src/plugin/language-identifier/src/test"/>
<classpathentry kind="src" path="src/plugin/subcollection/src/java"/>
<classpathentry kind="src" path="src/plugin/urlnormalizer-basic/src/test"/>
<classpathentry kind="src" path="src/plugin/index-basic/src/java"/>
<classpathentry kind="src" path="src/plugin/urlnormalizer-pass/src/test"/>
<classpathentry kind="src" path="src/plugin/creativecommons/src/java"/>
<classpathentry kind="src" path="src/bin"/>
<classpathentry kind="src" path="src/plugin/protocol-httpclient/src/java"/>
<classpathentry kind="src" path="src/plugin/tld/src/java"/>
<classpathentry kind="src" path="src/plugin/urlnormalizer-basic/src/java"/>
<classpathentry kind="src" path="src/plugin/index-basic/src/test"/>
<classpathentry kind="src" path="src/plugin/lib-http/src/java"/>
<classpathentry kind="src" path="src/plugin/protocol-ftp/src/java"/>
<classpathentry kind="src" path="src/plugin/index-anchor/src/java"/>
<classpathentry kind="src" path="src/plugin/urlfilter-validator/src/java"/>
<classpathentry kind="src" path="src/plugin/index-more/src/java"/>
<classpathentry kind="src" path="src/plugin/urlfilter-suffix/src/test"/>
<classpathentry kind="src" path="src/plugin/creativecommons/src/test"/>
<classpathentry kind="src" path="src/plugin/microformats-reltag/src/java"/>
<classpathentry kind="src" path="src/plugin/urlfilter-regex/src/test"/>
<classpathentry kind="src" path="src/plugin/lib-regex-filter/src/java"/>
<classpathentry kind="src" path="src/plugin/index-more/src/test"/>
<classpathentry kind="src" path="src/plugin/urlnormalizer-pass/src/java"/>
<classpathentry kind="src" path="src/plugin/urlfilter-automaton/src/java"/>
<classpathentry kind="src" path="src/testresources"/>