最新帖子 精华区 社区服务 会员列表 统计排行
主题 : Java Lucene排重实现group by
ceshi123 离线
级别: 管理员
宣传大使奖 特殊贡献奖 灌水天才奖
显示用户信息 
0  发表于: 2010-07-22 20:38

Java Lucene排重实现group by

购买流程:注册论坛账号------->在线充值购买论坛交易币------->下载毕业设计将直接扣除交易币
  package com.loongtao.lucene.test;

  import java.io.IOException;

  import org.apache.lucene.analysis.standard.StandardAnalyzer;

  import org.apache.lucene.document.Document;

  import org.apache.lucene.document.Field;

  import org.apache.lucene.document.Field.Index;

  import org.apache.lucene.document.Field.Store;

  import org.apache.lucene.index.CorruptIndexException;

  import org.apache.lucene.index.IndexWriter;

  import org.apache.lucene.index.Term;

  import org.apache.lucene.index.IndexWriter.MaxFieldLength;

  import org.apache.lucene.search.DuplicateFilter;

  import org.apache.lucene.search.Filter;

  import org.apache.lucene.search.IndexSearcher;

  import org.apache.lucene.search.Query;

  import org.apache.lucene.search.ScoreDoc;

  import org.apache.lucene.search.TermQuery;

  import org.apache.lucene.search.TopDocs;

  import org.apache.lucene.store.Directory;

  import org.apache.lucene.store.LockObtainFailedException;

  import org.apache.lucene.store.RAMDirectory;

  import org.apache.lucene.util.Version;

  /**
   * Demonstrates a "group by"-style de-duplication of Lucene search hits.
   *
   * <p>Four documents are indexed into an in-memory directory. Each carries a
   * {@code duplicate} field acting as the grouping key; two of them share the
   * same key. A {@link DuplicateFilter} on that field is applied to the search,
   * and for every surviving hit a second query counts how many indexed
   * documents share its key.
   */
  public class DuplicateFilterTest {

    /** Field holding the document identifier. */
    private static final String FIELD_ID = "id";

    /** Field every sample document matches on. */
    private static final String FIELD_STRING = "string";

    /** Field holding the sample timestamp. */
    private static final String FIELD_TIME = "time";

    /** Field used as the de-duplication (grouping) key. */
    private static final String FIELD_DUPLICATE = "duplicate";

    /**
     * Builds the sample index, runs the de-duplicated search and prints one
     * line per hit to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
      Directory dir = new RAMDirectory();
      try {
        buildIndex(dir);
        printDeduplicatedHits(dir);
      } catch (IOException e) {
        // CorruptIndexException and LockObtainFailedException are both
        // IOException subclasses, so a single handler covers all three.
        e.printStackTrace();
      }
    }

    /**
     * Creates one sample document; only the id and the grouping key vary.
     *
     * @param id value stored in the {@code id} field
     * @param duplicateKey value stored in the {@code duplicate} field
     * @return the populated document
     */
    private static Document newDocument(String id, String duplicateKey) {
      Document doc = new Document();
      doc.add(new Field(FIELD_ID, id, Store.YES, Index.NOT_ANALYZED));
      doc.add(new Field(FIELD_STRING, "haha", Store.YES, Index.NOT_ANALYZED));
      doc.add(new Field(FIELD_TIME, "20100801", Store.YES, Index.NOT_ANALYZED));
      doc.add(new Field(FIELD_DUPLICATE, duplicateKey, Store.YES, Index.NOT_ANALYZED));
      return doc;
    }

    /**
     * Writes the four sample documents into {@code dir}, replacing any
     * existing index.
     *
     * @param dir directory receiving the index
     * @throws IOException if the index cannot be created or written
     */
    private static void buildIndex(Directory dir) throws IOException {
      IndexWriter indexWriter = new IndexWriter(dir,
          new StandardAnalyzer(Version.LUCENE_30), true,
          MaxFieldLength.LIMITED);
      try {
        // "binbin" and "yaoyao" deliberately share the same grouping key.
        indexWriter.addDocument(newDocument("binbin", "123456"));
        indexWriter.addDocument(newDocument("yaoyao", "123456"));
        indexWriter.addDocument(newDocument("zhangjian", "123455"));
        indexWriter.addDocument(newDocument("liweicheng", "123451"));
      } finally {
        // Always close so the writer is not left open if an add fails.
        indexWriter.close();
      }
    }

    /**
     * Searches {@code dir} with a {@link DuplicateFilter} on the grouping key
     * and prints, for every hit, its id, its key and the number of indexed
     * documents sharing that key.
     *
     * @param dir directory containing the index to search
     * @throws IOException if the index cannot be opened or read
     */
    private static void printDeduplicatedHits(Directory dir) throws IOException {
      IndexSearcher indexSearcher = new IndexSearcher(dir);
      try {
        Query query = new TermQuery(new Term(FIELD_STRING, "haha"));
        Filter filter = new DuplicateFilter(FIELD_DUPLICATE);
        TopDocs top = indexSearcher.search(query, filter, 200);

        for (ScoreDoc scoreDoc : top.scoreDocs) {
          Document rdoc = indexSearcher.doc(scoreDoc.doc);
          String duplicateKey = rdoc.get(FIELD_DUPLICATE);
          System.out.print("id:" + rdoc.get(FIELD_ID) + "  排重ID:" + duplicateKey);

          // Unfiltered count of every document carrying the same key.
          Query queryDuplicate = new TermQuery(new Term(FIELD_DUPLICATE, duplicateKey));
          System.out.println("转载:" + indexSearcher.search(queryDuplicate, 100).totalHits);
        }
      } finally {
        // The searcher was never closed in the original version.
        indexSearcher.close();
      }
    }

  }

快速回复 顶端
内容
HTML 代码不可用
使用签名
Wind Code自动转换

验证问题:本站域名是什么?答案:cccbbs.net  正确答案:cccbbs.net
按"Ctrl+Enter"直接提交