solr - I don't want any metadata added/extracted -

I'm using Tika to index PDF files, and in the Solr UI I can see a lot of metadata and other "stuff" I don't care about being indexed too:

"response": {     "numfound": 1,     "start": 0,     "docs": [       {         "meta": [           "dc:subject",           "",           "meta:save-date",           "2014-01-09t11:07:45z",           "subject",           "",           "author",           "smalik",           "dcterms:created",           "2014-01-09t11:07:45z",           "date",           "2014-01-09t11:07:45z",           "creator",           "smalik",           "creation-date",           "2014-01-09t11:07:45z",           "meta:author",           "johndoe",           "stream_content_type",           "",           "created",           "thu jan 09 12:07:45 cet 2014",           "stream_size",           "null",           "meta:keyword",           "",           "cp:subject",           "",           "xmp:creatortool",           "pscript5.dll version 5.2.2",           "keywords",           "",           "last-save-date",           "2014-01-09t11:07:45z",           "dc:title",           "e-mail zur archivierung",           "meta:creation-date",           "2014-01-09t11:07:45z",           "dcterms:modified",           "2014-01-09t11:07:45z",           "dc:creator",           "johndoe",           "last-modified",           "2014-01-09t11:07:45z",           "modified",           "2014-01-09t11:07:45z",           "xmptpg:npages",           "1",           "producer",           "www.adlibsoftware.com:exs41012-windows 2008 r2:tng",           "content-type",           "application/pdf"         ],         "div": [           "page"         ],         "id": [           "aaa11besd4effsujqub6toubqr4m3.pdf"         ],         "dc_subject": [           ""         ],         "meta_save_date": [           "2014-01-09t11:07:45z"         ],         "subject": [           ""         ],         "author": [           "johndoe"         ],         "dcterms_created": [           "2014-01-09t11:07:45z"         ],         "date": [           "2014-01-09t11:07:45z"         ],         "creator": [           "johndoe"         ],         
"creation_date": [           "2014-01-09t11:07:45z"         ],         "title": [           "e-mail zur archivierung"         ],         "meta_author": [           "johndoe"         ],         "stream_content_type": [           ""         ],         "created": [           "thu jan 09 12:07:45 cet 2014"         ],         "stream_size": [           "null"         ],         "meta_keyword": [           ""         ],         "cp_subject": [           ""         ],         "xmp_creatortool": [           "pscript5.dll version 5.2.2"         ],         "keywords": [           ""         ],         "last_save_date": [           "2014-01-09t11:07:45z"         ],         "dc_title": [           "e-mail zur archivierung"         ],         "meta_creation_date": [           "2014-01-09t11:07:45z"         ],         "dcterms_modified": [           "2014-01-09t11:07:45z"         ],         "dc_creator": [           "johndoe"         ],         "last_modified": [           "2014-01-09t11:07:45z"         ],         "modified": [           "2014-01-09t11:07:45z"         ],         "xmptpg_npages": [           "1"         ],         "producer": [           "www.adlibsoftware.com:exs41012-windows 2008 r2:tng"         ],         "content_type": [           "application/pdf"         ],         "fulltext": [" abcdef"],         "uid": "d41d8cd98f00b204e9800998ecf8427e"       }     ]   }

As I'm interested only in "fulltext" and "id", I would like to know how/what I have to set/define in schema.xml and/or solrconfig.xml to avoid the unnecessary data.

What I want is this:

"response": {         "numfound": 1,         "start": 0,         "docs": [           {             "id": [               "aaa11besd4effsujqub6toubqr4m3.pdf"             ],             "fulltext": [" abcdef"],             "uid": "d41d8cd98f00b204e9800998ecf8427e"           }         ]       }

Currently my schema.xml and solrconfig.xml look like this:

<?xml version="1.0" encoding="utf-8" ?> <schema name="simple" version="1.1">     <types>         <fieldtype name="string" class="solr.strfield" postingsformat="simpletext" />         <fieldtype name="text" class="solr.textfield" postingsformat="simpletext">             <analyzer>                 <charfilter class="solr.patternreplacecharfilterfactory" pattern="\n" replacement=""/>                 <tokenizer class="solr.standardtokenizerfactory"/>                 <filter class="solr.lowercasefilterfactory" /> <!--lowercases letters in each token. leaves non-letter tokens alone.-->                 <filter class="solr.classicfilterfactory" /> <!--removes dots acronyms , 's end of tokens. works on typed tokens produced classictokenizer or equivalent.-->                 <filter class="solr.trimfilterfactory"/> <!--trims whitespace @ either end of token. -->                 <filter class="solr.stopfilterfactory" ignorecase="true"/> <!--discards common words.  -->                 <filter class="solr.removeduplicatestokenfilterfactory"/>             </analyzer>         </fieldtype>     </types>      <fields>         <field name="uid" type="string" indexed="true" stored="true"             multivalued="false" />         <dynamicfield name="*" type="string" multivalued="true"             indexed="true" stored="true" />         <field name="content" indexed="true"  type="text" multivalued="true" />     </fields>      <defaultsearchfield>content</defaultsearchfield>      <solrqueryparser defaultoperator="or" />     <uniquekey>uid</uniquekey> </schema>   <?xml version="1.0" encoding="utf-8" ?> <config>     <lucenematchversion>lucene_45</lucenematchversion>     <directoryfactory name='directoryfactory' class='solr.mmapdirectoryfactory' />      <codecfactory name="codecfactory" class="solr.schemacodecfactory" />      <lib dir='${solr.core.instancedir}\lib' />     <lib dir="${solr.core.instancedir}\dist\" regex="solr-cell-\d.*\.jar" />     <lib 
dir="${solr.core.instancedir}\contrib\extraction\lib" regex=".*\.jar" />      <requesthandler name="standard" class="solr.standardrequesthandler" default="true" />      <requesthandler name="/update" class="solr.updaterequesthandler">         <lst name="defaults">             <str name="update.chain">deduplication</str>         </lst>     </requesthandler>      <requesthandler name="/update/extract"         class="solr.extraction.extractingrequesthandler">         <lst name="defaults">             <str name="captureattr">true</str>             <str name="lowernames">true</str>             <str name="overwrite">false</str>             <str name="captureattr">true</str>             <str name="literalsoverride">true</str>             <str name="uprefix">ignored_</str>             <str name="fmap.a">link</str>             <str name="fmap.content">fulltext</str>             <!-- configuration here useful tests -->             <str name="update.chain">deduplication</str>         </lst>     </requesthandler>      <updaterequestprocessorchain name="deduplication">         <processor             class="org.apache.solr.update.processor.signatureupdateprocessorfactory">             <bool name="overwritedupes">false</bool>             <str name="signaturefield">uid</str>             <bool name="enabled">true</bool>             <str name="fields">content</str>             <str name="mintokenlen">10</str>             <str name="quantrate">.2</str>             <str name="signatureclass">solr.update.processor.textprofilesignature</str>         </processor>         <processor class="solr.logupdateprocessorfactory" />         <processor class="solr.runupdateprocessorfactory" />     </updaterequestprocessorchain>      <requesthandler name="/admin/"         class="org.apache.solr.handler.admin.adminhandlers" />      <locktype>none</locktype>      <admin>         <defaultquery>*:*</defaultquery>     </admin>  </config>

See Alexandre's answer and the examples here. If you are getting fields you do not need, you need to explicitly declare them in the schema and set both indexed and stored to false (meaning Solr will ignore the field). You can use dynamic fields to ignore a whole bunch of them that share a common prefix or suffix, which is typically the case for docs generated by Tika.

Search This Blog

And

solr - I don't want any metadata added/extracted -

Comments

Post a Comment

Popular posts from this blog

Android layout hidden on keyboard show -

google app engine - 403 Forbidden POST - Flask WTForms -

how to run a query SQL in node.js mysql -