Solr - I don't want any metadata added/extracted


I'm using Tika to index PDF files, and in the Solr UI I can see that a lot of metadata and other "stuff" I don't care about is being indexed too:

"response": {     "numfound": 1,     "start": 0,     "docs": [       {         "meta": [           "dc:subject",           "",           "meta:save-date",           "2014-01-09t11:07:45z",           "subject",           "",           "author",           "smalik",           "dcterms:created",           "2014-01-09t11:07:45z",           "date",           "2014-01-09t11:07:45z",           "creator",           "smalik",           "creation-date",           "2014-01-09t11:07:45z",           "meta:author",           "johndoe",           "stream_content_type",           "",           "created",           "thu jan 09 12:07:45 cet 2014",           "stream_size",           "null",           "meta:keyword",           "",           "cp:subject",           "",           "xmp:creatortool",           "pscript5.dll version 5.2.2",           "keywords",           "",           "last-save-date",           "2014-01-09t11:07:45z",           "dc:title",           "e-mail zur archivierung",           "meta:creation-date",           "2014-01-09t11:07:45z",           "dcterms:modified",           "2014-01-09t11:07:45z",           "dc:creator",           "johndoe",           "last-modified",           "2014-01-09t11:07:45z",           "modified",           "2014-01-09t11:07:45z",           "xmptpg:npages",           "1",           "producer",           "www.adlibsoftware.com:exs41012-windows 2008 r2:tng",           "content-type",           "application/pdf"         ],         "div": [           "page"         ],         "id": [           "aaa11besd4effsujqub6toubqr4m3.pdf"         ],         "dc_subject": [           ""         ],         "meta_save_date": [           "2014-01-09t11:07:45z"         ],         "subject": [           ""         ],         "author": [           "johndoe"         ],         "dcterms_created": [           "2014-01-09t11:07:45z"         ],         "date": [           "2014-01-09t11:07:45z"         ],         "creator": [           "johndoe"         ],         
"creation_date": [           "2014-01-09t11:07:45z"         ],         "title": [           "e-mail zur archivierung"         ],         "meta_author": [           "johndoe"         ],         "stream_content_type": [           ""         ],         "created": [           "thu jan 09 12:07:45 cet 2014"         ],         "stream_size": [           "null"         ],         "meta_keyword": [           ""         ],         "cp_subject": [           ""         ],         "xmp_creatortool": [           "pscript5.dll version 5.2.2"         ],         "keywords": [           ""         ],         "last_save_date": [           "2014-01-09t11:07:45z"         ],         "dc_title": [           "e-mail zur archivierung"         ],         "meta_creation_date": [           "2014-01-09t11:07:45z"         ],         "dcterms_modified": [           "2014-01-09t11:07:45z"         ],         "dc_creator": [           "johndoe"         ],         "last_modified": [           "2014-01-09t11:07:45z"         ],         "modified": [           "2014-01-09t11:07:45z"         ],         "xmptpg_npages": [           "1"         ],         "producer": [           "www.adlibsoftware.com:exs41012-windows 2008 r2:tng"         ],         "content_type": [           "application/pdf"         ],         "fulltext": [" abcdef"],         "uid": "d41d8cd98f00b204e9800998ecf8427e"       }     ]   } 

As I'm only interested in "fulltext" and "id", I would like to know how/what I have to set/define in schema.xml and/or solrconfig.xml to avoid the unnecessary data.

What I want is this:

"response": {         "numfound": 1,         "start": 0,         "docs": [           {             "id": [               "aaa11besd4effsujqub6toubqr4m3.pdf"             ],             "fulltext": [" abcdef"],             "uid": "d41d8cd98f00b204e9800998ecf8427e"           }         ]       } 

Currently my schema.xml and solrconfig.xml look like this:

<?xml version="1.0" encoding="UTF-8" ?>
<!-- schema.xml: field types and fields for the "simple" core. -->
<schema name="simple" version="1.1">
    <types>
        <!-- Verbatim, un-analyzed string values. -->
        <fieldType name="string" class="solr.StrField" postingsFormat="SimpleText" />
        <!-- Analyzed full-text type. -->
        <fieldType name="text" class="solr.TextField" postingsFormat="SimpleText">
            <analyzer>
                <!-- Strip newlines before tokenizing. -->
                <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\n" replacement=""/>
                <tokenizer class="solr.StandardTokenizerFactory"/>
                <!-- Lowercases the letters in each token. Leaves non-letter tokens alone. -->
                <filter class="solr.LowerCaseFilterFactory" />
                <!-- Removes dots from acronyms and 's from the end of tokens. Works only on
                     typed tokens, i.e. those produced by ClassicTokenizer or equivalent. -->
                <filter class="solr.ClassicFilterFactory" />
                <!-- Trims whitespace at either end of a token. -->
                <filter class="solr.TrimFilterFactory"/>
                <!-- Discards common words. -->
                <filter class="solr.StopFilterFactory" ignoreCase="true"/>
                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
            </analyzer>
        </fieldType>
    </types>

    <fields>
        <!-- Unique key; filled in by the "deduplication" update chain in solrconfig.xml. -->
        <field name="uid" type="string" indexed="true" stored="true"
            multiValued="false" />
        <!-- Catch-all: any field name that is not declared explicitly matches "*" and is
             indexed and stored as a multi-valued string.
             NOTE(review): this is presumably why every Tika metadata field shows up in the
             response, and why the "ignored_" uprefix in solrconfig.xml has no visible
             effect (no field is ever "unknown" to the schema) - confirm against the
             Solr Cell documentation. -->
        <dynamicField name="*" type="string" multiValued="true"
            indexed="true" stored="true" />
        <field name="content" indexed="true" type="text" multiValued="true" />
    </fields>

    <defaultSearchField>content</defaultSearchField>

    <solrQueryParser defaultOperator="OR" />
    <uniqueKey>uid</uniqueKey>
</schema>

<?xml version="1.0" encoding="UTF-8" ?>
<!-- solrconfig.xml: request handlers and update chain for the same core. -->
<config>
    <luceneMatchVersion>LUCENE_45</luceneMatchVersion>
    <directoryFactory name='DirectoryFactory' class='solr.MMapDirectoryFactory' />

    <!-- Per-field postings formats (postingsFormat="SimpleText" in schema.xml) need the schema-aware codec. -->
    <codecFactory name="CodecFactory" class="solr.SchemaCodecFactory" />

    <!-- Jars for Solr Cell (Tika extraction); paths are relative to the core's instance directory. -->
    <lib dir='${solr.core.instanceDir}\lib' />
    <lib dir="${solr.core.instanceDir}\dist\" regex="solr-cell-\d.*\.jar" />
    <lib dir="${solr.core.instanceDir}\contrib\extraction\lib" regex=".*\.jar" />

    <requestHandler name="standard" class="solr.StandardRequestHandler" default="true" />

    <requestHandler name="/update" class="solr.UpdateRequestHandler">
        <lst name="defaults">
            <str name="update.chain">deduplication</str>
        </lst>
    </requestHandler>

    <!-- Tika-based extraction of rich documents (PDF etc.). -->
    <requestHandler name="/update/extract"
        class="solr.extraction.ExtractingRequestHandler">
        <lst name="defaults">
            <str name="captureAttr">true</str>
            <str name="lowernames">true</str>
            <str name="overwrite">false</str>
            <!-- Duplicate of the captureAttr entry above; redundant but kept as posted. -->
            <str name="captureAttr">true</str>
            <str name="literalsOverride">true</str>
            <str name="uprefix">ignored_</str>
            <str name="fmap.a">link</str>
            <!-- The extracted body text is renamed from "content" to "fulltext". -->
            <str name="fmap.content">fulltext</str>
            <!-- This configuration is useful for tests. -->
            <str name="update.chain">deduplication</str>
        </lst>
    </requestHandler>

    <updateRequestProcessorChain name="deduplication">
        <processor
            class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
            <bool name="overwriteDupes">false</bool>
            <str name="signatureField">uid</str>
            <bool name="enabled">true</bool>
            <!-- NOTE(review): the signature is computed from "content", but the extract
                 handler maps the body to "fulltext". The uid in the sample response
                 (d41d8cd98f00b204e9800998ecf8427e) is the MD5 of empty input, which
                 suggests no text reaches the signature - verify whether this should be
                 "fulltext". -->
            <str name="fields">content</str>
            <str name="minTokenLen">10</str>
            <str name="quantRate">.2</str>
            <str name="signatureClass">solr.update.processor.TextProfileSignature</str>
        </processor>
        <processor class="solr.LogUpdateProcessorFactory" />
        <processor class="solr.RunUpdateProcessorFactory" />
    </updateRequestProcessorChain>

    <requestHandler name="/admin/"
        class="org.apache.solr.handler.admin.AdminHandlers" />

    <lockType>none</lockType>

    <admin>
        <defaultQuery>*:*</defaultQuery>
    </admin>

</config>

See Alexandre's answer and the examples here. If you are getting fields you do not need, you need to explicitly declare them in the schema and set both indexed and stored to false (meaning Solr will ignore the field). You can use dynamic fields to ignore a whole bunch of them that share a common prefix or suffix, which is typically the case with docs generated by Tika.


Comments

Popular posts from this blog

Android layout hidden on keyboard show -

google app engine - 403 Forbidden POST - Flask WTForms -

c - Why would PK11_GenerateRandom() return an error -8023? -