mirror of https://github.com/interlegis/sapl.git
committed by
20 changed files with 2797 additions and 55 deletions
@ -0,0 +1,61 @@ |
---
# Docker Compose stack for SAPL: the application container plus the
# PostgreSQL and Solr services it talks to.
version: '2'

services:
  # PostgreSQL database; data lives in the sapldb_data named volume.
  sapldb:
    image: postgres:10.5-alpine
    restart: always
    environment:
      PGDATA: /var/lib/postgresql/data/
    volumes:
      - sapldb_data:/var/lib/postgresql/data/
    # NOTE(review): no POSTGRES_PASSWORD is set in this file and 5432 is
    # published on the host - confirm this exposure is intended outside
    # development.
    ports:
      - "5432:5432"

  # Solr search server, started in SolrCloud mode (-c) and kept in the
  # foreground (-f) so the container stays alive.
  saplsolr:
    image: solr:7.4-alpine
    restart: always
    command: bin/solr start -c -f
    volumes:
      - solr_data:/opt/solr/server/solr
      - solr_configsets:/opt/solr/server/solr/configsets
    ports:
      - "8983:8983"

  # SAPL application.
  sapl:
    image: interlegis/sapl:3.1.138
    # build: .
    restart: always
    environment:
      # NOTE(review): the admin credentials and mail settings below look like
      # placeholders committed to the repository - override them for a real
      # deployment.
      ADMIN_PASSWORD: interlegis
      ADMIN_EMAIL: email@dominio.net
      DEBUG: 'False'
      USE_TLS: 'False'
      EMAIL_HOST: smtp.dominio.net
      EMAIL_HOST_USER: usuariosmtp
      USE_SOLR: 'True'
      # SOLR_HOST: saplsolr
      # Points at the saplsolr service defined above.
      SOLR_URL: http://saplsolr:8983/solr/sapl
      TZ: America/Sao_Paulo
    volumes:
      - sapl_data:/var/interlegis/sapl/data
      - sapl_media:/var/interlegis/sapl/media
      # sapl_root covers the parent directory of the two mounts above.
      - sapl_root:/var/interlegis/sapl
    # Mounts every volume of the saplsolr container into this one as well
    # (presumably so the application can reach the Solr configsets - verify).
    volumes_from:
      - saplsolr
    # Start-order only: Compose does not wait for the services to be ready.
    depends_on:
      - sapldb
      - saplsolr
    ports:
      - "80:80"

# Named volumes, all using the default driver.
volumes:
  sapldb_data: {}
  sapl_data: {}
  sapl_media: {}
  sapl_root: {}
  solr_data: {}
  solr_configsets: {}
@ -0,0 +1,54 @@ |
# Licensed to the Apache Software Foundation (ASF) under one or more |
# contributor license agreements. See the NOTICE file distributed with |
# this work for additional information regarding copyright ownership. |
# The ASF licenses this file to You under the Apache License, Version 2.0 |
# (the "License"); you may not use this file except in compliance with |
# the License. You may obtain a copy of the License at |
# |
# http://www.apache.org/licenses/LICENSE-2.0 |
# |
# Unless required by applicable law or agreed to in writing, software |
# distributed under the License is distributed on an "AS IS" BASIS, |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
# See the License for the specific language governing permissions and |
# limitations under the License. |
# a couple of test stopwords to test that the words are really being |
# configured from this file: |
stopworda |
stopwordb |
# Standard english stop words taken from Lucene's StopAnalyzer |
a |
an |
and |
are |
as |
at |
be |
but |
by |
for |
if |
in |
into |
is |
it |
no |
not |
of |
on |
or |
such |
that |
the |
their |
then |
there |
these |
they |
this |
to |
was |
will |
with |
@ -0,0 +1,253 @@ |
| From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt |
| This file is distributed under the BSD License. |
| See http://snowball.tartarus.org/license.php |
| Also see http://www.opensource.org/licenses/bsd-license.html |
| - Encoding was converted to UTF-8. |
| - This notice was added. |
| |
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball" |
| A Portuguese stop word list. Comments begin with vertical bar. Each stop |
| word is at the start of a line. |
| The following is a ranked list (commonest to rarest) of stopwords |
| deriving from a large sample of text. |
| Extra words have been added at the end. |
de | of, from |
a | the; to, at; her |
o | the; him |
que | who, that |
e | and |
do | de + o |
da | de + a |
em | in |
um | a |
para | for |
| é from SER |
com | with |
não | not, no |
uma | a |
os | the; them |
no | em + o |
se | himself etc |
na | em + a |
por | for |
mais | more |
as | the; them |
dos | de + os |
como | as, like |
mas | but |
| foi from SER |
ao | a + o |
ele | he |
das | de + as |
| tem from TER |
à | a + a |
seu | his |
sua | her |
ou | or |
| ser from SER |
quando | when |
muito | much |
| há from HAV |
nos | em + os; us |
já | already, now |
| está from EST |
eu | I |
também | also |
só | only, just |
pelo | per + o |
pela | per + a |
até | up to |
isso | that |
ela | she |
entre | between |
| era from SER |
depois | after |
sem | without |
mesmo | same |
aos | a + os |
| ter from TER |
seus | his |
quem | whom |
nas | em + as |
me | me |
esse | that |
eles | they |
| estão from EST |
você | you |
| tinha from TER |
| foram from SER |
essa | that |
num | em + um |
nem | nor |
suas | her |
meu | my |
às | a + as |
minha | my |
| têm from TER |
numa | em + uma |
pelos | per + os |
elas | they |
| havia from HAV |
| seja from SER |
qual | which |
| será from SER |
nós | we |
| tenho from TER |
lhe | to him, her |
deles | of them |
essas | those |
esses | those |
pelas | per + as |
este | this |
| fosse from SER |
dele | of him |
| other words. There are many contractions such as naquele = em+aquele, |
| mo = me+o, but they are rare. |
| Indefinite article plural forms are also rare. |
tu | thou |
te | thee |
vocês | you (plural) |
vos | you |
lhes | to them |
meus | my |
minhas |
teu | thy |
tua |
teus |
tuas |
nosso | our |
nossa |
nossos |
nossas |
dela | of her |
delas | of them |
esta | this |
estes | these |
estas | these |
aquele | that |
aquela | that |
aqueles | those |
aquelas | those |
isto | this |
aquilo | that |
| forms of estar, to be (not including the infinitive): |
estou |
está |
estamos |
estão |
estive |
esteve |
estivemos |
estiveram |
estava |
estávamos |
estavam |
estivera |
estivéramos |
esteja |
estejamos |
estejam |
estivesse |
estivéssemos |
estivessem |
estiver |
estivermos |
estiverem |
| forms of haver, to have (not including the infinitive): |
hei |
há |
havemos |
hão |
houve |
houvemos |
houveram |
houvera |
houvéramos |
haja |
hajamos |
hajam |
houvesse |
houvéssemos |
houvessem |
houver |
houvermos |
houverem |
houverei |
houverá |
houveremos |
houverão |
houveria |
houveríamos |
houveriam |
| forms of ser, to be (not including the infinitive): |
sou |
somos |
são |
era |
éramos |
eram |
fui |
foi |
fomos |
foram |
fora |
fôramos |
seja |
sejamos |
sejam |
fosse |
fôssemos |
fossem |
for |
formos |
forem |
serei |
será |
seremos |
serão |
seria |
seríamos |
seriam |
| forms of ter, to have (not including the infinitive): |
tenho |
tem |
temos |
têm |
tinha |
tínhamos |
tinham |
tive |
teve |
tivemos |
tiveram |
tivera |
tivéramos |
tenha |
tenhamos |
tenham |
tivesse |
tivéssemos |
tivessem |
tiver |
tivermos |
tiverem |
terei |
terá |
teremos |
terão |
teria |
teríamos |
teriam |
@ -0,0 +1,573 @@ |
<?xml version="1.0" encoding="UTF-8" ?> |
<!-- |
Licensed to the Apache Software Foundation (ASF) under one or more |
contributor license agreements. See the NOTICE file distributed with |
this work for additional information regarding copyright ownership. |
The ASF licenses this file to You under the Apache License, Version 2.0 |
(the "License"); you may not use this file except in compliance with |
the License. You may obtain a copy of the License at |
http://www.apache.org/licenses/LICENSE-2.0 |
Unless required by applicable law or agreed to in writing, software |
distributed under the License is distributed on an "AS IS" BASIS, |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
See the License for the specific language governing permissions and |
limitations under the License. |
--> |
<!-- |
This example schema is the recommended starting point for users. |
It should be kept correct and concise, usable out-of-the-box. |
For more information, on how to customize this file, please see |
http://lucene.apache.org/solr/guide/documents-fields-and-schema-design.html |
PERFORMANCE NOTE: this schema includes many optional features and should not |
be used for benchmarking. To improve performance one could |
- set stored="false" for all fields possible (esp large fields) when you |
only need to search on the field but don't need to return the original |
value. |
- set indexed="false" if you don't need to search on the field, but only |
return the field as a result of searching on other indexed fields. |
- remove all unneeded copyField statements |
- for best index size and searching performance, set indexed="false" |
for all general text fields, use copyField to copy them to the |
catchall "_text_" field, and use that for searching. |
--> |
<schema name="default-config" version="1.6"> |
<!-- attribute "name" is the name of this schema and is only used for display purposes. |
version="x.y" is Solr's version number for the schema syntax and |
semantics. It should not normally be changed by applications. |
1.0: multiValued attribute did not exist, all fields are multiValued |
by nature |
1.1: multiValued attribute introduced, false by default |
1.2: omitTermFreqAndPositions attribute introduced, true by default |
except for text fields. |
1.3: removed optional field compress feature |
1.4: autoGeneratePhraseQueries attribute introduced to drive QueryParser |
behavior when a single string produces multiple tokens. Defaults |
to off for version >= 1.4 |
1.5: omitNorms defaults to true for primitive field types |
(int, float, boolean, string...) |
1.6: useDocValuesAsStored defaults to true. |
--> |
<!-- Valid attributes for fields: |
name: mandatory - the name for the field |
type: mandatory - the name of a field type from the |
fieldTypes section |
indexed: true if this field should be indexed (searchable or sortable) |
stored: true if this field should be retrievable |
docValues: true if this field should have doc values. Doc Values is |
recommended (required, if you are using *Point fields) for faceting, |
grouping, sorting and function queries. Doc Values will make the index |
faster to load, more NRT-friendly and more memory-efficient. |
They are currently only supported by StrField, UUIDField, all |
*PointFields, and depending on the field type, they might require |
the field to be single-valued, be required or have a default value |
(check the documentation of the field type you're interested in for |
more information) |
multiValued: true if this field may contain multiple values per document |
omitNorms: (expert) set to true to omit the norms associated with |
this field (this disables length normalization and index-time |
boosting for the field, and saves some memory). Only full-text |
fields or fields that need an index-time boost need norms. |
Norms are omitted for primitive (non-analyzed) types by default. |
termVectors: [false] set to true to store the term vector for a |
given field. |
When using MoreLikeThis, fields used for similarity should be |
stored for best performance. |
termPositions: Store position information with the term vector. |
This will increase storage costs. |
termOffsets: Store offset information with the term vector. This |
will increase storage costs. |
required: The field is required. It will throw an error if the |
value does not exist |
default: a value that should be used if no value is specified |
when adding a document. |
--> |
<!-- field names should consist of alphanumeric or underscore characters only and |
not start with a digit. This is not currently strictly enforced, |
but other field names will not have first class support from all components |
and back compatibility is not guaranteed. Names with both leading and |
trailing underscores (e.g. _version_) are reserved. |
--> |
<!-- This configset pre-declares the four fields of the _default configset |
(id, _version_, _text_ and _root_) plus the Django fields django_ct, django_id and text. |
All other fields will be type-guessed and added via the |
"add-unknown-fields-to-the-schema" update request processor chain declared in solrconfig.xml. |
Note that many dynamic fields are also defined - you can use them to specify a |
field's type via field naming conventions - see below. |
WARNING: The _text_ catch-all field will significantly increase your index size. |
If you don't need it, consider removing it and the corresponding copyField directive. |
--> |
<field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" /> |
<!-- docValues are enabled by default for long type so we don't need to index the version field --> |
<field name="_version_" type="plong" indexed="false" stored="false"/> |
<field name="_root_" type="string" indexed="true" stored="false" docValues="false" /> |
<field name="_text_" type="text_general" indexed="true" stored="false" multiValued="true"/> |
<!-- Django fields --> |
<field name="django_ct" type="string" indexed="true" stored="true" multiValued="false"/> |
<field name="django_id" type="string" indexed="true" stored="true" multiValued="false"/> |
<field name="text" type="text_pt" indexed="true" stored="true" multiValued="false" /> |
<!-- This can be enabled, in case the client does not know what fields may be searched. It isn't enabled by default |
because it's very expensive to index everything twice. --> |
<!-- <copyField source="*" dest="_text_"/> --> |
<!-- Dynamic field definitions allow using convention over configuration |
for fields via the specification of patterns to match field names. |
EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i) |
RESTRICTION: the glob-like pattern in the name attribute must have a "*" only at the start or the end. --> |
<dynamicField name="*_i" type="pint" indexed="true" stored="true"/> |
<dynamicField name="*_is" type="pints" indexed="true" stored="true"/> |
<dynamicField name="*_s" type="string" indexed="true" stored="true" /> |
<dynamicField name="*_ss" type="strings" indexed="true" stored="true"/> |
<dynamicField name="*_l" type="plong" indexed="true" stored="true"/> |
<dynamicField name="*_ls" type="plongs" indexed="true" stored="true"/> |
<dynamicField name="*_t" type="text_general" indexed="true" stored="true" multiValued="false"/> |
<dynamicField name="*_txt" type="text_general" indexed="true" stored="true"/> |
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/> |
<dynamicField name="*_bs" type="booleans" indexed="true" stored="true"/> |
<dynamicField name="*_f" type="pfloat" indexed="true" stored="true"/> |
<dynamicField name="*_fs" type="pfloats" indexed="true" stored="true"/> |
<dynamicField name="*_d" type="pdouble" indexed="true" stored="true"/> |
<dynamicField name="*_ds" type="pdoubles" indexed="true" stored="true"/> |
<!-- Type used for data-driven schema, to add a string copy for each text field --> |
<dynamicField name="*_str" type="strings" stored="false" docValues="true" indexed="false" /> |
<dynamicField name="*_dt" type="pdate" indexed="true" stored="true"/> |
<dynamicField name="*_dts" type="pdate" indexed="true" stored="true" multiValued="true"/> |
<dynamicField name="*_p" type="location" indexed="true" stored="true"/> |
<dynamicField name="*_srpt" type="location_rpt" indexed="true" stored="true"/> |
<!-- payloaded dynamic fields --> |
<dynamicField name="*_dpf" type="delimited_payloads_float" indexed="true" stored="true"/> |
<dynamicField name="*_dpi" type="delimited_payloads_int" indexed="true" stored="true"/> |
<dynamicField name="*_dps" type="delimited_payloads_string" indexed="true" stored="true"/> |
<dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/> |
<!-- Field to use to determine and enforce document uniqueness. |
Unless this field is marked with required="false", it will be a required field |
--> |
<uniqueKey>id</uniqueKey> |
<!-- copyField commands copy one field to another at the time a document |
is added to the index. It's used either to index the same field differently, |
or to add multiple fields to the same field for easier/faster searching. |
<copyField source="sourceFieldName" dest="destinationFieldName"/> |
--> |
<!-- field type definitions. The "name" attribute is |
just a label to be used by field definitions. The "class" |
attribute and any other attributes determine the real |
behavior of the fieldType. |
Class names starting with "solr" refer to java classes in a |
standard package such as org.apache.solr.analysis |
--> |
<!-- The sortMissingLast and sortMissingFirst attributes are optional and are |
currently supported on types that are sorted internally as strings |
and on numeric types. |
This includes "string", "boolean", "pint", "pfloat", "plong", "pdate", "pdouble". |
- If sortMissingLast="true", then a sort on this field will cause documents |
without the field to come after documents with the field, |
regardless of the requested sort order (asc or desc). |
- If sortMissingFirst="true", then a sort on this field will cause documents |
without the field to come before documents with the field, |
regardless of the requested sort order. |
- If sortMissingLast="false" and sortMissingFirst="false" (the default), |
then default lucene sorting will be used which places docs without the |
field first in an ascending sort and last in a descending sort. |
--> |
<!-- The StrField type is not analyzed, but indexed/stored verbatim. --> |
<fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" /> |
<fieldType name="strings" class="solr.StrField" sortMissingLast="true" multiValued="true" docValues="true" /> |
<!-- boolean type: "true" or "false" --> |
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/> |
<fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/> |
<!-- |
Numeric field types that index values using KD-trees. |
Point fields don't support FieldCache, so they must have docValues="true" if needed for sorting, faceting, functions, etc. |
--> |
<fieldType name="pint" class="solr.IntPointField" docValues="true"/> |
<fieldType name="pfloat" class="solr.FloatPointField" docValues="true"/> |
<fieldType name="plong" class="solr.LongPointField" docValues="true"/> |
<fieldType name="pdouble" class="solr.DoublePointField" docValues="true"/> |
<fieldType name="pints" class="solr.IntPointField" docValues="true" multiValued="true"/> |
<fieldType name="pfloats" class="solr.FloatPointField" docValues="true" multiValued="true"/> |
<fieldType name="plongs" class="solr.LongPointField" docValues="true" multiValued="true"/> |
<fieldType name="pdoubles" class="solr.DoublePointField" docValues="true" multiValued="true"/> |
<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and |
is a more restricted form of the canonical representation of dateTime |
http://www.w3.org/TR/xmlschema-2/#dateTime |
The trailing "Z" designates UTC time and is mandatory. |
Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z |
All other components are mandatory. |
Expressions can also be used to denote calculations that should be |
performed relative to "NOW" to determine the value, for example: |
NOW/HOUR |
... Round to the start of the current hour |
NOW-1DAY |
... Exactly 1 day prior to now |
NOW/DAY+6MONTHS+3DAYS |
... 6 months and 3 days in the future from the start of |
the current day |
--> |
<!-- KD-tree versions of date fields --> |
<fieldType name="pdate" class="solr.DatePointField" docValues="true"/> |
<fieldType name="pdates" class="solr.DatePointField" docValues="true" multiValued="true"/> |
<!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings --> |
<fieldType name="binary" class="solr.BinaryField"/> |
<!-- solr.TextField allows the specification of custom text analyzers |
specified as a tokenizer and a list of token filters. Different |
analyzers may be specified for indexing and querying. |
The optional positionIncrementGap puts space between multiple fields of |
this type on the same document, with the purpose of preventing false phrase |
matching across fields. |
For more info on customizing your analyzer chain, please see |
http://lucene.apache.org/solr/guide/understanding-analyzers-tokenizers-and-filters.html#understanding-analyzers-tokenizers-and-filters |
--> |
<!-- One can also specify an existing Analyzer class that has a |
default constructor via the class attribute on the analyzer element. |
Example: |
<fieldType name="text_greek" class="solr.TextField"> |
<analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/> |
</fieldType> |
--> |
<!-- A text field that only splits on whitespace for exact matching of words --> |
<dynamicField name="*_ws" type="text_ws" indexed="true" stored="true"/> |
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"> |
<analyzer> |
<tokenizer class="solr.WhitespaceTokenizerFactory"/> |
</analyzer> |
</fieldType> |
<!-- A general text field that has reasonable, generic |
cross-language defaults: it tokenizes with StandardTokenizer, |
removes stop words from case-insensitive "stopwords.txt" |
(empty by default), and down cases. At query time only, it |
also applies synonyms. |
--> |
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100" multiValued="true"> |
<analyzer type="index"> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> |
<!-- in this example, we will only use synonyms at query time |
<filter class="solr.SynonymGraphFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> |
<filter class="solr.FlattenGraphFilterFactory"/> |
--> |
<filter class="solr.LowerCaseFilterFactory"/> |
</analyzer> |
<analyzer type="query"> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> |
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
<filter class="solr.LowerCaseFilterFactory"/> |
</analyzer> |
</fieldType> |
<!-- SortableTextField generally functions exactly like TextField, |
except that it supports, and by default uses, docValues for sorting (or faceting) |
on the first 1024 characters of the original field values (which is configurable). |
This makes it a bit more useful than TextField in many situations, but the trade-off |
is that it takes up more space on disk; which is why it's not used in place of TextField |
for every fieldType in this _default schema. |
--> |
<dynamicField name="*_t_sort" type="text_gen_sort" indexed="true" stored="true" multiValued="false"/> |
<dynamicField name="*_txt_sort" type="text_gen_sort" indexed="true" stored="true"/> |
<fieldType name="text_gen_sort" class="solr.SortableTextField" positionIncrementGap="100" multiValued="true"> |
<analyzer type="index"> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> |
<filter class="solr.LowerCaseFilterFactory"/> |
</analyzer> |
<analyzer type="query"> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> |
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
<filter class="solr.LowerCaseFilterFactory"/> |
</analyzer> |
</fieldType> |
<!-- A text field with defaults appropriate for English: it tokenizes with StandardTokenizer, |
removes English stop words (lang/stopwords_en.txt), down cases, protects words from protwords.txt, and |
finally applies Porter's stemming. The query time analyzer also applies synonyms from synonyms.txt. --> |
<dynamicField name="*_txt_en" type="text_en" indexed="true" stored="true"/> |
<fieldType name="text_en" class="solr.TextField" positionIncrementGap="100"> |
<analyzer type="index"> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<!-- in this example, we will only use synonyms at query time |
<filter class="solr.SynonymGraphFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> |
<filter class="solr.FlattenGraphFilterFactory"/> |
--> |
<!-- Case insensitive stop word removal. |
--> |
<filter class="solr.StopFilterFactory" |
ignoreCase="true" |
words="lang/stopwords_en.txt" |
/> |
<filter class="solr.LowerCaseFilterFactory"/> |
<filter class="solr.EnglishPossessiveFilterFactory"/> |
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: |
<filter class="solr.EnglishMinimalStemFilterFactory"/> |
--> |
<filter class="solr.PorterStemFilterFactory"/> |
</analyzer> |
<analyzer type="query"> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
<filter class="solr.StopFilterFactory" |
ignoreCase="true" |
words="lang/stopwords_en.txt" |
/> |
<filter class="solr.LowerCaseFilterFactory"/> |
<filter class="solr.EnglishPossessiveFilterFactory"/> |
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: |
<filter class="solr.EnglishMinimalStemFilterFactory"/> |
--> |
<filter class="solr.PorterStemFilterFactory"/> |
</analyzer> |
</fieldType> |
<!-- A text field with defaults appropriate for English, plus |
aggressive word-splitting and autophrase features enabled. |
This field is just like text_en, except it adds |
WordDelimiterGraphFilter to enable splitting and matching of |
words on case-change, alpha numeric boundaries, and |
non-alphanumeric chars. This means certain compound word |
cases will work, for example query "wi fi" will match |
document "WiFi" or "wi-fi". |
--> |
<dynamicField name="*_txt_en_split" type="text_en_splitting" indexed="true" stored="true"/> |
<fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> |
<analyzer type="index"> |
<tokenizer class="solr.WhitespaceTokenizerFactory"/> |
<!-- in this example, we will only use synonyms at query time |
<filter class="solr.SynonymGraphFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> |
--> |
<!-- Case insensitive stop word removal. |
--> |
<filter class="solr.StopFilterFactory" |
ignoreCase="true" |
words="lang/stopwords_en.txt" |
/> |
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> |
<filter class="solr.LowerCaseFilterFactory"/> |
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
<filter class="solr.PorterStemFilterFactory"/> |
<filter class="solr.FlattenGraphFilterFactory" /> |
</analyzer> |
<analyzer type="query"> |
<tokenizer class="solr.WhitespaceTokenizerFactory"/> |
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
<filter class="solr.StopFilterFactory" |
ignoreCase="true" |
words="lang/stopwords_en.txt" |
/> |
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> |
<filter class="solr.LowerCaseFilterFactory"/> |
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
<filter class="solr.PorterStemFilterFactory"/> |
</analyzer> |
</fieldType> |
<!-- Less flexible matching, but less false matches. Probably not ideal for product names, |
but may be good for SKUs. Can insert dashes in the wrong place and still match. --> |
<dynamicField name="*_txt_en_split_tight" type="text_en_splitting_tight" indexed="true" stored="true"/> |
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> |
<analyzer type="index"> |
<tokenizer class="solr.WhitespaceTokenizerFactory"/> |
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/> |
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/> |
<filter class="solr.LowerCaseFilterFactory"/> |
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
<filter class="solr.EnglishMinimalStemFilterFactory"/> |
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes |
possible with WordDelimiterGraphFilter in conjunction with stemming. --> |
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/> |
<filter class="solr.FlattenGraphFilterFactory" /> |
</analyzer> |
<analyzer type="query"> |
<tokenizer class="solr.WhitespaceTokenizerFactory"/> |
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/> |
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/> |
<filter class="solr.LowerCaseFilterFactory"/> |
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
<filter class="solr.EnglishMinimalStemFilterFactory"/> |
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes |
possible with WordDelimiterGraphFilter in conjunction with stemming. --> |
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/> |
</analyzer> |
</fieldType> |
<!-- Just like text_general except it reverses the characters of |
each token, to enable more efficient leading wildcard queries. |
--> |
<dynamicField name="*_txt_rev" type="text_general_rev" indexed="true" stored="true"/> |
<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100"> |
<analyzer type="index"> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> |
<filter class="solr.LowerCaseFilterFactory"/> |
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" |
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/> |
</analyzer> |
<analyzer type="query"> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> |
<filter class="solr.LowerCaseFilterFactory"/> |
</analyzer> |
</fieldType> |
<dynamicField name="*_phon_en" type="phonetic_en" indexed="true" stored="true"/> |
<fieldType name="phonetic_en" stored="false" indexed="true" class="solr.TextField" > |
<analyzer> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/> |
</analyzer> |
</fieldType> |
<!-- lowercases the entire field value, keeping it as a single token. --> |
<dynamicField name="*_s_lower" type="lowercase" indexed="true" stored="true"/> |
<fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100"> |
<analyzer> |
<tokenizer class="solr.KeywordTokenizerFactory"/> |
<filter class="solr.LowerCaseFilterFactory" /> |
</analyzer> |
</fieldType> |
<!-- |
Example of using PathHierarchyTokenizerFactory at index time, so |
queries for paths match documents at that path, or in descendent paths |
--> |
<dynamicField name="*_descendent_path" type="descendent_path" indexed="true" stored="true"/> |
<fieldType name="descendent_path" class="solr.TextField"> |
<analyzer type="index"> |
<tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" /> |
</analyzer> |
<analyzer type="query"> |
<tokenizer class="solr.KeywordTokenizerFactory" /> |
</analyzer> |
</fieldType> |
<!-- |
Example of using PathHierarchyTokenizerFactory at query time, so |
queries for paths match documents at that path, or in ancestor paths |
--> |
<dynamicField name="*_ancestor_path" type="ancestor_path" indexed="true" stored="true"/> |
<fieldType name="ancestor_path" class="solr.TextField"> |
<analyzer type="index"> |
<tokenizer class="solr.KeywordTokenizerFactory" /> |
</analyzer> |
<analyzer type="query"> |
<tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" /> |
</analyzer> |
</fieldType> |
<!-- This point type indexes the coordinates as separate fields (subFields) |
If subFieldType is defined, it references a type, and a dynamic field |
definition is created matching *___<typename>. Alternately, if |
subFieldSuffix is defined, that is used to create the subFields. |
Example: if subFieldType="double", then the coordinates would be |
indexed in fields myloc_0___double,myloc_1___double. |
Example: if subFieldSuffix="_d" then the coordinates would be indexed |
in fields myloc_0_d,myloc_1_d |
The subFields are an implementation detail of the fieldType, and end |
users normally should not need to know about them. |
--> |
<dynamicField name="*_point" type="point" indexed="true" stored="true"/> |
<fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/> |
<!-- A specialized field for geospatial search filters and distance sorting. --> |
<fieldType name="location" class="solr.LatLonPointSpatialField" docValues="true"/> |
<!-- A geospatial field type that supports multiValued and polygon shapes. |
For more information about this and other spatial fields see: |
http://lucene.apache.org/solr/guide/spatial-search.html |
--> |
<fieldType name="location_rpt" class="solr.SpatialRecursivePrefixTreeFieldType" |
geo="true" distErrPct="0.025" maxDistErr="0.001" distanceUnits="kilometers" /> |
<!-- Payloaded field types --> |
<fieldType name="delimited_payloads_float" stored="false" indexed="true" class="solr.TextField"> |
<analyzer> |
<tokenizer class="solr.WhitespaceTokenizerFactory"/> |
<filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/> |
</analyzer> |
</fieldType> |
<fieldType name="delimited_payloads_int" stored="false" indexed="true" class="solr.TextField"> |
<analyzer> |
<tokenizer class="solr.WhitespaceTokenizerFactory"/> |
<filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="integer"/> |
</analyzer> |
</fieldType> |
<fieldType name="delimited_payloads_string" stored="false" indexed="true" class="solr.TextField"> |
<analyzer> |
<tokenizer class="solr.WhitespaceTokenizerFactory"/> |
<filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="identity"/> |
</analyzer> |
</fieldType> |
<!-- Portuguese text analysis.
     Used by the "text" field and by the *_txt_pt dynamic fields. The single
     <analyzer> has no type attribute, so the same chain is applied when
     indexing and when querying:
       1. HTMLStripCharFilterFactory removes HTML markup from the raw input.
       2. StandardTokenizerFactory splits the text into tokens.
       3. LowerCaseFilterFactory lower-cases every token.
       4. StopFilterFactory drops the words listed in lang/stopwords_pt.txt;
          that list is in Snowball format, hence format="snowball".
       5. PortugueseLightStemFilterFactory stems the remaining tokens; the
          commented-out filters below are alternative stemmers.
--> |
<dynamicField name="*_txt_pt" type="text_pt" indexed="true" stored="true"/> |
<fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100"> |
<analyzer> |
<charFilter class="solr.HTMLStripCharFilterFactory"/> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.LowerCaseFilterFactory"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" /> |
<filter class="solr.PortugueseLightStemFilterFactory"/> |
<!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> --> |
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> --> |
<!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> --> |
</analyzer> |
</fieldType> |
<!-- Similarity is the scoring routine for each document vs. a query. |
A custom Similarity or SimilarityFactory may be specified here, but |
the default is fine for most applications. |
For more info: http://lucene.apache.org/solr/guide/other-schema-elements.html#OtherSchemaElements-Similarity |
--> |
<!-- |
<similarity class="com.example.solr.CustomSimilarityFactory"> |
<str name="paramkey">param value</str> |
</similarity> |
--> |
</schema> |
@ -0,0 +1,20 @@ |
{"params":{ |
"query":{ |
"defType":"edismax", |
"q.alt":"*:*", |
"rows":"10", |
"fl":"*,score", |
"":{"v":0} |
}, |
"facets":{ |
"facet":"on", |
"facet.mincount": "1", |
"":{"v":0} |
}, |
"velocity":{ |
"wt": "velocity", |
"v.template":"browse", |
"v.layout": "layout", |
"":{"v":0} |
} |
}} |
@ -0,0 +1,21 @@ |
# The ASF licenses this file to You under the Apache License, Version 2.0 |
# (the "License"); you may not use this file except in compliance with |
# the License. You may obtain a copy of the License at |
# |
# http://www.apache.org/licenses/LICENSE-2.0 |
# |
# Unless required by applicable law or agreed to in writing, software |
# distributed under the License is distributed on an "AS IS" BASIS, |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
# See the License for the specific language governing permissions and |
# limitations under the License. |
#----------------------------------------------------------------------- |
# Use a protected word file to protect against the stemmer reducing two |
# unrelated words to the same base word. |
# Some non-words that normally won't be encountered, |
# just to test that they won't be stemmed. |
dontstems |
zwhacky |
Binary file not shown.
@ -0,0 +1,165 @@ |
<?xml version="1.0" ?> |
<!-- |
Licensed to the Apache Software Foundation (ASF) under one or more |
contributor license agreements. See the NOTICE file distributed with |
this work for additional information regarding copyright ownership. |
The ASF licenses this file to You under the Apache License, Version 2.0 |
(the "License"); you may not use this file except in compliance with |
the License. You may obtain a copy of the License at |
http://www.apache.org/licenses/LICENSE-2.0 |
Unless required by applicable law or agreed to in writing, software |
distributed under the License is distributed on an "AS IS" BASIS, |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
See the License for the specific language governing permissions and |
limitations under the License. |
--> |
<schema name="default" version="1.6"> |
<types> |
<fieldtype name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/> |
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/> |
<fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/> |
<fieldtype name="binary" class="solr.BinaryField"/> |
<!-- Numeric field types that manipulate the value into |
a string value that isn't human-readable in its internal form, |
but with a lexicographic ordering the same as the numeric ordering, |
so that range queries work correctly. --> |
<fieldType name="pint" class="solr.IntPointField" docValues="true" /> |
<fieldType name="pfloat" class="solr.FloatPointField" docValues="true" /> |
<fieldType name="plong" class="solr.LongPointField" docValues="true" /> |
<fieldType name="pdouble" class="solr.DoublePointField" docValues="true"/> |
<fieldType name="pdate" class="solr.DatePointField" docValues="true" /> |
<!-- A Trie based date field for faster date range queries and date faceting. --> |
<fieldType name="pints" class="solr.IntPointField" docValues="true" multiValued="true"/> |
<fieldType name="pfloats" class="solr.FloatPointField" docValues="true" multiValued="true"/> |
<fieldType name="plongs" class="solr.LongPointField" docValues="true" multiValued="true"/> |
<fieldType name="pdoubles" class="solr.DoublePointField" docValues="true" multiValued="true"/> |
<fieldType name="pdates" class="solr.DatePointField" docValues="true" multiValued="true"/> |
<fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/> |
<fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/> |
<fieldtype name="geohash" class="solr.GeoHashField"/> |
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100"> |
<analyzer type="index"> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> |
<!-- in this example, we will only use synonyms at query time |
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> |
--> |
<filter class="solr.LowerCaseFilterFactory"/> |
</analyzer> |
<analyzer type="query"> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> |
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
<filter class="solr.LowerCaseFilterFactory"/> |
</analyzer> |
</fieldType> |
<!-- Portuguese --> |
<dynamicField name="*_txt_pt" type="text_pt" indexed="true" stored="true"/> |
<fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100"> |
<analyzer> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.LowerCaseFilterFactory"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" /> |
<filter class="solr.PortugueseLightStemFilterFactory"/> |
<!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> --> |
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> --> |
<!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> --> |
</analyzer> |
</fieldType> |
<fieldType name="text_en" class="solr.TextField" positionIncrementGap="100"> |
<analyzer type="index"> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.StopFilterFactory" |
ignoreCase="true" |
words="lang/stopwords_en.txt" |
/> |
<filter class="solr.LowerCaseFilterFactory"/> |
<filter class="solr.EnglishPossessiveFilterFactory"/> |
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: |
<filter class="solr.EnglishMinimalStemFilterFactory"/> |
--> |
<filter class="solr.PorterStemFilterFactory"/> |
</analyzer> |
<analyzer type="query"> |
<tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
<filter class="solr.StopFilterFactory" |
ignoreCase="true" |
words="lang/stopwords_en.txt" |
/> |
<filter class="solr.LowerCaseFilterFactory"/> |
<filter class="solr.EnglishPossessiveFilterFactory"/> |
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: |
<filter class="solr.EnglishMinimalStemFilterFactory"/> |
--> |
<filter class="solr.PorterStemFilterFactory"/> |
</analyzer> |
</fieldType> |
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"> |
<analyzer> |
<tokenizer class="solr.WhitespaceTokenizerFactory"/> |
</analyzer> |
</fieldType> |
<fieldType name="ngram" class="solr.TextField" > |
<analyzer type="index"> |
<tokenizer class="solr.KeywordTokenizerFactory"/> |
<filter class="solr.LowerCaseFilterFactory"/> |
<filter class="solr.NGramFilterFactory" minGramSize="3" maxGramSize="15" /> |
</analyzer> |
<analyzer type="query"> |
<tokenizer class="solr.KeywordTokenizerFactory"/> |
<filter class="solr.LowerCaseFilterFactory"/> |
</analyzer> |
</fieldType> |
<fieldType name="edge_ngram" class="solr.TextField" positionIncrementGap="1"> |
<analyzer type="index"> |
<tokenizer class="solr.WhitespaceTokenizerFactory" /> |
<filter class="solr.LowerCaseFilterFactory" /> |
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> |
<filter class="solr.EdgeNGramFilterFactory" minGramSize="2" maxGramSize="15" /> |
</analyzer> |
<analyzer type="query"> |
<tokenizer class="solr.WhitespaceTokenizerFactory" /> |
<filter class="solr.LowerCaseFilterFactory" /> |
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> |
</analyzer> |
</fieldType> |
</types> |
<fields> |
<!-- general --> |
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/> |
<field name="django_ct" type="string" indexed="true" stored="true" multiValued="false"/> |
<field name="django_id" type="string" indexed="true" stored="true" multiValued="false"/> |
<field name="_version_" type="plong" indexed="true" stored ="true"/> |
<field name="text" type="text_pt" indexed="true" stored="true" multiValued="false" /> |
</fields> |
<!-- field to use to determine and enforce document uniqueness. --> |
<uniqueKey>id</uniqueKey> |
<!-- field for the QueryParser to use when an explicit fieldname is absent --> |
<df>text</df> |
<!-- SolrQueryParser configuration: defaultOperator="AND|OR" --> |
<solrQueryParser q.op="AND"/> |
</schema> |
File diff suppressed because it is too large
@ -0,0 +1,14 @@ |
# Licensed to the Apache Software Foundation (ASF) under one or more |
# contributor license agreements. See the NOTICE file distributed with |
# this work for additional information regarding copyright ownership. |
# The ASF licenses this file to You under the Apache License, Version 2.0 |
# (the "License"); you may not use this file except in compliance with |
# the License. You may obtain a copy of the License at |
# |
# http://www.apache.org/licenses/LICENSE-2.0 |
# |
# Unless required by applicable law or agreed to in writing, software |
# distributed under the License is distributed on an "AS IS" BASIS, |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
# See the License for the specific language governing permissions and |
# limitations under the License. |
@ -0,0 +1,29 @@ |
# The ASF licenses this file to You under the Apache License, Version 2.0 |
# (the "License"); you may not use this file except in compliance with |
# the License. You may obtain a copy of the License at |
# |
# http://www.apache.org/licenses/LICENSE-2.0 |
# |
# Unless required by applicable law or agreed to in writing, software |
# distributed under the License is distributed on an "AS IS" BASIS, |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
# See the License for the specific language governing permissions and |
# limitations under the License. |
#----------------------------------------------------------------------- |
#some test synonym mappings unlikely to appear in real input text |
aaafoo => aaabar |
bbbfoo => bbbfoo bbbbar |
cccfoo => cccbar cccbaz |
fooaaa,baraaa,bazaaa |
# Some synonym groups specific to this example |
GB,gib,gigabyte,gigabytes |
MB,mib,megabyte,megabytes |
Television, Televisions, TV, TVs |
#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming |
#after us won't split it into two words. |
# Synonym mappings can be used for spelling correction too |
pixima => pixma |
@ -0,0 +1,155 @@ |
import requests |
import subprocess |
import sys |
import argparse |
class SolrClient:
    """Small client for the Solr HTTP endpoints used to manage a collection.

    Every URL template below takes the Solr base URL as its first ``{}``
    placeholder; the remaining placeholders are filled positionally in the
    order in which they appear in the template.
    """

    LIST_CONFIGSETS = "{}/solr/admin/configs?action=LIST&omitHeader=true&wt=json"
    UPLOAD_CONFIGSET = "{}/solr/admin/configs?action=UPLOAD&name={}&wt=json"
    LIST_COLLECTIONS = "{}/solr/admin/collections?action=LIST&wt=json"
    STATUS_COLLECTION = "{}/solr/admin/collections?action=CLUSTERSTATUS&collection={}&wt=json"
    # NOTE(review): unlike the other templates this one has no "/solr" prefix;
    # it is not used in this class -- confirm the path before relying on it.
    STATUS_CORE = "{}/admin/cores?action=STATUS&name={}"
    EXISTS_COLLECTION = "{}/solr/{}/admin/ping?wt=json"
    OPTIMIZE_COLLECTION = "{}/solr/{}/update?optimize=true&wt=json"
    # Placeholders: base URL, collection name, configset name, number of
    # shards, replication factor, max shards per node.
    CREATE_COLLECTION = "{}/solr/admin/collections?action=CREATE&name={}&collection.configName={}&numShards={}&replicationFactor={}&maxShardsPerNode={}&wt=json"
    DELETE_COLLECTION = "{}/solr/admin/collections?action=DELETE&name={}&wt=json"
    DELETE_DATA = "{}/solr/{}/update?commitWithin=1000&overwrite=true&wt=json"
    QUERY_DATA = "{}/solr/{}/select?q=*:*"

    # Name under which the configset is registered on the Solr server.
    CONFIGSET_NAME = "sapl_configset"

    def __init__(self, url):
        """Store the Solr base URL (e.g. ``http://host:8983``)."""
        self.url = url

    def get_num_docs(self, collection_name):
        """Return ``numFound`` for a match-all query on ``collection_name``."""
        final_url = self.QUERY_DATA.format(self.url, collection_name)
        res = requests.get(final_url)
        dic = res.json()
        return dic["response"]["numFound"]

    def list_collections(self):
        """Return the ``collections`` entry of the server's LIST response."""
        req_url = self.LIST_COLLECTIONS.format(self.url)
        res = requests.get(req_url)
        dic = res.json()
        return dic['collections']

    def exists_collection(self, collection_name):
        """Return True when ``collection_name`` is listed by the server."""
        return collection_name in self.list_collections()

    def maybe_upload_configset(self, force=False):
        """Upload the configset zip unless the server already lists it.

        :param force: when true, upload even if a configset named
            ``CONFIGSET_NAME`` is already present on the server.
        """
        req_url = self.LIST_CONFIGSETS.format(self.url)
        res = requests.get(req_url)
        dic = res.json()
        configsets = dic['configSets']

        # UPLOAD configset
        if force or self.CONFIGSET_NAME not in configsets:
            # The context manager guarantees the zip handle is closed even
            # when the upload request raises.
            with open('./solr/sapl_configset/conf/saplconfigset.zip',
                      'rb') as zip_file:
                files = {'file': ('saplconfigset.zip',
                                  zip_file,
                                  'application/octet-stream',
                                  {'Expires': '0'})}
                req_url = self.UPLOAD_CONFIGSET.format(self.url,
                                                       self.CONFIGSET_NAME)
                resp = requests.post(req_url, files=files)
            print(resp.content)
        else:
            print('O %s já presente no servidor, NÃO enviando.' % self.CONFIGSET_NAME)

    def create_collection(self, collection_name, shards=1, replication_factor=1, max_shards_per_node=1):
        """Create ``collection_name`` on the server using ``CONFIGSET_NAME``.

        The configset is uploaded first if the server does not have it yet.

        :param collection_name: name of the collection to create.
        :param shards: value sent as ``numShards``.
        :param replication_factor: value sent as ``replicationFactor``.
        :param max_shards_per_node: value sent as ``maxShardsPerNode``.
        :return: True when the server answered with a success status,
            False otherwise (the error is printed).
        """
        self.maybe_upload_configset()
        # The template has six placeholders; the configset name must be
        # supplied for ``collection.configName`` or ``format`` raises
        # IndexError and the remaining values land in the wrong parameters.
        req_url = self.CREATE_COLLECTION.format(self.url,
                                                collection_name,
                                                self.CONFIGSET_NAME,
                                                shards,
                                                replication_factor,
                                                max_shards_per_node)
        res = requests.post(req_url)
        if res.ok:
            print("Collection '%s' created succesfully" % collection_name)
        else:
            print("Error creating collection '%s'" % collection_name)
            as_json = res.json()
            print("Error %s: %s" % (res.status_code, as_json['error']['msg']))
            return False
        return True

    def delete_collection(self, collection_name):
        """Delete one collection, or every listed collection when given ``'*'``."""
        if collection_name == '*':
            collections = self.list_collections()
        else:
            collections = [collection_name]

        for c in collections:
            req_url = self.DELETE_COLLECTION.format(self.url, c)
            res = requests.post(req_url)
            if not res.ok:
                # Interpolate with ``%``; passing the name as a second
                # print() argument would leave the literal '%s' in the output.
                print("Error deleting collection '%s'" % c)
                print("Code {}: {}".format(res.status_code, res.text))
            else:
                print("Collection '%s' deleted successfully!" % c)

    def delete_index_data(self, collection_name):
        """Delete every document of ``collection_name`` (delete-by-query ``*:*``)."""
        req_url = self.DELETE_DATA.format(self.url, collection_name)
        res = requests.post(req_url,
                            data='<delete><query>*:*</query></delete>',
                            headers={'Content-Type': 'application/xml'})
        if not res.ok:
            print("Error deleting index for collection '%s'" % collection_name)
            print("Code {}: {}".format(res.status_code, res.text))
        else:
            print("Collection '%s' data deleted successfully!" % collection_name)

            # Report how many documents the collection holds after the delete.
            num_docs = self.get_num_docs(collection_name)
            print("Num docs: %s" % num_docs)
if __name__ == '__main__':
    # Command-line entry point: make sure the requested collection exists on
    # the Solr server (creating it when missing) and rebuild the search index
    # when the collection holds no documents.
    parser = argparse.ArgumentParser(description='Cria uma collection no Solr')

    # required arguments
    parser.add_argument('-u', type=str, metavar='URL', nargs=1, dest='url',
                        required=True, help='Endereço do servidor Solr na forma http(s)://<address>[:port]')
    parser.add_argument('-c', type=str, metavar='COLLECTION', dest='collection', nargs=1,
                        required=True, help='Collection Solr a ser criada')

    # optional arguments
    parser.add_argument('-s', type=int, dest='shards', nargs='?',
                        help='Number of shards (default=1)', default=1)
    parser.add_argument('-rf', type=int, dest='replication_factor', nargs='?',
                        help='Replication factor (default=1)', default=1)
    parser.add_argument('-ms', type=int, dest='max_shards_per_node', nargs='?',
                        help='Max shards per node (default=1)', default=1)

    try:
        args = parser.parse_args()
    except IOError as msg:
        # parser.error() prints the usage message and exits with status 2,
        # so no explicit sys.exit() is needed after it.
        parser.error(str(msg))

    # nargs=1 stores each required value as a one-element list.
    url = args.url.pop()
    collection = args.collection.pop()

    client = SolrClient(url=url)

    if not client.exists_collection(collection):
        print("Collection '%s' doesn't exists. Creating a new one..." % collection)
        created = client.create_collection(collection,
                                           shards=args.shards,
                                           replication_factor=args.replication_factor,
                                           max_shards_per_node=args.max_shards_per_node)
        if not created:
            sys.exit(-1)
    else:
        print("Collection '%s' exists." % collection)

    # An empty collection (freshly created or not) needs its index rebuilt.
    num_docs = client.get_num_docs(collection)
    if num_docs == 0:
        print("Performing a full reindex of '%s' collection..." % collection)
        return_code = subprocess.call(["python3", "manage.py", "rebuild_index", "--noinput"])
        if return_code != 0:
            # Surface a failed reindex instead of silently exiting with 0.
            print("Reindex of '%s' collection failed with exit code %s" % (collection, return_code))
            sys.exit(return_code)
Reference in new issue