Fixes #2055 - Busca Textual (#2179)

Commit 90dec5fd76 by Edward, committed by GitHub.
Changed files (lines changed):

  1. Dockerfile (5)
  2. sapl/base/search_indexes.py (72)
  3. sapl/materia/views.py (2)
  4. sapl/norma/views.py (2)
  5. sapl/settings.py (21)
  6. sapl/templates/materia/materialegislativa_filter.html (10)
  7. sapl/templates/norma/normajuridica_filter.html (10)
  8. solr/docker-compose.yml (61)
  9. solr/sapl_configset/conf/lang/stopwords_en.txt (54)
  10. solr/sapl_configset/conf/lang/stopwords_pt.txt (253)
  11. solr/sapl_configset/conf/managed-schema (573)
  12. solr/sapl_configset/conf/params.json (20)
  13. solr/sapl_configset/conf/protwords.txt (21)
  14. solr/sapl_configset/conf/saplconfigset.zip (BIN)
  15. solr/sapl_configset/conf/schema.xml (165)
  16. solr/sapl_configset/conf/solrconfig.xml (1367)
  17. solr/sapl_configset/conf/stopwords.txt (14)
  18. solr/sapl_configset/conf/synonyms.txt (29)
  19. solr_api.py (155)
  20. start.sh (18)

Dockerfile (5 changed lines)

@@ -1,8 +1,9 @@
FROM alpine:3.8
ENV BUILD_PACKAGES postgresql-dev graphviz-dev graphviz build-base git pkgconfig \
python3-dev libxml2-dev jpeg-dev libressl-dev libffi-dev libxslt-dev \
nodejs npm py3-lxml py3-magic postgresql-client poppler-utils antiword vim openssh-client
python3-dev libxml2-dev jpeg-dev libressl-dev libffi-dev libxslt-dev \
nodejs npm py3-lxml py3-magic postgresql-client poppler-utils antiword \
curl jq openssh-client vim
RUN apk update --update-cache && apk upgrade

sapl/base/search_indexes.py (72 changed lines)

@@ -1,6 +1,4 @@
import os.path
import re
import string
import textract
import logging
@@ -8,6 +6,7 @@ from django.db.models import F, Q, Value
from django.db.models.fields import TextField
from django.db.models.functions import Concat
from django.template import loader
from haystack import connections
from haystack.constants import Indexable
from haystack.fields import CharField
from haystack.indexes import SearchIndex
@@ -24,6 +23,7 @@ from sapl.utils import RemoveTag
class TextExtractField(CharField):
backend = None
logger = logging.getLogger(__name__)
def __init__(self, **kwargs):
@@ -34,24 +34,20 @@ class TextExtractField(CharField):
self.model_attr = (self.model_attr, )
def solr_extraction(self, arquivo):
extracted_data = self._get_backend(None).extract_file_contents(
arquivo)['contents']
# Remove XML tags
self.logger.debug("Removendo as tags xml.")
extracted_data = re.sub('<[^>]*>', '', extracted_data)
# Remove \t and \n tags
self.logger.debug("Removendo as \t e \n.")
extracted_data = extracted_data.replace(
'\n', ' ').replace('\t', ' ')
# Remove punctuation marks
self.logger.debug("Removendo sinais de pontuação.")
extracted_data = re.sub('[' + string.punctuation + ']',
' ', extracted_data)
# Remove multiple spaces
self.logger.debugger("Removendo espaços múltiplos.")
extracted_data = " ".join(extracted_data.split())
return extracted_data
if not self.backend:
self.backend = connections['default'].get_backend()
try:
with open(arquivo.path, 'rb') as f:
content = self.backend.extract_file_contents(f)
if not content or not content['contents']:
return ''
data = content['contents']
except Exception as e:
print('erro processando arquivo: %s' % arquivo.path)
self.logger.error(arquivo.path)
self.logger.error('erro processando arquivo: %s' % arquivo.path)
data = ''
return data
def whoosh_extraction(self, arquivo):
@@ -66,11 +62,11 @@ class TextExtractField(CharField):
language='pt-br').decode('utf-8').replace('\n', ' ').replace(
'\t', ' ')
def print_error(self, arquivo):
self.logger.error("Erro inesperado processando arquivo: {}".format(arquivo.path))
msg = 'Erro inesperado processando arquivo: %s' % (
arquivo.path)
print(msg)
def print_error(self, arquivo, error):
msg = 'Erro inesperado processando arquivo %s erro: %s' % (
arquivo.path, error)
print(msg)
self.logger.error(msg)
def file_extractor(self, arquivo):
if not os.path.exists(arquivo.path) or \
@@ -81,9 +77,9 @@ class TextExtractField(CharField):
if SOLR_URL:
try:
return self.solr_extraction(arquivo)
except Exception as e:
self.logger.error("Erro no arquivo {}. ".format(arquivo.path) + str(e))
self.print_error(arquivo)
except Exception as err:
print(str(err))
self.print_error(arquivo, err)
# In a DEV environment, Whoosh is used
# Since Whoosh has no extraction support, textract is used instead
@@ -91,13 +87,13 @@ class TextExtractField(CharField):
try:
self.logger.debug("Tentando whoosh_extraction no arquivo {}".format(arquivo.path))
return self.whoosh_extraction(arquivo)
except ExtensionNotSupported as e:
self.logger.error("Erro no arquivo {}".format(arquivo.path) + str(e))
print(str(e))
except Exception as e2:
self.logger.error(str(e))
print(str(e2))
self.print_error(arquivo)
except ExtensionNotSupported as err:
print(str(err))
self.logger.error(str(err))
except Exception as err:
print(str(err))
self.print_error(arquivo, str(err))
return ''
def ta_extractor(self, value):
@@ -133,7 +129,9 @@ class TextExtractField(CharField):
value = getattr(obj, attr)
if not value:
continue
data += getattr(self, func)(value)
data += getattr(self, func)(value) + ' '
data = data.replace('\n', ' ')
return data
@@ -159,6 +157,10 @@ class DocumentoAcessorioIndex(SearchIndex, Indexable):
)
)
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.text.search_index = self
def get_model(self):
return self.model
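In production (when SOLR_URL is set), the heavy lifting is delegated to Solr itself: Haystack's Solr backend posts the file to Solr's extracting handler (Tika) and returns the parsed contents. A minimal sketch of that extraction path, assuming HAYSTACK_CONNECTIONS['default'] points at a Solr backend (the function name is illustrative):

    # Minimal sketch of the Solr extraction path used by TextExtractField above.
    from haystack import connections

    def extract_contents(path):
        backend = connections['default'].get_backend()
        with open(path, 'rb') as f:
            # Sends the file to Solr's /update/extract handler; returns a dict
            # with 'contents' and 'metadata', or None on failure.
            content = backend.extract_file_contents(f)
        return content['contents'] if content and content['contents'] else ''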

sapl/materia/views.py (2 changed lines)

@@ -1810,6 +1810,8 @@ class MateriaLegislativaPesquisaView(FilterView):
context['show_results'] = show_results_filter_set(qr)
context['USE_SOLR'] = settings.USE_SOLR if hasattr(settings, 'USE_SOLR') else False
return context

sapl/norma/views.py (2 changed lines)

@@ -15,6 +15,7 @@ from django.views.generic import TemplateView, UpdateView
from django.views.generic.base import RedirectView
from django.views.generic.edit import FormView
from django_filters.views import FilterView
from sapl import settings
from sapl.base.models import AppConfig
from sapl.compilacao.views import IntegracaoTaView
from sapl.crud.base import (RP_DETAIL, RP_LIST, Crud, CrudAux,
@@ -107,6 +108,7 @@ class NormaPesquisaView(FilterView):
context['filter_url'] = ('&' + qr.urlencode()) if len(qr) > 0 else ''
context['show_results'] = show_results_filter_set(qr)
context['USE_SOLR'] = settings.USE_SOLR if hasattr(settings, 'USE_SOLR') else False
return context

sapl/settings.py (21 changed lines)

@@ -100,23 +100,28 @@ INSTALLED_APPS = (
# FTS = Full Text Search
# Disables textual indexing until we find a solution for the issue
# https://github.com/interlegis/sapl/issues/2055
#HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor'
HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.BaseSignalProcessor'
#HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.BaseSignalProcessor' # Disable auto index
HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor'
SEARCH_BACKEND = 'haystack.backends.whoosh_backend.WhooshEngine'
SEARCH_URL = ('PATH', PROJECT_DIR.child('whoosh'))
SOLR_URL = config('SOLR_URL', cast=str, default='')
if SOLR_URL:
# SOLR
USE_SOLR = config('USE_SOLR', cast=bool, default=False)
SOLR_URL = config('SOLR_URL', cast=str, default='http://localhost:8983')
SOLR_COLLECTION = config('SOLR_COLLECTION', cast=str, default='sapl')
if USE_SOLR:
SEARCH_BACKEND = 'haystack.backends.solr_backend.SolrEngine'
SEARCH_URL = ('URL', config('SOLR_URL', cast=str))
# ...or for multicore...
# 'URL': 'http://127.0.0.1:8983/solr/mysite',
SEARCH_URL = ('URL', '{}/solr/{}'.format(SOLR_URL, SOLR_COLLECTION))
# BATCH_SIZE: default is 1000 if omitted, avoid Too Large Entity Body errors
HAYSTACK_CONNECTIONS = {
'default': {
'ENGINE': SEARCH_BACKEND,
SEARCH_URL[0]: SEARCH_URL[1]
SEARCH_URL[0]: SEARCH_URL[1],
'BATCH_SIZE': 500,
'TIMEOUT': 60,
},
}
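With USE_SOLR=True and the default environment values, the connection dictionary above resolves to something like the following sketch (values follow the SOLR_URL and SOLR_COLLECTION env vars; BATCH_SIZE is lowered from Haystack's default of 1000 to avoid "Too Large Entity Body" errors):

    # Effective Haystack connection when USE_SOLR=True (illustrative values).
    HAYSTACK_CONNECTIONS = {
        'default': {
            'ENGINE': 'haystack.backends.solr_backend.SolrEngine',
            'URL': 'http://localhost:8983/solr/sapl',
            'BATCH_SIZE': 500,  # Haystack's default is 1000
            'TIMEOUT': 60,
        },
    }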

sapl/templates/materia/materialegislativa_filter.html (10 changed lines)

@@ -3,11 +3,13 @@
{% load crispy_forms_tags %}
{% block actions %}
<div class="actions btn-group pull-right" role="group">
<!--
<a href="{% url 'sapl.base:haystack_search' %}" class="btn btn-default">
Pesquisa Textual
</a> -->
{% if USE_SOLR %}
<a href="{% url 'sapl.base:haystack_search' %}" class="btn btn-default">
Pesquisa Textual
</a>
{% endif %}
{% if perms.materia.add_materialegislativa %}
<a href="{% url 'sapl.materia:materialegislativa_create' %}" class="btn btn-default">

sapl/templates/norma/normajuridica_filter.html (10 changed lines)

@@ -4,11 +4,11 @@
{% block actions %}
<div class="actions btn-group pull-right" role="group">
<!--
<a href="{% url 'sapl.base:haystack_search' %}" class="btn btn-default">
Pesquisa Textual
</a>
-->
{% if USE_SOLR %}
<a href="{% url 'sapl.base:haystack_search' %}" class="btn btn-default">
Pesquisa Textual
</a>
{% endif %}
{% if perms.norma.add_normajuridica %}

solr/docker-compose.yml (61 changed lines)

@@ -0,0 +1,61 @@
version: '2'
services:
sapldb:
image: postgres:10.5-alpine
restart: always
environment:
POSTGRES_PASSWORD: sapl
POSTGRES_USER: sapl
POSTGRES_DB: sapl
PGDATA: /var/lib/postgresql/data/
volumes:
- sapldb_data:/var/lib/postgresql/data/
ports:
- "5432:5432"
saplsolr:
image: solr:7.4-alpine
restart: always
command: bin/solr start -c -f
volumes:
- solr_data:/opt/solr/server/solr
- solr_configsets:/opt/solr/server/solr/configsets
ports:
- "8983:8983"
sapl:
image: interlegis/sapl:3.1.138
# build: .
restart: always
environment:
ADMIN_PASSWORD: interlegis
ADMIN_EMAIL: email@dominio.net
DEBUG: 'False'
USE_TLS: 'False'
EMAIL_PORT: 587
EMAIL_HOST: smtp.dominio.net
EMAIL_HOST_USER: usuariosmtp
EMAIL_HOST_PASSWORD: senhasmtp
USE_SOLR: 'True'
#SOLR_COLLECTION: sapl
#SOLR_HOST: saplsolr
SOLR_URL: http://saplsolr:8983/solr/sapl
TZ: America/Sao_Paulo
volumes:
- sapl_data:/var/interlegis/sapl/data
- sapl_media:/var/interlegis/sapl/media
- sapl_root:/var/interlegis/sapl
volumes_from:
- saplsolr
depends_on:
- sapldb
- saplsolr
ports:
- "80:80"
volumes:
sapldb_data:
sapl_data:
sapl_media:
sapl_root:
solr_data:
solr_configsets:
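Once the stack is up (docker-compose up -d), the Solr node can be checked through the Collections API, the same endpoint solr_api.py (below) uses to list collections. A hypothetical readiness check:

    # Hypothetical readiness check for the Solr container defined above.
    import requests

    res = requests.get('http://localhost:8983/solr/admin/collections',
                       params={'action': 'LIST', 'wt': 'json'})
    res.raise_for_status()
    print(res.json().get('collections', []))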

solr/sapl_configset/conf/lang/stopwords_en.txt (54 changed lines)

@@ -0,0 +1,54 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# a couple of test stopwords to test that the words are really being
# configured from this file:
stopworda
stopwordb
# Standard english stop words taken from Lucene's StopAnalyzer
a
an
and
are
as
at
be
but
by
for
if
in
into
is
it
no
not
of
on
or
such
that
the
their
then
there
these
they
this
to
was
will
with

solr/sapl_configset/conf/lang/stopwords_pt.txt (253 changed lines)

@@ -0,0 +1,253 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Portuguese stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| The following is a ranked list (commonest to rarest) of stopwords
| deriving from a large sample of text.
| Extra words have been added at the end.
de | of, from
a | the; to, at; her
o | the; him
que | who, that
e | and
do | de + o
da | de + a
em | in
um | a
para | for
| é from SER
com | with
não | not, no
uma | a
os | the; them
no | em + o
se | himself etc
na | em + a
por | for
mais | more
as | the; them
dos | de + os
como | as, like
mas | but
| foi from SER
ao | a + o
ele | he
das | de + as
| tem from TER
à | a + a
seu | his
sua | her
ou | or
| ser from SER
quando | when
muito | much
| há from HAV
nos | em + os; us
já | already, now
| está from EST
eu | I
também | also
só | only, just
pelo | per + o
pela | per + a
até | up to
isso | that
ela | she
entre | between
| era from SER
depois | after
sem | without
mesmo | same
aos | a + os
| ter from TER
seus | his
quem | whom
nas | em + as
me | me
esse | that
eles | they
| estão from EST
você | you
| tinha from TER
| foram from SER
essa | that
num | em + um
nem | nor
suas | her
meu | my
às | a + as
minha | my
| têm from TER
numa | em + uma
pelos | per + os
elas | they
| havia from HAV
| seja from SER
qual | which
| será from SER
nós | we
| tenho from TER
lhe | to him, her
deles | of them
essas | those
esses | those
pelas | per + as
este | this
| fosse from SER
dele | of him
| other words. There are many contractions such as naquele = em+aquele,
| mo = me+o, but they are rare.
| Indefinite article plural forms are also rare.
tu | thou
te | thee
vocês | you (plural)
vos | you
lhes | to them
meus | my
minhas
teu | thy
tua
teus
tuas
nosso | our
nossa
nossos
nossas
dela | of her
delas | of them
esta | this
estes | these
estas | these
aquele | that
aquela | that
aqueles | those
aquelas | those
isto | this
aquilo | that
| forms of estar, to be (not including the infinitive):
estou
está
estamos
estão
estive
esteve
estivemos
estiveram
estava
estávamos
estavam
estivera
estivéramos
esteja
estejamos
estejam
estivesse
estivéssemos
estivessem
estiver
estivermos
estiverem
| forms of haver, to have (not including the infinitive):
hei
havemos
hão
houve
houvemos
houveram
houvera
houvéramos
haja
hajamos
hajam
houvesse
houvéssemos
houvessem
houver
houvermos
houverem
houverei
houverá
houveremos
houverão
houveria
houveríamos
houveriam
| forms of ser, to be (not including the infinitive):
sou
somos
são
era
éramos
eram
fui
foi
fomos
foram
fora
fôramos
seja
sejamos
sejam
fosse
fôssemos
fossem
for
formos
forem
serei
será
seremos
serão
seria
seríamos
seriam
| forms of ter, to have (not including the infinitive):
tenho
tem
temos
tém
tinha
tínhamos
tinham
tive
teve
tivemos
tiveram
tivera
tivéramos
tenha
tenhamos
tenham
tivesse
tivéssemos
tivessem
tiver
tivermos
tiverem
terei
terá
teremos
terão
teria
teríamos
teriam

solr/sapl_configset/conf/managed-schema (573 changed lines)

@@ -0,0 +1,573 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
This example schema is the recommended starting point for users.
It should be kept correct and concise, usable out-of-the-box.
For more information, on how to customize this file, please see
http://lucene.apache.org/solr/guide/documents-fields-and-schema-design.html
PERFORMANCE NOTE: this schema includes many optional features and should not
be used for benchmarking. To improve performance one could
- set stored="false" for all fields possible (esp large fields) when you
only need to search on the field but don't need to return the original
value.
- set indexed="false" if you don't need to search on the field, but only
return the field as a result of searching on other indexed fields.
- remove all unneeded copyField statements
- for best index size and searching performance, set "index" to false
for all general text fields, use copyField to copy them to the
catchall "text" field, and use that for searching.
-->
<schema name="default-config" version="1.6">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
version="x.y" is Solr's version number for the schema syntax and
semantics. It should not normally be changed by applications.
1.0: multiValued attribute did not exist, all fields are multiValued
by nature
1.1: multiValued attribute introduced, false by default
1.2: omitTermFreqAndPositions attribute introduced, true by default
except for text fields.
1.3: removed optional field compress feature
1.4: autoGeneratePhraseQueries attribute introduced to drive QueryParser
behavior when a single string produces multiple tokens. Defaults
to off for version >= 1.4
1.5: omitNorms defaults to true for primitive field types
(int, float, boolean, string...)
1.6: useDocValuesAsStored defaults to true.
-->
<!-- Valid attributes for fields:
name: mandatory - the name for the field
type: mandatory - the name of a field type from the
fieldTypes section
indexed: true if this field should be indexed (searchable or sortable)
stored: true if this field should be retrievable
docValues: true if this field should have doc values. Doc Values is
recommended (required, if you are using *Point fields) for faceting,
grouping, sorting and function queries. Doc Values will make the index
faster to load, more NRT-friendly and more memory-efficient.
They are currently only supported by StrField, UUIDField, all
*PointFields, and depending on the field type, they might require
the field to be single-valued, be required or have a default value
(check the documentation of the field type you're interested in for
more information)
multiValued: true if this field may contain multiple values per document
omitNorms: (expert) set to true to omit the norms associated with
this field (this disables length normalization and index-time
boosting for the field, and saves some memory). Only full-text
fields or fields that need an index-time boost need norms.
Norms are omitted for primitive (non-analyzed) types by default.
termVectors: [false] set to true to store the term vector for a
given field.
When using MoreLikeThis, fields used for similarity should be
stored for best performance.
termPositions: Store position information with the term vector.
This will increase storage costs.
termOffsets: Store offset information with the term vector. This
will increase storage costs.
required: The field is required. It will throw an error if the
value does not exist
default: a value that should be used if no value is specified
when adding a document.
-->
<!-- field names should consist of alphanumeric or underscore characters only and
not start with a digit. This is not currently strictly enforced,
but other field names will not have first class support from all components
and back compatibility is not guaranteed. Names with both leading and
trailing underscores (e.g. _version_) are reserved.
-->
<!-- In this _default configset, only four fields are pre-declared:
id, _version_, _text_ and _root_. All other fields will be type guessed and added via the
"add-unknown-fields-to-the-schema" update request processor chain declared in solrconfig.xml.
Note that many dynamic fields are also defined - you can use them to specify a
field's type via field naming conventions - see below.
WARNING: The _text_ catch-all field will significantly increase your index size.
If you don't need it, consider removing it and the corresponding copyField directive.
-->
<field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" />
<!-- docValues are enabled by default for long type so we don't need to index the version field -->
<field name="_version_" type="plong" indexed="false" stored="false"/>
<field name="_root_" type="string" indexed="true" stored="false" docValues="false" />
<field name="_text_" type="text_general" indexed="true" stored="false" multiValued="true"/>
<!-- Django fields -->
<field name="django_ct" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="django_id" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="text" type="text_pt" indexed="true" stored="true" multiValued="false" />
<!-- This can be enabled, in case the client does not know what fields may be searched. It isn't enabled by default
because it's very expensive to index everything twice. -->
<!-- <copyField source="*" dest="_text_"/> -->
<!-- Dynamic field definitions allow using convention over configuration
for fields via the specification of patterns to match field names.
EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
RESTRICTION: the glob-like pattern in the name attribute must have a "*" only at the start or the end. -->
<dynamicField name="*_i" type="pint" indexed="true" stored="true"/>
<dynamicField name="*_is" type="pints" indexed="true" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true" />
<dynamicField name="*_ss" type="strings" indexed="true" stored="true"/>
<dynamicField name="*_l" type="plong" indexed="true" stored="true"/>
<dynamicField name="*_ls" type="plongs" indexed="true" stored="true"/>
<dynamicField name="*_t" type="text_general" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="*_txt" type="text_general" indexed="true" stored="true"/>
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
<dynamicField name="*_bs" type="booleans" indexed="true" stored="true"/>
<dynamicField name="*_f" type="pfloat" indexed="true" stored="true"/>
<dynamicField name="*_fs" type="pfloats" indexed="true" stored="true"/>
<dynamicField name="*_d" type="pdouble" indexed="true" stored="true"/>
<dynamicField name="*_ds" type="pdoubles" indexed="true" stored="true"/>
<!-- Type used for data-driven schema, to add a string copy for each text field -->
<dynamicField name="*_str" type="strings" stored="false" docValues="true" indexed="false" />
<dynamicField name="*_dt" type="pdate" indexed="true" stored="true"/>
<dynamicField name="*_dts" type="pdate" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_p" type="location" indexed="true" stored="true"/>
<dynamicField name="*_srpt" type="location_rpt" indexed="true" stored="true"/>
<!-- payloaded dynamic fields -->
<dynamicField name="*_dpf" type="delimited_payloads_float" indexed="true" stored="true"/>
<dynamicField name="*_dpi" type="delimited_payloads_int" indexed="true" stored="true"/>
<dynamicField name="*_dps" type="delimited_payloads_string" indexed="true" stored="true"/>
<dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/>
<!-- Field to use to determine and enforce document uniqueness.
Unless this field is marked with required="false", it will be a required field
-->
<uniqueKey>id</uniqueKey>
<!-- copyField commands copy one field to another at the time a document
is added to the index. It's used either to index the same field differently,
or to add multiple fields to the same field for easier/faster searching.
<copyField source="sourceFieldName" dest="destinationFieldName"/>
-->
<!-- field type definitions. The "name" attribute is
just a label to be used by field definitions. The "class"
attribute and any other attributes determine the real
behavior of the fieldType.
Class names starting with "solr" refer to java classes in a
standard package such as org.apache.solr.analysis
-->
<!-- sortMissingLast and sortMissingFirst attributes are optional attributes are
currently supported on types that are sorted internally as strings
and on numeric types.
This includes "string", "boolean", "pint", "pfloat", "plong", "pdate", "pdouble".
- If sortMissingLast="true", then a sort on this field will cause documents
without the field to come after documents with the field,
regardless of the requested sort order (asc or desc).
- If sortMissingFirst="true", then a sort on this field will cause documents
without the field to come before documents with the field,
regardless of the requested sort order.
- If sortMissingLast="false" and sortMissingFirst="false" (the default),
then default lucene sorting will be used which places docs without the
field first in an ascending sort and last in a descending sort.
-->
<!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
<fieldType name="strings" class="solr.StrField" sortMissingLast="true" multiValued="true" docValues="true" />
<!-- boolean type: "true" or "false" -->
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
<fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
<!--
Numeric field types that index values using KD-trees.
Point fields don't support FieldCache, so they must have docValues="true" if needed for sorting, faceting, functions, etc.
-->
<fieldType name="pint" class="solr.IntPointField" docValues="true"/>
<fieldType name="pfloat" class="solr.FloatPointField" docValues="true"/>
<fieldType name="plong" class="solr.LongPointField" docValues="true"/>
<fieldType name="pdouble" class="solr.DoublePointField" docValues="true"/>
<fieldType name="pints" class="solr.IntPointField" docValues="true" multiValued="true"/>
<fieldType name="pfloats" class="solr.FloatPointField" docValues="true" multiValued="true"/>
<fieldType name="plongs" class="solr.LongPointField" docValues="true" multiValued="true"/>
<fieldType name="pdoubles" class="solr.DoublePointField" docValues="true" multiValued="true"/>
<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
is a more restricted form of the canonical representation of dateTime
http://www.w3.org/TR/xmlschema-2/#dateTime
The trailing "Z" designates UTC time and is mandatory.
Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
All other components are mandatory.
Expressions can also be used to denote calculations that should be
performed relative to "NOW" to determine the value, ie...
NOW/HOUR
... Round to the start of the current hour
NOW-1DAY
... Exactly 1 day prior to now
NOW/DAY+6MONTHS+3DAYS
... 6 months and 3 days in the future from the start of
the current day
-->
<!-- KD-tree versions of date fields -->
<fieldType name="pdate" class="solr.DatePointField" docValues="true"/>
<fieldType name="pdates" class="solr.DatePointField" docValues="true" multiValued="true"/>
<!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings -->
<fieldType name="binary" class="solr.BinaryField"/>
<!-- solr.TextField allows the specification of custom text analyzers
specified as a tokenizer and a list of token filters. Different
analyzers may be specified for indexing and querying.
The optional positionIncrementGap puts space between multiple fields of
this type on the same document, with the purpose of preventing false phrase
matching across fields.
For more info on customizing your analyzer chain, please see
http://lucene.apache.org/solr/guide/understanding-analyzers-tokenizers-and-filters.html#understanding-analyzers-tokenizers-and-filters
-->
<!-- One can also specify an existing Analyzer class that has a
default constructor via the class attribute on the analyzer element.
Example:
<fieldType name="text_greek" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/>
</fieldType>
-->
<!-- A text field that only splits on whitespace for exact matching of words -->
<dynamicField name="*_ws" type="text_ws" indexed="true" stored="true"/>
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
</analyzer>
</fieldType>
<!-- A general text field that has reasonable, generic
cross-language defaults: it tokenizes with StandardTokenizer,
removes stop words from case-insensitive "stopwords.txt"
(empty by default), and down cases. At query time only, it
also applies synonyms.
-->
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100" multiValued="true">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<!-- in this example, we will only use synonyms at query time
<filter class="solr.SynonymGraphFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.FlattenGraphFilterFactory"/>
-->
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<!-- SortableTextField generally functions exactly like TextField,
except that it supports, and by default uses, docValues for sorting (or faceting)
on the first 1024 characters of the original field values (which is configurable).
This makes it a bit more useful than TextField in many situations, but the trade-off
is that it takes up more space on disk; which is why it's not used in place of TextField
for every fieldType in this _default schema.
-->
<dynamicField name="*_t_sort" type="text_gen_sort" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="*_txt_sort" type="text_gen_sort" indexed="true" stored="true"/>
<fieldType name="text_gen_sort" class="solr.SortableTextField" positionIncrementGap="100" multiValued="true">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<!-- A text field with defaults appropriate for English: it tokenizes with StandardTokenizer,
removes English stop words (lang/stopwords_en.txt), down cases, protects words from protwords.txt, and
finally applies Porter's stemming. The query time analyzer also applies synonyms from synonyms.txt. -->
<dynamicField name="*_txt_en" type="text_en" indexed="true" stored="true"/>
<fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<!-- in this example, we will only use synonyms at query time
<filter class="solr.SynonymGraphFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.FlattenGraphFilterFactory"/>
-->
<!-- Case insensitive stop word removal.
-->
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPossessiveFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
<filter class="solr.EnglishMinimalStemFilterFactory"/>
-->
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPossessiveFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
<filter class="solr.EnglishMinimalStemFilterFactory"/>
-->
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldType>
<!-- A text field with defaults appropriate for English, plus
aggressive word-splitting and autophrase features enabled.
This field is just like text_en, except it adds
WordDelimiterGraphFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and
non-alphanumeric chars. This means certain compound word
cases will work, for example query "wi fi" will match
document "WiFi" or "wi-fi".
-->
<dynamicField name="*_txt_en_split" type="text_en_splitting" indexed="true" stored="true"/>
<fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<!-- in this example, we will only use synonyms at query time
<filter class="solr.SynonymGraphFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<!-- Case insensitive stop word removal.
-->
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.FlattenGraphFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldType>
<!-- Less flexible matching, but fewer false matches. Probably not ideal for product names,
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
<dynamicField name="*_txt_en_split_tight" type="text_en_splitting_tight" indexed="true" stored="true"/>
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
possible with WordDelimiterGraphFilter in conjunction with stemming. -->
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
<filter class="solr.FlattenGraphFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
possible with WordDelimiterGraphFilter in conjunction with stemming. -->
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<!-- Just like text_general except it reverses the characters of
each token, to enable more efficient leading wildcard queries.
-->
<dynamicField name="*_txt_rev" type="text_general_rev" indexed="true" stored="true"/>
<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<dynamicField name="*_phon_en" type="phonetic_en" indexed="true" stored="true"/>
<fieldType name="phonetic_en" stored="false" indexed="true" class="solr.TextField" >
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
</analyzer>
</fieldType>
<!-- lowercases the entire field value, keeping it as a single token. -->
<dynamicField name="*_s_lower" type="lowercase" indexed="true" stored="true"/>
<fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
<!--
Example of using PathHierarchyTokenizerFactory at index time, so
queries for paths match documents at that path, or in descendent paths
-->
<dynamicField name="*_descendent_path" type="descendent_path" indexed="true" stored="true"/>
<fieldType name="descendent_path" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.KeywordTokenizerFactory" />
</analyzer>
</fieldType>
<!--
Example of using PathHierarchyTokenizerFactory at query time, so
queries for paths match documents at that path, or in ancestor paths
-->
<dynamicField name="*_ancestor_path" type="ancestor_path" indexed="true" stored="true"/>
<fieldType name="ancestor_path" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.KeywordTokenizerFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
</analyzer>
</fieldType>
<!-- This point type indexes the coordinates as separate fields (subFields)
If subFieldType is defined, it references a type, and a dynamic field
definition is created matching *___<typename>. Alternately, if
subFieldSuffix is defined, that is used to create the subFields.
Example: if subFieldType="double", then the coordinates would be
indexed in fields myloc_0___double,myloc_1___double.
Example: if subFieldSuffix="_d" then the coordinates would be indexed
in fields myloc_0_d,myloc_1_d
The subFields are an implementation detail of the fieldType, and end
users normally should not need to know about them.
-->
<dynamicField name="*_point" type="point" indexed="true" stored="true"/>
<fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
<!-- A specialized field for geospatial search filters and distance sorting. -->
<fieldType name="location" class="solr.LatLonPointSpatialField" docValues="true"/>
<!-- A geospatial field type that supports multiValued and polygon shapes.
For more information about this and other spatial fields see:
http://lucene.apache.org/solr/guide/spatial-search.html
-->
<fieldType name="location_rpt" class="solr.SpatialRecursivePrefixTreeFieldType"
geo="true" distErrPct="0.025" maxDistErr="0.001" distanceUnits="kilometers" />
<!-- Payloaded field types -->
<fieldType name="delimited_payloads_float" stored="false" indexed="true" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
</analyzer>
</fieldType>
<fieldType name="delimited_payloads_int" stored="false" indexed="true" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="integer"/>
</analyzer>
</fieldType>
<fieldType name="delimited_payloads_string" stored="false" indexed="true" class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="identity"/>
</analyzer>
</fieldType>
<!-- Portuguese -->
<dynamicField name="*_txt_pt" type="text_pt" indexed="true" stored="true"/>
<fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<charFilter class="solr.HTMLStripCharFilterFactory"/>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" />
<filter class="solr.PortugueseLightStemFilterFactory"/>
<!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
<!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> -->
</analyzer>
</fieldType>
<!-- Similarity is the scoring routine for each document vs. a query.
A custom Similarity or SimilarityFactory may be specified here, but
the default is fine for most applications.
For more info: http://lucene.apache.org/solr/guide/other-schema-elements.html#OtherSchemaElements-Similarity
-->
<!--
<similarity class="com.example.solr.CustomSimilarityFactory">
<str name="paramkey">param value</str>
</similarity>
-->
</schema>
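The text_pt chain above (HTML stripping, lowercasing, snowball-format stopword removal, light Portuguese stemming) can be inspected without indexing anything through Solr's built-in field analysis handler. A sketch, assuming the collection is named sapl:

    # Sketch: run a sample value through the text_pt analyzer chain.
    import requests

    res = requests.get('http://localhost:8983/solr/sapl/analysis/field',
                       params={'analysis.fieldtype': 'text_pt',
                               'analysis.fieldvalue': 'As Leis Municipais',
                               'wt': 'json'})
    print(res.json()['analysis']['field_types']['text_pt']['index'])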

solr/sapl_configset/conf/params.json (20 changed lines)

@@ -0,0 +1,20 @@
{"params":{
"query":{
"defType":"edismax",
"q.alt":"*:*",
"rows":"10",
"fl":"*,score",
"":{"v":0}
},
"facets":{
"facet":"on",
"facet.mincount": "1",
"":{"v":0}
},
"velocity":{
"wt": "velocity",
"v.template":"browse",
"v.layout": "layout",
"":{"v":0}
}
}}
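These named parameter sets can be applied per request with useParams; for example, the "query" set above switches the request to edismax with *:* as the fallback query. A hedged example, assuming a collection named sapl using this configset:

    # Sketch: apply the 'query' paramset above via useParams.
    import requests

    res = requests.get('http://localhost:8983/solr/sapl/select',
                       params={'q': 'lei', 'useParams': 'query', 'wt': 'json'})
    print(res.json()['response']['numFound'])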

solr/sapl_configset/conf/protwords.txt (21 changed lines)

@@ -0,0 +1,21 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-----------------------------------------------------------------------
# Use a protected word file to protect against the stemmer reducing two
# unrelated words to the same base word.
# Some non-words that normally won't be encountered,
# just to test that they won't be stemmed.
dontstems
zwhacky

solr/sapl_configset/conf/saplconfigset.zip (binary)

Binary file not shown.

solr/sapl_configset/conf/schema.xml (165 changed lines)

@@ -0,0 +1,165 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<schema name="default" version="1.6">
<types>
<fieldtype name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
<fieldtype name="binary" class="solr.BinaryField"/>
<!-- Numeric field types that manipulate the value into
a string value that isn't human-readable in its internal form,
but with a lexicographic ordering the same as the numeric ordering,
so that range queries work correctly. -->
<fieldType name="pint" class="solr.IntPointField" docValues="true" />
<fieldType name="pfloat" class="solr.FloatPointField" docValues="true" />
<fieldType name="plong" class="solr.LongPointField" docValues="true" />
<fieldType name="pdouble" class="solr.DoublePointField" docValues="true"/>
<fieldType name="pdate" class="solr.DatePointField" docValues="true" />
<!-- A Trie based date field for faster date range queries and date faceting. -->
<fieldType name="pints" class="solr.IntPointField" docValues="true" multiValued="true"/>
<fieldType name="pfloats" class="solr.FloatPointField" docValues="true" multiValued="true"/>
<fieldType name="plongs" class="solr.LongPointField" docValues="true" multiValued="true"/>
<fieldType name="pdoubles" class="solr.DoublePointField" docValues="true" multiValued="true"/>
<fieldType name="pdates" class="solr.DatePointField" docValues="true" multiValued="true"/>
<fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
<fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/>
<fieldtype name="geohash" class="solr.GeoHashField"/>
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<!-- in this example, we will only use synonyms at query time
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<!-- Portuguese -->
<dynamicField name="*_txt_pt" type="text_pt" indexed="true" stored="true"/>
<fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" />
<filter class="solr.PortugueseLightStemFilterFactory"/>
<!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
<!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> -->
</analyzer>
</fieldType>
<fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPossessiveFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
<filter class="solr.EnglishMinimalStemFilterFactory"/>
-->
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPossessiveFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
<filter class="solr.EnglishMinimalStemFilterFactory"/>
-->
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="ngram" class="solr.TextField" >
<analyzer type="index">
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.NGramFilterFactory" minGramSize="3" maxGramSize="15" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="edge_ngram" class="solr.TextField" positionIncrementGap="1">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.EdgeNGramFilterFactory" minGramSize="2" maxGramSize="15" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
</analyzer>
</fieldType>
</types>
<fields>
<!-- general -->
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
<field name="django_ct" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="django_id" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="_version_" type="plong" indexed="true" stored ="true"/>
<field name="text" type="text_pt" indexed="true" stored="true" multiValued="false" />
</fields>
<!-- field to use to determine and enforce document uniqueness. -->
<uniqueKey>id</uniqueKey>
<!-- field for the QueryParser to use when an explicit fieldname is absent -->
<df>text</df>
<!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
<solrQueryParser q.op="AND"/>
</schema>

solr/sapl_configset/conf/solrconfig.xml (1367 changed lines)

File diff suppressed because it is too large

solr/sapl_configset/conf/stopwords.txt (14 changed lines)

@@ -0,0 +1,14 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

solr/sapl_configset/conf/synonyms.txt (29 changed lines)

@@ -0,0 +1,29 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-----------------------------------------------------------------------
#some test synonym mappings unlikely to appear in real input text
aaafoo => aaabar
bbbfoo => bbbfoo bbbbar
cccfoo => cccbar cccbaz
fooaaa,baraaa,bazaaa
# Some synonym groups specific to this example
GB,gib,gigabyte,gigabytes
MB,mib,megabyte,megabytes
Television, Televisions, TV, TVs
#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
#after us won't split it into two words.
# Synonym mappings can be used for spelling correction too
pixima => pixma

solr_api.py (155 changed lines)

@@ -0,0 +1,155 @@
import requests
import subprocess
import sys
import argparse
class SolrClient:
LIST_CONFIGSETS = "{}/solr/admin/configs?action=LIST&omitHeader=true&wt=json"
UPLOAD_CONFIGSET = "{}/solr/admin/configs?action=UPLOAD&name={}&wt=json"
LIST_COLLECTIONS = "{}/solr/admin/collections?action=LIST&wt=json"
STATUS_COLLECTION = "{}/solr/admin/collections?action=CLUSTERSTATUS&collection={}&wt=json"
STATUS_CORE = "{}/admin/cores?action=STATUS&name={}"
EXISTS_COLLECTION = "{}/solr/{}/admin/ping?wt=json"
OPTIMIZE_COLLECTION = "{}/solr/{}/update?optimize=true&wt=json"
CREATE_COLLECTION = "{}/solr/admin/collections?action=CREATE&name={}&collection.configName={}&numShards={}&replicationFactor={}&maxShardsPerNode={}&wt=json"
DELETE_COLLECTION = "{}/solr/admin/collections?action=DELETE&name={}&wt=json"
DELETE_DATA = "{}/solr/{}/update?commitWithin=1000&overwrite=true&wt=json"
QUERY_DATA = "{}/solr/{}/select?q=*:*"
CONFIGSET_NAME = "sapl_configset"
def __init__(self, url):
self.url = url
def get_num_docs(self, collection_name):
final_url = self.QUERY_DATA.format(self.url, collection_name)
res = requests.get(final_url)
dic = res.json()
num_docs = dic["response"]["numFound"]
return num_docs
def list_collections(self):
req_url = self.LIST_COLLECTIONS.format(self.url)
res = requests.get(req_url)
dic = res.json()
return dic['collections']
def exists_collection(self, collection_name):
collections = self.list_collections()
return collection_name in collections
def maybe_upload_configset(self, force=False):
req_url = self.LIST_CONFIGSETS.format(self.url)
res = requests.get(req_url)
dic = res.json()
configsets = dic['configSets']
# UPLOAD configset
if self.CONFIGSET_NAME not in configsets or force:
files = {'file': ('saplconfigset.zip',
open('./solr/sapl_configset/conf/saplconfigset.zip',
'rb'),
'application/octet-stream',
{'Expires': '0'})}
req_url = self.UPLOAD_CONFIGSET.format(self.url, self.CONFIGSET_NAME)
resp = requests.post(req_url, files=files)
print(resp.content)
else:
print('O %s já presente no servidor, NÃO enviando.' % self.CONFIGSET_NAME)
def create_collection(self, collection_name, shards=1, replication_factor=1, max_shards_per_node=1):
self.maybe_upload_configset()
req_url = self.CREATE_COLLECTION.format(self.url,
collection_name,
self.CONFIGSET_NAME,
shards,
replication_factor,
max_shards_per_node)
res = requests.post(req_url)
if res.ok:
print("Collection '%s' created succesfully" % collection_name)
else:
print("Error creating collection '%s'" % collection_name)
as_json = res.json()
print("Error %s: %s" % (res.status_code, as_json['error']['msg']))
return False
return True
def delete_collection(self, collection_name):
if collection_name == '*':
collections = self.list_collections()
else:
collections = [collection_name]
for c in collections:
req_url = self.DELETE_COLLECTION.format(self.url, c)
res = requests.post(req_url)
if not res.ok:
print("Error deleting collection '%s'", c)
print("Code {}: {}".format(res.status_code, res.text))
else:
print("Collection '%s' deleted successfully!" % c)
def delete_index_data(self, collection_name):
req_url = self.DELETE_DATA.format(self.url, collection_name)
res = requests.post(req_url,
data='<delete><query>*:*</query></delete>',
headers={'Content-Type': 'application/xml'})
if not res.ok:
print("Error deleting index for collection '%s'", collection_name)
print("Code {}: {}".format(res.status_code, res.text))
else:
print("Collection '%s' data deleted successfully!" % collection_name)
num_docs = self.get_num_docs(collection_name)
print("Num docs: %s" % num_docs)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Cria uma collection no Solr')
# required arguments
parser.add_argument('-u', type=str, metavar='URL', nargs=1, dest='url',
required=True, help='Endereço do servidor Solr na forma http(s)://<address>[:port]')
parser.add_argument('-c', type=str, metavar='COLLECTION', dest='collection', nargs=1,
required=True, help='Collection Solr a ser criada')
# optional arguments
parser.add_argument('-s', type=int, dest='shards', nargs='?',
help='Number of shards (default=1)', default=1)
parser.add_argument('-rf', type=int, dest='replication_factor', nargs='?',
help='Replication factor (default=1)', default=1)
parser.add_argument('-ms', type=int, dest='max_shards_per_node', nargs='?',
help='Max shards per node (default=1)', default=1)
try:
args = parser.parse_args()
except IOError as msg:
parser.error(str(msg))
sys.exit(-1)
url = args.url.pop()
collection = args.collection.pop()
client = SolrClient(url=url)
if not client.exists_collection(collection):
print("Collection '%s' doesn't exists. Creating a new one..." % collection)
created = client.create_collection(collection,
shards=args.shards,
replication_factor=args.replication_factor,
max_shards_per_node=args.max_shards_per_node)
if not created:
sys.exit(-1)
else:
print("Collection '%s' exists." % collection)
num_docs = client.get_num_docs(collection)
if num_docs == 0:
print("Performing a full reindex of '%s' collection..." % collection)
p = subprocess.call(["python3", "manage.py", "rebuild_index", "--noinput"])
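Besides the CLI entry point above (invoked by start.sh as python3 solr_api.py -u $SOLR_URL -c $SOLR_COLLECTION ...), the client can also be used programmatically; a hypothetical example:

    # Hypothetical programmatic use of the SolrClient defined above.
    from solr_api import SolrClient

    client = SolrClient(url='http://localhost:8983')
    if not client.exists_collection('sapl'):
        client.create_collection('sapl', shards=1, replication_factor=1)
    print('Indexed docs:', client.get_num_docs('sapl'))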

start.sh (18 changed lines)

@@ -36,6 +36,10 @@ create_env() {
echo "EMAIL_SEND_USER = ""${EMAIL_HOST_USER-''}" >> $FILENAME
echo "DEFAULT_FROM_EMAIL = ""${EMAIL_HOST_USER-''}" >> $FILENAME
echo "SERVER_EMAIL = ""${EMAIL_HOST_USER-''}" >> $FILENAME
echo "USE_SOLR = ""${USER_SOLR-True}" >> $FILENAME
echo "SOLR_COLLECTION = ""${SOLR_COLLECTION-'sapl'}" >> $FILENAME
echo "SOLR_URL = ""${SOLR_URL-'http://saplsolr:8983'}" >> $FILENAME
echo "[ENV FILE] done."
}
@@ -46,10 +50,22 @@ create_env
/bin/sh busy-wait.sh $DATABASE_URL
## SOLR
NUM_SHARDS="${NUM_SHARDS-1}"
RF="${RF-1}"
MAX_SHARDS_PER_NODE="${MAX_SHARDS_PER_NODE-1}"
# Check whether the USE_SOLR variable is set and equals True
if [[ ! -z "$USE_SOLR" ]] && [[ "$USE_SOLR" = "True" ]]; then
python3 solr_api.py -u $SOLR_URL -c $SOLR_COLLECTION -s $NUM_SHARDS -rf $RF -ms $MAX_SHARDS_PER_NODE &
# python3 manage.py rebuild_index --noinput &
fi
# manage.py migrate --noinput was not working
yes yes | python3 manage.py migrate
#python3 manage.py collectstatic --no-input
# python3 manage.py rebuild_index --noinput &
echo "Criando usuário admin..."
