Commit 87f2744166643eda4b73deb53b565916bd07c301
1 parent
badd76e0
Update docker configuration.
Showing
10 changed files
with
87 additions
and
356802 deletions
Too many changes to show.
To preserve performance only 9 of 10 files are displayed.
Dockerfile
... | ... | @@ -14,7 +14,8 @@ ENV PYTHONUNBUFFERED 1 |
14 | 14 | |
15 | 15 | # install dependencies |
16 | 16 | RUN apt-get update && \ |
17 | - apt-get install -y python3-pip | |
17 | + apt-get install -y python3-pip && \ | |
18 | + pip3 install --upgrade pip | |
18 | 19 | COPY ./requirements.txt /usr/src/collector/requirements.txt |
19 | 20 | RUN pip3 wheel --no-cache-dir --no-deps --wheel-dir /usr/src/collector/wheels -r requirements.txt |
20 | 21 | |
... | ... | @@ -30,33 +31,35 @@ FROM ubuntu:18.04 |
30 | 31 | RUN apt-get update && \ |
31 | 32 | apt-get install -y locales && \ |
32 | 33 | locale-gen pl_PL.UTF-8 |
34 | + | |
35 | +# set envs | |
33 | 36 | ENV LANG pl_PL.UTF-8 |
34 | 37 | ENV LC_ALL pl_PL.UTF-8 |
38 | +ENV HOME=/home/collector | |
39 | +ENV APP_HOME=/home/collector/app | |
35 | 40 | |
36 | -# create directory for the collector user | |
37 | -RUN mkdir -p /home/collector | |
38 | - | |
39 | -# create the collector user | |
40 | -RUN addgroup --group collector && adduser collector --ingroup collector | |
41 | +# create directory for the collector user and user itself | |
42 | +RUN mkdir -p $HOME && \ | |
43 | + addgroup --group collector && \ | |
44 | + adduser collector --ingroup collector | |
41 | 45 | |
42 | 46 | # create the appropriate directories |
43 | -ENV HOME=/home/collector | |
44 | -ENV APP_HOME=/home/collector/app | |
45 | 47 | RUN mkdir $APP_HOME |
46 | 48 | WORKDIR $APP_HOME |
47 | 49 | |
48 | 50 | # install dependencies |
49 | 51 | COPY --from=builder /usr/src/collector/wheels /wheels |
50 | 52 | COPY --from=builder /usr/src/collector/requirements.txt . |
51 | -RUN apt-get install -y netcat openjdk-8-jre python3-pip software-properties-common wget && \ | |
53 | +RUN apt-get update && apt-get install -y netcat openjdk-8-jre python3-pip software-properties-common wget && \ | |
52 | 54 | wget -O - http://download.sgjp.pl/apt/sgjp.gpg.key|apt-key add - && \ |
53 | 55 | apt-add-repository http://download.sgjp.pl/apt/ubuntu && \ |
54 | 56 | wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \ |
55 | 57 | echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" |tee /etc/apt/sources.list.d/pgdg.list && \ |
56 | 58 | apt-get update && \ |
57 | - apt-get install -y morfeusz2 python3-morfeusz2 && \ | |
58 | - DEBIAN_FRONTEND="noninteractive" apt-get -y install postgresql-12 | |
59 | -RUN pip3 install --no-cache /wheels/* | |
59 | + apt-get install -y morfeusz2 python3-morfeusz2 libopenblas-dev libomp-dev && \ | |
60 | + DEBIAN_FRONTEND="noninteractive" apt-get -y install postgresql-12 && \ | |
61 | + pip3 install --upgrade pip && \ | |
62 | + pip3 install --no-cache /wheels/* | |
60 | 63 | |
61 | 64 | # copy project |
62 | 65 | COPY . $APP_HOME |
... | ... | @@ -65,17 +68,29 @@ COPY . $APP_HOME |
65 | 68 | WORKDIR ./tools/liner2/g419-external-dependencies |
66 | 69 | RUN tar -xvf ./CRF++-0.57.tar.gz |
67 | 70 | WORKDIR ./CRF++-0.57 |
68 | -RUN ./configure | |
69 | -RUN make | |
70 | -RUN make install | |
71 | -RUN ldconfig | |
71 | +RUN ./configure && \ | |
72 | + make && \ | |
73 | + make install && \ | |
74 | + make clean && \ | |
75 | + ldconfig | |
76 | + | |
77 | +# install eurobert | |
78 | +WORKDIR $APP_HOME/tools | |
79 | +RUN wget https://manage.legis.nlp.ipipan.waw.pl/download/marcell/eurobert-model.tar.gz && \ | |
80 | + tar -xvf ./eurobert-model.tar.gz && \ | |
81 | + rm ./eurobert-model.tar.gz | |
82 | + | |
83 | +# install labse | |
84 | +RUN wget https://manage.legis.nlp.ipipan.waw.pl/download/marcell/labse-model.tar.gz && \ | |
85 | + tar -xvf ./labse-model.tar.gz && \ | |
86 | + rm ./labse-model.tar.gz | |
72 | 87 | WORKDIR $APP_HOME |
73 | 88 | |
74 | 89 | # copy django settings |
75 | 90 | COPY ./collector/collector/docker-settings.py $APP_HOME/collector/collector/settings.py |
76 | 91 | |
77 | 92 | # chown all the files to the collector user |
78 | -RUN chown -R collector:collector $APP_HOME | |
93 | +RUN chown -R collector:collector $HOME | |
79 | 94 | |
80 | 95 | # change to the collector user |
81 | 96 | USER collector |
... | ... |
Dockerfile.marcell
... | ... | @@ -14,7 +14,8 @@ ENV PYTHONUNBUFFERED 1 |
14 | 14 | |
15 | 15 | # install dependencies |
16 | 16 | RUN apt-get update && \ |
17 | - apt-get install -y python3-pip | |
17 | + apt-get install -y python3-pip && \ | |
18 | + pip3 install --upgrade pip | |
18 | 19 | COPY ./requirements.txt /usr/src/collector/requirements.txt |
19 | 20 | RUN pip3 wheel --no-cache-dir --no-deps --wheel-dir /usr/src/collector/wheels -r requirements.txt |
20 | 21 | |
... | ... | @@ -38,8 +39,8 @@ ENV HOME=/home/collector |
38 | 39 | ENV APP_HOME=/home/collector/app |
39 | 40 | |
40 | 41 | # create directory for the collector user and user itself |
41 | -RUN mkdir -p $HOME && \ | |
42 | - addgroup --group collector && \ | |
42 | +RUN mkdir -p $HOME && \ | |
43 | + addgroup --group collector && \ | |
43 | 44 | adduser collector --ingroup collector |
44 | 45 | |
45 | 46 | # create the appropriate directories |
... | ... | @@ -55,8 +56,9 @@ RUN apt-get update && apt-get install -y openjdk-8-jre python3-pip software-prop |
55 | 56 | wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \ |
56 | 57 | echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" |tee /etc/apt/sources.list.d/pgdg.list && \ |
57 | 58 | apt-get update && \ |
58 | - apt-get install -y morfeusz2 python3-morfeusz2 && \ | |
59 | + apt-get install -y morfeusz2 python3-morfeusz2 libopenblas-dev libomp-dev && \ | |
59 | 60 | DEBIAN_FRONTEND="noninteractive" apt-get -y install postgresql-12 && \ |
61 | + pip3 install --upgrade pip && \ | |
60 | 62 | pip3 install --no-cache /wheels/* |
61 | 63 | |
62 | 64 | # copy project |
... | ... | @@ -66,25 +68,35 @@ COPY . $APP_HOME |
66 | 68 | WORKDIR ./tools/liner2/g419-external-dependencies |
67 | 69 | RUN tar -xvf ./CRF++-0.57.tar.gz |
68 | 70 | WORKDIR ./CRF++-0.57 |
69 | -RUN ./configure && \ | |
70 | - make && \ | |
71 | - make install && \ | |
72 | - make clean && \ | |
71 | +RUN ./configure && \ | |
72 | + make && \ | |
73 | + make install && \ | |
74 | + make clean && \ | |
73 | 75 | ldconfig |
76 | + | |
77 | +# install eurobert | |
78 | +WORKDIR $APP_HOME/tools | |
79 | +RUN wget https://manage.legis.nlp.ipipan.waw.pl/download/marcell/eurobert-model.tar.gz && \ | |
80 | + tar -xvf ./eurobert-model.tar.gz && \ | |
81 | + rm ./eurobert-model.tar.gz | |
74 | 82 | WORKDIR $APP_HOME |
75 | 83 | |
76 | 84 | # copy django settings |
77 | 85 | COPY ./collector/collector/docker-settings.py $APP_HOME/collector/collector/settings.py |
78 | 86 | |
79 | 87 | # chown all the files to the collector user |
80 | -RUN chown -R collector:collector $APP_HOME | |
88 | +RUN chown -R collector:collector $HOME | |
81 | 89 | |
82 | 90 | # configure and init database |
83 | 91 | USER postgres |
84 | 92 | RUN /etc/init.d/postgresql start && \ |
85 | 93 | psql --command "CREATE USER collector WITH SUPERUSER PASSWORD 'collector';" && \ |
86 | 94 | createdb -O collector collector && \ |
87 | - psql collector < $APP_HOME/resources/db/marcell-init.db | |
95 | + wget https://manage.legis.nlp.ipipan.waw.pl/download/marcell/marcell-init.db.tar.gz && \ | |
96 | + tar -xvf ./marcell-init.db.tar.gz && \ | |
97 | + rm ./marcell-init.db.tar.gz && \ | |
98 | + psql collector < ./marcell-init.db && \ | |
99 | + rm ./marcell-init.db | |
88 | 100 | |
89 | 101 | # change to the root user |
90 | 102 | USER 0 |
... | ... |
README.md
... | ... | @@ -2,13 +2,22 @@ |
2 | 2 | |
3 | 3 | ## Running MARCELL annotate API using Docker |
4 | 4 | |
5 | +#### Using docker-compose | |
6 | + | |
5 | 7 | ```sh |
6 | 8 | docker-compose up -d |
7 | 9 | ``` |
8 | 10 | |
9 | -### Exemplary usage in Python | |
11 | +#### Using docker | |
10 | 12 | |
11 | 13 | ```sh |
14 | +docker build -t "marcell-pl" -f Dockerfile.marcell . | |
15 | +docker run --name "marcell-pl-running" -p 8006:8000 -d marcell-pl | |
16 | +``` | |
17 | + | |
18 | +### Example usage in Python | |
19 | + | |
20 | +```python | |
12 | 21 | import requests |
13 | 22 | |
14 | 23 | url = 'http://<container_url>:<exposed_port>/annotate' |
... | ... | @@ -20,4 +29,3 @@ with open('/text/file/path.[txt/html/pdf]', 'rb') as text_file, open('/meta/fil |
20 | 29 | ## Conda based installation |
21 | 30 | |
22 | 31 | See INSTALL.md for installation instructions. |
23 | - | |
... | ... |
collector/collector/docker-settings.py
... | ... | @@ -68,6 +68,9 @@ SOLR_URL = 'http://localhost:8983/solr/' |
68 | 68 | # Language-agnostic BERT sentence embedding model path |
69 | 69 | LABSE_MODEL_PATH = os.path.join(TOOLS_DIR, 'labse', 'labse_bert_model') |
70 | 70 | |
71 | +# expose REST API | |
72 | +EXPOSE_API = True | |
73 | + | |
71 | 74 | # Quick-start development settings - unsuitable for production |
72 | 75 | # See https://docs.djangoproject.com/en/2.1/howto/deployment/checklist/ |
73 | 76 | |
... | ... |
collector/pipeline/urls.py
1 | 1 | from django.urls import path |
2 | 2 | |
3 | 3 | from . import views |
4 | +from collector import settings | |
4 | 5 | |
5 | -urlpatterns = [ | |
6 | - path('annotate', views.AnnotateView.as_view(), name='annotate'), | |
7 | -] | |
6 | + | |
7 | +urlpatterns = [] | |
8 | +if settings.EXPOSE_API: | |
9 | + urlpatterns = [ | |
10 | + path('annotate', views.AnnotateView.as_view(), name='annotate'), | |
11 | + ] | |
... | ... |
collector/projects/marcell/utils.py
... | ... | @@ -100,6 +100,6 @@ def vectorize_eurovocs(): |
100 | 100 | labse = Labse(settings.LABSE_MODEL_PATH) |
101 | 101 | for eurovoc_label in EuroVocLabel.objects.filter(vector__isnull=True): |
102 | 102 | print('Creating EuroVoc vector representation for "{}".'.format(eurovoc_label.text)) |
103 | - laser_vector = [float(x) for x in list(labse.embed(eurovoc_label.text))] | |
104 | - eurovoc_label.vector = laser_vector | |
103 | + labse_vector = [float(x) for x in list(labse.embed(eurovoc_label.text))] | |
104 | + eurovoc_label.vector = labse_vector | |
105 | 105 | eurovoc_label.save() |
... | ... |
collector/terminology/eurovoc.py
... | ... | @@ -20,10 +20,13 @@ def annotate(documents): |
20 | 20 | |
21 | 21 | def _add_document_level_domains(document, title_based_tld_preditor, min_score, k): |
22 | 22 | print('Adding EuroVoc domains to: {}.'.format(document.id)) |
23 | + eurovoc_domains = [] | |
24 | + tld_score_value = 'sim' | |
25 | + | |
23 | 26 | if document.keywords.exists(): |
24 | - tld_score_value = 'sim' | |
25 | 27 | eurovoc_domains = _get_keyword_based_domains(document.keywords.all(), min_score, k) |
26 | - else: | |
28 | + | |
29 | + if not eurovoc_domains: | |
27 | 30 | tld_score_value = 'prob' |
28 | 31 | eurovoc_domains = _get_title_based_domains(document.title, title_based_tld_preditor, min_score, k) |
29 | 32 | |
... | ... |
entrypoint.sh
... | ... | @@ -15,6 +15,8 @@ python3 collector/manage.py makemigrations --noinput |
15 | 15 | python3 collector/manage.py migrate |
16 | 16 | python3 collector/manage.py configure_marcell_pipelines |
17 | 17 | python3 collector/manage.py load_eurovoc_terms -i resources/eurovoc -l pl |
18 | +python3 collector/manage.py load_eurovoc_terms -i resources/eurovoc -l en | |
18 | 19 | python3 collector/manage.py load_iate_terms -i resources/iate/iate.tbx |
20 | +python3 collector/manage.py map_eurovoc_terms | |
19 | 21 | |
20 | 22 | exec "$@" |
... | ... |
requirements.txt
... | ... | @@ -6,15 +6,14 @@ Django==2.2.5 |
6 | 6 | faiss |
7 | 7 | gunicorn==20.0.4 |
8 | 8 | Keras==2.2.5 |
9 | -laserembeddings[en,pl] | |
10 | 9 | natsort |
11 | 10 | psycopg2-binary |
12 | 11 | pysolr |
13 | 12 | python-dateutil |
14 | -pytorch | |
15 | 13 | scikit-learn==0.22.1 |
16 | 14 | scrapy |
17 | -tensorflow==1.14 | |
15 | +tensorflow>=1.15,<2 | |
18 | 16 | tika==1.19 |
19 | -xmldiff | |
20 | - | |
17 | +torch | |
18 | +transformers>=3.0.2 | |
19 | +xmldiff | |
21 | 20 | \ No newline at end of file |
... | ... |