Commit 87f2744166643eda4b73deb53b565916bd07c301

Authored by Bartłomiej Nitoń
1 parent badd76e0

Update docker configuration.

Too many changes to show.

To preserve performance, only 9 of 10 files are displayed.

Dockerfile
... ... @@ -14,7 +14,8 @@ ENV PYTHONUNBUFFERED 1
14 14  
15 15 # install dependencies
16 16 RUN apt-get update && \
17   - apt-get install -y python3-pip
  17 + apt-get install -y python3-pip && \
  18 + pip3 install --upgrade pip
18 19 COPY ./requirements.txt /usr/src/collector/requirements.txt
19 20 RUN pip3 wheel --no-cache-dir --no-deps --wheel-dir /usr/src/collector/wheels -r requirements.txt
20 21  
... ... @@ -30,33 +31,35 @@ FROM ubuntu:18.04
30 31 RUN apt-get update && \
31 32 apt-get install -y locales && \
32 33 locale-gen pl_PL.UTF-8
  34 +
  35 +# set envs
33 36 ENV LANG pl_PL.UTF-8
34 37 ENV LC_ALL pl_PL.UTF-8
  38 +ENV HOME=/home/collector
  39 +ENV APP_HOME=/home/collector/app
35 40  
36   -# create directory for the collector user
37   -RUN mkdir -p /home/collector
38   -
39   -# create the collector user
40   -RUN addgroup --group collector && adduser collector --ingroup collector
  41 +# create directory for the collector user and user itself
  42 +RUN mkdir -p $HOME && \
  43 + addgroup --group collector && \
  44 + adduser collector --ingroup collector
41 45  
42 46 # create the appropriate directories
43   -ENV HOME=/home/collector
44   -ENV APP_HOME=/home/collector/app
45 47 RUN mkdir $APP_HOME
46 48 WORKDIR $APP_HOME
47 49  
48 50 # install dependencies
49 51 COPY --from=builder /usr/src/collector/wheels /wheels
50 52 COPY --from=builder /usr/src/collector/requirements.txt .
51   -RUN apt-get install -y netcat openjdk-8-jre python3-pip software-properties-common wget && \
  53 +RUN apt-get update && apt-get install -y netcat openjdk-8-jre python3-pip software-properties-common wget && \
52 54 wget -O - http://download.sgjp.pl/apt/sgjp.gpg.key|apt-key add - && \
53 55 apt-add-repository http://download.sgjp.pl/apt/ubuntu && \
54 56 wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \
55 57 echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" |tee /etc/apt/sources.list.d/pgdg.list && \
56 58 apt-get update && \
57   - apt-get install -y morfeusz2 python3-morfeusz2 && \
58   - DEBIAN_FRONTEND="noninteractive" apt-get -y install postgresql-12
59   -RUN pip3 install --no-cache /wheels/*
  59 + apt-get install -y morfeusz2 python3-morfeusz2 libopenblas-dev libomp-dev && \
  60 + DEBIAN_FRONTEND="noninteractive" apt-get -y install postgresql-12 && \
  61 + pip3 install --upgrade pip && \
  62 + pip3 install --no-cache /wheels/*
60 63  
61 64 # copy project
62 65 COPY . $APP_HOME
... ... @@ -65,17 +68,29 @@ COPY . $APP_HOME
65 68 WORKDIR ./tools/liner2/g419-external-dependencies
66 69 RUN tar -xvf ./CRF++-0.57.tar.gz
67 70 WORKDIR ./CRF++-0.57
68   -RUN ./configure
69   -RUN make
70   -RUN make install
71   -RUN ldconfig
  71 +RUN ./configure && \
  72 + make && \
  73 + make install && \
  74 + make clean && \
  75 + ldconfig
  76 +
  77 +# install eurobert
  78 +WORKDIR $APP_HOME/tools
  79 +RUN wget https://manage.legis.nlp.ipipan.waw.pl/download/marcell/eurobert-model.tar.gz && \
  80 + tar -xvf ./eurobert-model.tar.gz && \
  81 + rm ./eurobert-model.tar.gz
  82 +
  83 +# install labse
  84 +RUN wget https://manage.legis.nlp.ipipan.waw.pl/download/marcell/labse-model.tar.gz && \
  85 + tar -xvf ./labse-model.tar.gz && \
  86 + rm ./labse-model.tar.gz
72 87 WORKDIR $APP_HOME
73 88  
74 89 # copy django settings
75 90 COPY ./collector/collector/docker-settings.py $APP_HOME/collector/collector/settings.py
76 91  
77 92 # chown all the files to the collector user
78   -RUN chown -R collector:collector $APP_HOME
  93 +RUN chown -R collector:collector $HOME
79 94  
80 95 # change to the collector user
81 96 USER collector
... ...
Dockerfile.marcell
... ... @@ -14,7 +14,8 @@ ENV PYTHONUNBUFFERED 1
14 14  
15 15 # install dependencies
16 16 RUN apt-get update && \
17   - apt-get install -y python3-pip
  17 + apt-get install -y python3-pip && \
  18 + pip3 install --upgrade pip
18 19 COPY ./requirements.txt /usr/src/collector/requirements.txt
19 20 RUN pip3 wheel --no-cache-dir --no-deps --wheel-dir /usr/src/collector/wheels -r requirements.txt
20 21  
... ... @@ -38,8 +39,8 @@ ENV HOME=/home/collector
38 39 ENV APP_HOME=/home/collector/app
39 40  
40 41 # create directory for the collector user and user itself
41   -RUN mkdir -p $HOME && \
42   - addgroup --group collector && \
  42 +RUN mkdir -p $HOME && \
  43 + addgroup --group collector && \
43 44 adduser collector --ingroup collector
44 45  
45 46 # create the appropriate directories
... ... @@ -55,8 +56,9 @@ RUN apt-get update && apt-get install -y openjdk-8-jre python3-pip software-prop
55 56 wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \
56 57 echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" |tee /etc/apt/sources.list.d/pgdg.list && \
57 58 apt-get update && \
58   - apt-get install -y morfeusz2 python3-morfeusz2 && \
  59 + apt-get install -y morfeusz2 python3-morfeusz2 libopenblas-dev libomp-dev && \
59 60 DEBIAN_FRONTEND="noninteractive" apt-get -y install postgresql-12 && \
  61 + pip3 install --upgrade pip && \
60 62 pip3 install --no-cache /wheels/*
61 63  
62 64 # copy project
... ... @@ -66,25 +68,35 @@ COPY . $APP_HOME
66 68 WORKDIR ./tools/liner2/g419-external-dependencies
67 69 RUN tar -xvf ./CRF++-0.57.tar.gz
68 70 WORKDIR ./CRF++-0.57
69   -RUN ./configure && \
70   - make && \
71   - make install && \
72   - make clean && \
  71 +RUN ./configure && \
  72 + make && \
  73 + make install && \
  74 + make clean && \
73 75 ldconfig
  76 +
  77 +# install eurobert
  78 +WORKDIR $APP_HOME/tools
  79 +RUN wget https://manage.legis.nlp.ipipan.waw.pl/download/marcell/eurobert-model.tar.gz && \
  80 + tar -xvf ./eurobert-model.tar.gz && \
  81 + rm ./eurobert-model.tar.gz
74 82 WORKDIR $APP_HOME
75 83  
76 84 # copy django settings
77 85 COPY ./collector/collector/docker-settings.py $APP_HOME/collector/collector/settings.py
78 86  
79 87 # chown all the files to the collector user
80   -RUN chown -R collector:collector $APP_HOME
  88 +RUN chown -R collector:collector $HOME
81 89  
82 90 # configure and init database
83 91 USER postgres
84 92 RUN /etc/init.d/postgresql start && \
85 93 psql --command "CREATE USER collector WITH SUPERUSER PASSWORD 'collector';" && \
86 94 createdb -O collector collector && \
87   - psql collector < $APP_HOME/resources/db/marcell-init.db
  95 + wget https://manage.legis.nlp.ipipan.waw.pl/download/marcell/marcell-init.db.tar.gz && \
  96 + tar -xvf ./marcell-init.db.tar.gz && \
  97 + rm ./marcell-init.db.tar.gz && \
  98 + psql collector < ./marcell-init.db && \
  99 + rm ./marcell-init.db
88 100  
89 101 # change to the root user
90 102 USER 0
... ...
README.md
... ... @@ -2,13 +2,22 @@
2 2  
3 3 ## Running MARCELL annotate API using Docker
4 4  
  5 +#### Using docker-compose
  6 +
5 7 ```sh
6 8 docker-compose up -d
7 9 ```
8 10  
9   -### Exemplary usage in Python
  11 +#### Using docker
10 12  
11 13 ```sh
  14 +docker build -t "marcell-pl" -f Dockerfile.marcell .
  15 +docker run --name "marcell-pl-running" -p 8006:8000 -d marcell-pl
  16 +```
  17 +
  18 +### Exemplary usage in Python
  19 +
  20 +```
12 21 import requests
13 22  
14 23 url = 'http://<container_url>:<exposed_port>/annotate'
... ... @@ -20,4 +29,3 @@ with open('/text/file/path.[txt/html/pdf]', 'rb') as text_file, open('/meta/fil
20 29 ## Conda based installation
21 30  
22 31 See INSTALL.md for installation instructions.
23   -
... ...
collector/collector/docker-settings.py
... ... @@ -68,6 +68,9 @@ SOLR_URL = 'http://localhost:8983/solr/'
68 68 # Language-agnostic BERT sentence embedding model path
69 69 LABSE_MODEL_PATH = os.path.join(TOOLS_DIR, 'labse', 'labse_bert_model')
70 70  
  71 +# expose REST API
  72 +EXPOSE_API = True
  73 +
71 74 # Quick-start development settings - unsuitable for production
72 75 # See https://docs.djangoproject.com/en/2.1/howto/deployment/checklist/
73 76  
... ...
collector/pipeline/urls.py
1 1 from django.urls import path
2 2  
3 3 from . import views
  4 +from collector import settings
4 5  
5   -urlpatterns = [
6   - path('annotate', views.AnnotateView.as_view(), name='annotate'),
7   -]
  6 +
  7 +urlpatterns = []
  8 +if settings.EXPOSE_API:
  9 + urlpatterns = [
  10 + path('annotate', views.AnnotateView.as_view(), name='annotate'),
  11 + ]
... ...
collector/projects/marcell/utils.py
... ... @@ -100,6 +100,6 @@ def vectorize_eurovocs():
100 100 labse = Labse(settings.LABSE_MODEL_PATH)
101 101 for eurovoc_label in EuroVocLabel.objects.filter(vector__isnull=True):
102 102 print('Creating EuroVoc vector representation for "{}".'.format(eurovoc_label.text))
103   - laser_vector = [float(x) for x in list(labse.embed(eurovoc_label.text))]
104   - eurovoc_label.vector = laser_vector
  103 + labse_vector = [float(x) for x in list(labse.embed(eurovoc_label.text))]
  104 + eurovoc_label.vector = labse_vector
105 105 eurovoc_label.save()
... ...
collector/terminology/eurovoc.py
... ... @@ -20,10 +20,13 @@ def annotate(documents):
20 20  
21 21 def _add_document_level_domains(document, title_based_tld_preditor, min_score, k):
22 22 print('Adding EuroVoc domains to: {}.'.format(document.id))
  23 + eurovoc_domains = []
  24 + tld_score_value = 'sim'
  25 +
23 26 if document.keywords.exists():
24   - tld_score_value = 'sim'
25 27 eurovoc_domains = _get_keyword_based_domains(document.keywords.all(), min_score, k)
26   - else:
  28 +
  29 + if not eurovoc_domains:
27 30 tld_score_value = 'prob'
28 31 eurovoc_domains = _get_title_based_domains(document.title, title_based_tld_preditor, min_score, k)
29 32  
... ...
entrypoint.sh
... ... @@ -15,6 +15,8 @@ python3 collector/manage.py makemigrations --noinput
15 15 python3 collector/manage.py migrate
16 16 python3 collector/manage.py configure_marcell_pipelines
17 17 python3 collector/manage.py load_eurovoc_terms -i resources/eurovoc -l pl
  18 +python3 collector/manage.py load_eurovoc_terms -i resources/eurovoc -l en
18 19 python3 collector/manage.py load_iate_terms -i resources/iate/iate.tbx
  20 +python3 collector/manage.py map_eurovoc_terms
19 21  
20 22 exec "$@"
... ...
requirements.txt
... ... @@ -6,15 +6,14 @@ Django==2.2.5
6 6 faiss
7 7 gunicorn==20.0.4
8 8 Keras==2.2.5
9   -laserembeddings[en,pl]
10 9 natsort
11 10 psycopg2-binary
12 11 pysolr
13 12 python-dateutil
14   -pytorch
15 13 scikit-learn==0.22.1
16 14 scrapy
17   -tensorflow==1.14
  15 +tensorflow>=1.15,<2
18 16 tika==1.19
19   -xmldiff
20   -
  17 +torch
  18 +transformers>=3.0.2
  19 +xmldiff
21 20 \ No newline at end of file
... ...