Source code for sparknlp.training.pub_tator

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains helper classes for PubTator datasets."""

from sparknlp.internal import ExtendedJavaWrapper


class PubTator(ExtendedJavaWrapper):
    """The PubTator format includes medical papers' titles, abstracts, and
    tagged chunks.

    For more information see `PubTator Docs
    <http://bioportal.bioontology.org/ontologies/EDAM?p=classes&conceptid=format_3783>`_
    and `MedMentions Docs <http://github.com/chanzuckerberg/MedMentions>`_.

    :meth:`.readDataset` is used to create a Spark DataFrame from a PubTator
    text file.

    **Input File Format**::

        25763772    0    5    DCTN4    T116,T123    C4308010
        25763772    23   63   chronic Pseudomonas aeruginosa infection    T047    C0854135
        25763772    67   82   cystic fibrosis    T047    C0010674
        25763772    83   120  Pseudomonas aeruginosa (Pa) infection    T047    C0854135
        25763772    124  139  cystic fibrosis    T047    C0010674

    Examples
    --------
    >>> from sparknlp.training import PubTator
    >>> pubTatorFile = "./src/test/resources/corpus_pubtator_sample.txt"
    >>> pubTatorDataSet = PubTator().readDataset(spark, pubTatorFile)
    >>> pubTatorDataSet.show(1)
    +--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
    |  doc_id|      finished_token|        finished_pos|        finished_ner|finished_token_metadata|finished_pos_metadata|finished_label_metadata|
    +--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
    |25763772|[DCTN4, as, a, mo...|[NNP, IN, DT, NN,...|[B-T116, O, O, O,...|   [[sentence, 0], [...| [[word, DCTN4], [...|   [[word, DCTN4], [...|
    +--------+--------------------+--------------------+--------------------+-----------------------+---------------------+-----------------------+
    """

    def __init__(self):
        super(PubTator, self).__init__("com.johnsnowlabs.nlp.training.PubTator")
    def readDataset(self, spark, path, isPaddedToken=True):  # ToDo Replace with std pyspark
        """Reads the dataset from an external resource.

        Parameters
        ----------
        spark : :class:`pyspark.sql.SparkSession`
            Initiated Spark Session with Spark NLP
        path : str
            Path to the resource
        isPaddedToken : bool, optional
            Whether tokens are padded, by default True

        Returns
        -------
        :class:`pyspark.sql.DataFrame`
            Spark DataFrame with the data
        """
        jSession = spark._jsparkSession
        jdf = self._java_obj.readDataset(jSession, path, isPaddedToken)
        dataframe = self.getDataFrame(spark, jdf)
        return dataframe
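

# A minimal usage sketch, not part of the library: it shows how readDataset
# turns a PubTator-formatted corpus into a DataFrame and how the resulting
# finished_* columns can be inspected. The file path is a hypothetical
# placeholder; sparknlp.start() and the column names come from the class
# docstring above.
if __name__ == "__main__":
    import sparknlp

    # Start a Spark session with Spark NLP on the classpath.
    spark = sparknlp.start()

    # Hypothetical path to a local PubTator corpus file.
    pub_tator_file = "./corpus_pubtator_sample.txt"

    # Each row corresponds to one document (doc_id) with pre-finished
    # token, POS, and NER columns plus their metadata.
    df = PubTator().readDataset(spark, pub_tator_file)
    df.printSchema()
    df.select("doc_id", "finished_token", "finished_ner").show(1, truncate=50)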