Uses the reticulate package to import the flair.datasets module from the
Flair Python library, making Flair's datasets available for use in
an R environment.
References
Python equivalent:
from flair.datasets import UD_ENGLISH
corpus = UD_ENGLISH().downsample(0.1)
See also
https://github.com/flairNLP/flair for additional information on Flair's capabilities and datasets in NLP.
Examples
if (FALSE) { # \dontrun{
UD_ENGLISH <- flair_datasets()$UD_ENGLISH
corpus <- UD_ENGLISH()$downsample(0.1)} # }
# print the names of everything exposed by flair.datasets (dataset classes, helper classes, and submodules)
names(flair_datasets())
#> [1] "AGNEWS"
#> [2] "AMAZON_REVIEWS"
#> [3] "ANAT_EM"
#> [4] "AZDZ"
#> [5] "BC2GM"
#> [6] "BIOBERT_CHEMICAL_BC4CHEMD"
#> [7] "BIOBERT_CHEMICAL_BC5CDR"
#> [8] "BIOBERT_DISEASE_BC5CDR"
#> [9] "BIOBERT_DISEASE_NCBI"
#> [10] "BIOBERT_GENE_BC2GM"
#> [11] "BIOBERT_GENE_JNLPBA"
#> [12] "BIOBERT_SPECIES_LINNAEUS"
#> [13] "BIOBERT_SPECIES_S800"
#> [14] "BIONLP2013_CG"
#> [15] "BIONLP2013_PC"
#> [16] "BIOSCOPE"
#> [17] "BIOSEMANTICS"
#> [18] "BIO_INFER"
#> [19] "CDR"
#> [20] "CELL_FINDER"
#> [21] "CEMP"
#> [22] "CHEMDNER"
#> [23] "CLL"
#> [24] "COMMUNICATIVE_FUNCTIONS"
#> [25] "CONLL_03"
#> [26] "CONLL_03_DUTCH"
#> [27] "CONLL_03_GERMAN"
#> [28] "CONLL_03_SPANISH"
#> [29] "CONLL_2000"
#> [30] "CRAFT"
#> [31] "CRAFT_V4"
#> [32] "CSVClassificationCorpus"
#> [33] "CSVClassificationDataset"
#> [34] "CTD_CHEMICALS_DICTIONARY"
#> [35] "CTD_DISEASES_DICTIONARY"
#> [36] "ClassificationCorpus"
#> [37] "ClassificationDataset"
#> [38] "ColumnCorpus"
#> [39] "ColumnDataset"
#> [40] "DECA"
#> [41] "DataLoader"
#> [42] "DataPairCorpus"
#> [43] "DataPairDataset"
#> [44] "DataTripleCorpus"
#> [45] "DataTripleDataset"
#> [46] "EntityLinkingDictionary"
#> [47] "FEWNERD"
#> [48] "FSU"
#> [49] "FeideggerCorpus"
#> [50] "FeideggerDataset"
#> [51] "FlairDatapointDataset"
#> [52] "GELLUS"
#> [53] "GERMEVAL_2018_OFFENSIVE_LANGUAGE"
#> [54] "GLUE_COLA"
#> [55] "GLUE_MNLI"
#> [56] "GLUE_MRPC"
#> [57] "GLUE_QNLI"
#> [58] "GLUE_QQP"
#> [59] "GLUE_RTE"
#> [60] "GLUE_SST2"
#> [61] "GLUE_STSB"
#> [62] "GLUE_WNLI"
#> [63] "GO_EMOTIONS"
#> [64] "GPRO"
#> [65] "HUNER_CELL_LINE"
#> [66] "HUNER_CELL_LINE_CELL_FINDER"
#> [67] "HUNER_CELL_LINE_CLL"
#> [68] "HUNER_CELL_LINE_GELLUS"
#> [69] "HUNER_CELL_LINE_JNLPBA"
#> [70] "HUNER_CHEMICAL"
#> [71] "HUNER_CHEMICAL_CDR"
#> [72] "HUNER_CHEMICAL_CEMP"
#> [73] "HUNER_CHEMICAL_CHEBI"
#> [74] "HUNER_CHEMICAL_CHEMDNER"
#> [75] "HUNER_CHEMICAL_CRAFT_V4"
#> [76] "HUNER_CHEMICAL_SCAI"
#> [77] "HUNER_DISEASE"
#> [78] "HUNER_DISEASE_CDR"
#> [79] "HUNER_DISEASE_MIRNA"
#> [80] "HUNER_DISEASE_NCBI"
#> [81] "HUNER_DISEASE_PDR"
#> [82] "HUNER_DISEASE_SCAI"
#> [83] "HUNER_DISEASE_VARIOME"
#> [84] "HUNER_GENE"
#> [85] "HUNER_GENE_BC2GM"
#> [86] "HUNER_GENE_BIO_INFER"
#> [87] "HUNER_GENE_CELL_FINDER"
#> [88] "HUNER_GENE_CHEBI"
#> [89] "HUNER_GENE_CRAFT_V4"
#> [90] "HUNER_GENE_DECA"
#> [91] "HUNER_GENE_FSU"
#> [92] "HUNER_GENE_GPRO"
#> [93] "HUNER_GENE_IEPA"
#> [94] "HUNER_GENE_JNLPBA"
#> [95] "HUNER_GENE_LOCTEXT"
#> [96] "HUNER_GENE_MIRNA"
#> [97] "HUNER_GENE_OSIRIS"
#> [98] "HUNER_GENE_VARIOME"
#> [99] "HUNER_SPECIES"
#> [100] "HUNER_SPECIES_CELL_FINDER"
#> [101] "HUNER_SPECIES_CHEBI"
#> [102] "HUNER_SPECIES_CRAFT_V4"
#> [103] "HUNER_SPECIES_LINNEAUS"
#> [104] "HUNER_SPECIES_LOCTEXT"
#> [105] "HUNER_SPECIES_MIRNA"
#> [106] "HUNER_SPECIES_S800"
#> [107] "HUNER_SPECIES_VARIOME"
#> [108] "HunerEntityLinkingDictionary"
#> [109] "IEPA"
#> [110] "IMDB"
#> [111] "JNLPBA"
#> [112] "KEYPHRASE_INSPEC"
#> [113] "KEYPHRASE_SEMEVAL2010"
#> [114] "KEYPHRASE_SEMEVAL2017"
#> [115] "LINNEAUS"
#> [116] "LOCTEXT"
#> [117] "MASAKHA_POS"
#> [118] "MIRNA"
#> [119] "MongoDataset"
#> [120] "NCBI_DISEASE"
#> [121] "NCBI_GENE_HUMAN_DICTIONARY"
#> [122] "NCBI_TAXONOMY_DICTIONARY"
#> [123] "NEL_ENGLISH_AIDA"
#> [124] "NEL_ENGLISH_AQUAINT"
#> [125] "NEL_ENGLISH_IITB"
#> [126] "NEL_ENGLISH_REDDIT"
#> [127] "NEL_ENGLISH_TWEEKI"
#> [128] "NEL_GERMAN_HIPE"
#> [129] "NER_ARABIC_ANER"
#> [130] "NER_ARABIC_AQMAR"
#> [131] "NER_BASQUE"
#> [132] "NER_CHINESE_WEIBO"
#> [133] "NER_DANISH_DANE"
#> [134] "NER_ENGLISH_MOVIE_COMPLEX"
#> [135] "NER_ENGLISH_MOVIE_SIMPLE"
#> [136] "NER_ENGLISH_PERSON"
#> [137] "NER_ENGLISH_RESTAURANT"
#> [138] "NER_ENGLISH_SEC_FILLINGS"
#> [139] "NER_ENGLISH_STACKOVERFLOW"
#> [140] "NER_ENGLISH_TWITTER"
#> [141] "NER_ENGLISH_WEBPAGES"
#> [142] "NER_ENGLISH_WIKIGOLD"
#> [143] "NER_ENGLISH_WNUT_2020"
#> [144] "NER_ESTONIAN_NOISY"
#> [145] "NER_FINNISH"
#> [146] "NER_GERMAN_BIOFID"
#> [147] "NER_GERMAN_EUROPARL"
#> [148] "NER_GERMAN_GERMEVAL"
#> [149] "NER_GERMAN_LEGAL"
#> [150] "NER_GERMAN_MOBIE"
#> [151] "NER_GERMAN_POLITICS"
#> [152] "NER_HIPE_2022"
#> [153] "NER_HUNGARIAN"
#> [154] "NER_ICDAR_EUROPEANA"
#> [155] "NER_ICELANDIC"
#> [156] "NER_JAPANESE"
#> [157] "NER_MASAKHANE"
#> [158] "NER_MULTI_CONER"
#> [159] "NER_MULTI_CONER_V2"
#> [160] "NER_MULTI_WIKIANN"
#> [161] "NER_MULTI_WIKINER"
#> [162] "NER_MULTI_XTREME"
#> [163] "NER_NERMUD"
#> [164] "NER_SWEDISH"
#> [165] "NER_TURKU"
#> [166] "NER_UKRAINIAN"
#> [167] "NEWSGROUPS"
#> [168] "ONTONOTES"
#> [169] "OSIRIS"
#> [170] "OcrJsonDataset"
#> [171] "OpusParallelCorpus"
#> [172] "PDR"
#> [173] "ParallelTextCorpus"
#> [174] "ParallelTextDataset"
#> [175] "RE_ENGLISH_CONLL04"
#> [176] "RE_ENGLISH_DRUGPROT"
#> [177] "RE_ENGLISH_SEMEVAL2010"
#> [178] "RE_ENGLISH_TACRED"
#> [179] "S800"
#> [180] "SCAI_CHEMICALS"
#> [181] "SCAI_DISEASE"
#> [182] "SENTEVAL_CR"
#> [183] "SENTEVAL_MPQA"
#> [184] "SENTEVAL_MR"
#> [185] "SENTEVAL_SST_BINARY"
#> [186] "SENTEVAL_SST_GRANULAR"
#> [187] "SENTEVAL_SUBJ"
#> [188] "SENTIMENT_140"
#> [189] "SROIE"
#> [190] "STACKOVERFLOW"
#> [191] "SUPERGLUE_RTE"
#> [192] "SentenceDataset"
#> [193] "StringDataset"
#> [194] "TREC_50"
#> [195] "TREC_6"
#> [196] "UD_AFRIKAANS"
#> [197] "UD_ANCIENT_GREEK"
#> [198] "UD_ARABIC"
#> [199] "UD_ARMENIAN"
#> [200] "UD_BASQUE"
#> [201] "UD_BAVARIAN_MAIBAAM"
#> [202] "UD_BELARUSIAN"
#> [203] "UD_BULGARIAN"
#> [204] "UD_BURYAT"
#> [205] "UD_CATALAN"
#> [206] "UD_CHINESE"
#> [207] "UD_CHINESE_KYOTO"
#> [208] "UD_COPTIC"
#> [209] "UD_CROATIAN"
#> [210] "UD_CZECH"
#> [211] "UD_DANISH"
#> [212] "UD_DUTCH"
#> [213] "UD_ENGLISH"
#> [214] "UD_ESTONIAN"
#> [215] "UD_FAROESE"
#> [216] "UD_FINNISH"
#> [217] "UD_FRENCH"
#> [218] "UD_GALICIAN"
#> [219] "UD_GERMAN"
#> [220] "UD_GERMAN_HDT"
#> [221] "UD_GOTHIC"
#> [222] "UD_GREEK"
#> [223] "UD_HEBREW"
#> [224] "UD_HINDI"
#> [225] "UD_INDONESIAN"
#> [226] "UD_IRISH"
#> [227] "UD_ITALIAN"
#> [228] "UD_JAPANESE"
#> [229] "UD_KAZAKH"
#> [230] "UD_KOREAN"
#> [231] "UD_LATIN"
#> [232] "UD_LATVIAN"
#> [233] "UD_LITHUANIAN"
#> [234] "UD_LIVVI"
#> [235] "UD_MALTESE"
#> [236] "UD_MARATHI"
#> [237] "UD_NAIJA"
#> [238] "UD_NORTH_SAMI"
#> [239] "UD_NORWEGIAN"
#> [240] "UD_OLD_CHURCH_SLAVONIC"
#> [241] "UD_OLD_FRENCH"
#> [242] "UD_PERSIAN"
#> [243] "UD_POLISH"
#> [244] "UD_PORTUGUESE"
#> [245] "UD_ROMANIAN"
#> [246] "UD_RUSSIAN"
#> [247] "UD_SERBIAN"
#> [248] "UD_SLOVAK"
#> [249] "UD_SLOVENIAN"
#> [250] "UD_SPANISH"
#> [251] "UD_SWEDISH"
#> [252] "UD_TURKISH"
#> [253] "UD_UKRAINIAN"
#> [254] "UD_WOLOF"
#> [255] "UP_CHINESE"
#> [256] "UP_ENGLISH"
#> [257] "UP_FINNISH"
#> [258] "UP_FRENCH"
#> [259] "UP_GERMAN"
#> [260] "UP_ITALIAN"
#> [261] "UP_SPANISH"
#> [262] "UP_SPANISH_ANCORA"
#> [263] "UniversalDependenciesCorpus"
#> [264] "UniversalDependenciesDataset"
#> [265] "VARIOME"
#> [266] "WASSA_ANGER"
#> [267] "WASSA_FEAR"
#> [268] "WASSA_JOY"
#> [269] "WASSA_SADNESS"
#> [270] "WNUT_17"
#> [271] "WSD_MASC"
#> [272] "WSD_OMSTI"
#> [273] "WSD_RAGANATO_ALL"
#> [274] "WSD_SEMCOR"
#> [275] "WSD_TRAINOMATIC"
#> [276] "WSD_UFSAC"
#> [277] "WSD_WORDNET_GLOSS_TAGGED"
#> [278] "YAHOO_ANSWERS"
#> [279] "ZELDA"
#> [280] "base"
#> [281] "biomedical"
#> [282] "document_classification"
#> [283] "entity_linking"
#> [284] "ocr"
#> [285] "relation_extraction"
#> [286] "sequence_labeling"
#> [287] "text_image"
#> [288] "text_text"
#> [289] "treebanks"