language/v1/language_syntax

#
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DO NOT EDIT! This is a generated sample ("Request", "language_syntax_gcs")

# To install the latest published package dependency, execute the following:
#   pip install google-cloud-language

# sample-metadata
#   title: Analyzing Syntax (GCS)
#   description: Analyzing Syntax in text file stored in Cloud Storage
#   usage: python3 samples/v1/language_syntax_gcs.py [--gcs_content_uri "gs://cloud-samples-data/language/syntax-sentence.txt"]

# [START language_syntax_gcs]
from google.cloud import language_v1


def sample_analyze_syntax(gcs_content_uri):
    """
    Run syntax analysis on a text file stored in Cloud Storage and print it.

    For every token in the response this prints the token text, its offset
    in the document, part-of-speech tag, voice, tense, lemma, head token
    index and dependency label. The language reported by the response is
    printed last.

    Args:
      gcs_content_uri Google Cloud Storage URI where the file content is
        located, e.g. gs://[Your Bucket]/[Path to File]
    """

    service = language_v1.LanguageServiceClient()

    # Example input:
    # gcs_content_uri = 'gs://cloud-samples-data/language/syntax-sentence.txt'

    request = {
        "document": {
            "gcs_content_uri": gcs_content_uri,
            # Available types: PLAIN_TEXT, HTML
            "type_": language_v1.Document.Type.PLAIN_TEXT,
            # Optional. When omitted, the language is detected automatically.
            # Supported languages are listed at:
            # https://cloud.google.com/natural-language/docs/languages
            "language": "en",
        },
        # Available values: NONE, UTF8, UTF16, UTF32
        "encoding_type": language_v1.EncodingType.UTF8,
    }
    response = service.analyze_syntax(request=request)

    # Report on each token the API returned, one after another.
    for token in response.tokens:
        # The token's text span; usually a word or a punctuation mark.
        text = token.text
        print(f"Token text: {text.content}")
        print(f"Location of this token in overall document: {text.begin_offset}")

        # Part-of-speech information for the token. The tag set is defined in:
        # http://www.lrec-conf.org/proceedings/lrec2012/pdf/274_Paper.pdf
        # See the API reference for the other part-of-speech fields available.
        pos = token.part_of_speech

        # Tag, e.g. NOUN, ADJ for Adjective, et al.
        tag_name = language_v1.PartOfSpeech.Tag(pos.tag).name
        print("Part of Speech tag: {}".format(tag_name))

        # Voice, e.g. ACTIVE or PASSIVE
        voice_name = language_v1.PartOfSpeech.Voice(pos.voice).name
        print("Voice: {}".format(voice_name))

        # Tense, e.g. PAST, FUTURE, PRESENT, et al.
        tense_name = language_v1.PartOfSpeech.Tense(pos.tense).name
        print("Tense: {}".format(tense_name))

        # Lemma of the token. Background on lemmas:
        # https://en.wikipedia.org/wiki/Lemma_(morphology)
        print(f"Lemma: {token.lemma}")

        # Dependency-tree parse information for the token. Dependency labels
        # are described in: http://www.aclweb.org/anthology/P13-2017
        dependency_edge = token.dependency_edge
        print(f"Head token index: {dependency_edge.head_token_index}")
        label_name = language_v1.DependencyEdge.Label(dependency_edge.label).name
        print("Label: {}".format(label_name))

    # The response language equals the language given in the request or,
    # when none was given, the automatically detected one.
    print(f"Language of the text: {response.language}")


# [END language_syntax_gcs]


def main():
    """Read --gcs_content_uri from the command line and run the sample."""
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--gcs_content_uri",
        default="gs://cloud-samples-data/language/syntax-sentence.txt",
        type=str,
    )
    sample_analyze_syntax(cli.parse_args().gcs_content_uri)


if __name__ == "__main__":
    main()

language/v1/language_syntax_gcs.py (55 lines of code) (raw):