Updated the function submit_query_input_in_chunks by reducing the number of initial jobs and the default slice length, and by adding an if block to control the submission of the first batch of jobs

2019-03-27 18:45:24 -04:00 · 2019-03-27 18:45:24 -04:00 · 7cca77380b
parent 0bb37b1178
commit 7cca77380b
2 changed files with 58 additions and 47 deletions
--- a/2
+++ b/2
@ -1,5 +1,5 @@
 source "https://rubygems.org"
-#ruby-2.3.0-dev
+#ruby-2.3.0

 gem 'rest-client'
 gem 'yard'
--- a/lib/classyfire_api.rb
+++ b/lib/classyfire_api.rb
@ -34,7 +34,7 @@ module ClassyFireAPI
  #
  # @param query_id [Integer] the ID of the query.
  # @param format [String] the format of the query (either JSON, CSV, or SDF)
-  # @return [Text] A text file displaying the classification results for 
+  # @return [Text] A text file displaying the classification results for
  # the query's entities in the specified format.
  def ClassyFireAPI.get_query(query_id,format="json")
    begin
@ -53,13 +53,13 @@ module ClassyFireAPI
      e.response
    rescue RestClient::RequestTimeout => e
      e.response
-    end      
+    end
  end

  # Return data for the TaxNode with ID chemontid.
  #
  # @param chemontid [String] the ChemOnt ID of the entity.
-  # @return [Text] A text displaying the classification results for the entity in the specified format. 
+  # @return [Text] A text displaying the classification results for the entity in the specified format.
   # Use JSON.parse to get the JSON object.
  def ClassyFireAPI.get_chemont_node(chemontid)
    chemont_id = chemontid.to_s.gsub("CHEMONTID:","C")
@ -86,7 +86,7 @@ module ClassyFireAPI
    begin
      if format == "json"
        RestClient.get "#{URL}/entities/#{fingerprint}.#{format}", :accept => :json
-      end 
+      end
    rescue RestClient::ResourceNotFound => e
      e.response
    rescue  RestClient::InternalServerError => e
@ -112,7 +112,7 @@ module ClassyFireAPI
        RestClient.get "#{URL}/entities/#{inchikey_id}.#{format}", :accept => :sdf
      elsif format == "csv"
        RestClient.get "#{URL}/entities/#{inchikey_id}.#{format}", :accept => :csv
-      end 
+      end
    rescue RestClient::ResourceNotFound => e
      e.response
    rescue  RestClient::InternalServerError => e
@ -138,82 +138,92 @@ module ClassyFireAPI
  end

  # Takes a tab-separated file and submit the contained structures in bulks of a given size
-  # 
+  #
  # For 'STRUCTURE' or 'IUPAC_NAME'query types, each line must contain either
   #   1) Only a structural representation: SMILES, InChI for the 'STRUCTURE' query_type or an IUPAC name
  #     for the 'IUPAC NAME' query type.
-  #   2) a tab-separated pair of an ID and the corresponding sructure representation: SMILES, InChI for the 
+  #   2) a tab-separated pair of an ID and the corresponding sructure representation: SMILES, InChI for the
  #     'STRUCTURE' query_type or a IUPAC name for the 'IUPAC NAME' query type.
-  # 
+  #
  # For 'FASTA' query type, just submit the query as a standard FASTA text.
  # @param input_file [Text] The path to the input file.
-  # @param: slice_length [Integer] The maximum number of entries for each query input (the whole file 
+  # @param: slice_length [Integer] The maximum number of entries for each query input (the whole file
  # is fragmented into n part of #slice_length entries each).
   # @param: start [Integer] The starting index. Submit fragments from the index 'start'.
-  def ClassyFireAPI.submit_query_input_in_chunks(input_file,slice_length=1000, start=1, type='STRUCTURE')
+  def ClassyFireAPI.submit_query_input_in_chunks(input_file,slice_length=10, start=1, type='STRUCTURE')
    @start_time = Time.now
    absolute_path = File.expand_path(input_file)
    f             = File.open(absolute_path, 'r')
    input         = []

+
    lines = File.readlines(absolute_path)
+    puts lines.length, lines[0]
    i = 0
    lines.uniq.each do |line|
      i += 1
      sline = line.strip.split("\t")
      if sline.length == 1
        input <<"#{sline[0]}"
-      elsif sline.length == 2
+      elsif sline.length >= 2
        #ID\tSMILES (OR INCHI, OR VALID IUPAC NAME)
        input <<"#{sline[0]}\t#{sline[1]}"
      end
      # input <<"#{sline[0]}"
    end
-
+    # puts "=============",input.length, input[0]
    query_ids = []
    subdivised_groups = input.uniq.each_slice(slice_length).to_a
    puts "nr of subdivised_groups: #{subdivised_groups.length}"
    # puts subdivised_groups[0]
-    sleeping_time = 240
-    initial_nr_of_jobs = 30
+    sleeping_time = 60
+    initial_nr_of_jobs = 2
    i = start

-    while i <  initial_nr_of_jobs
+    if i < initial_nr_of_jobs
+      while i <  initial_nr_of_jobs

-      title = File.basename(absolute_path).split(".")[0] + "_yannick" + "_part_#{i}"
+        title = File.basename(absolute_path).split(".")[0] + "_yannick" + "_part_#{i}"

-      if i <= subdivised_groups.length
-        puts "\n\n\n\n---------------------- -----------"
-        begin
-          puts "submitting #{title}"
-          # puts subdivised_groups[i-1].join("\n")
-          q       = submit_query(title,subdivised_groups[i-1].join("\n"),type)
-          # puts q
-          query_ids << JSON.parse(q)['id']
-        rescue Exception => e  
-          puts e.message  
-          puts e.backtrace.inspect
+        if i <= subdivised_groups.length
+          puts "\n\n\n\n---------------------- -----------"
+          begin
+            puts "submitting #{title}"
+            # puts subdivised_groups[i-1].join("\n")
+            q       = submit_query(title,subdivised_groups[i-1].join("\n"),type)
+            puts JSON.parse(q)['id']
+            query_ids << JSON.parse(q)['id']
+            sleep(10)
+          rescue Exception => e
+            puts e.message
+            puts e.backtrace.inspect
+          end
+          i = i + 1
+        else
+          break
        end
-        i = i + 1
-      else
-        break
+        query_ids
      end
+      puts "Going to sleep at #{Time.now - @start_time} for #{sleeping_time} s."
+      sleep(sleeping_time)
+      puts "Waking up at #{Time.now - @start_time}"
    end

-    puts "Going to sleep at #{Time.now - @start_time} for #{sleeping_time} s."
-    sleep(sleeping_time)
-    puts "Waking up at #{Time.now - @start_time}"
+

    while i >= initial_nr_of_jobs && i < subdivised_groups.length
      k = 0
-      for k in (i..(i + initial_nr_of_jobs))
+      for k in (i...(i + initial_nr_of_jobs))
        title     = File.basename(absolute_path).split(".")[0] + "_yannick" + "_part_#{k}"
        i = i + 1
        begin
          puts "submitting #{title}"
          q = submit_query(title,subdivised_groups[k-1].join("\n"),type)
-        rescue Exception => e  
-          puts e.message  
+          puts JSON.parse(q)['id']
+          query_ids << JSON.parse(q)['id']
+          sleep(10)
+        rescue Exception => e
+          puts e.message
          puts e.backtrace.inspect
        end
      end
@ -222,6 +232,7 @@ module ClassyFireAPI
      sleep(sleeping_time)
      puts "Waking up at #{Time.now - @start_time}"
    end
+
  end

   # Takes each file in a folder, and submits the contained structures in bulks of a given size.
@ -229,13 +240,13 @@ module ClassyFireAPI
  # For 'STRUCTURE' or 'IUPAC_NAME'query types, each line must contain either
   #   1) Only a structural representation: SMILES, InChI for the 'STRUCTURE' query_type or an IUPAC name
  #     for the 'IUPAC NAME' query type.
-  #   2) a tab-separated pair of an ID and the corresponding sructure representation: SMILES, InChI for the 
+  #   2) a tab-separated pair of an ID and the corresponding sructure representation: SMILES, InChI for the
  #     'STRUCTURE' query_type or a IUPAC name for the 'IUPAC NAME' query type.
-  # 
+  #
  # For 'FASTA' query type, just submit the query as a standard FASTA text.
  # @param: input_file [String] The path to the folder.
-  # @param: slice_length [Integer] The maximum number of entries for each query input (each file 
-  # is fragmented into n part of #slice_length entries each), 'integer'  
+  # @param: slice_length [Integer] The maximum number of entries for each query input (each file
+  # is fragmented into n part of #slice_length entries each), 'integer'
  # @param type [String] the query_type 'STRUCTURE' (default) or 'IUPAC_NAME' or 'FASTA'
  def ClassyFireAPI.submit_queries_from_directory(folder,slice_length,type="STRUCTURE")
    if File.directory?(folder)
@ -249,7 +260,7 @@ module ClassyFireAPI
  end


-  # Reads a tab separated file, and use the structure representation 
+  # Reads a tab separated file, and use the structure representation
   # to retrieve the structure's classification from ClassyFire.
  #
  # @param input [String] path to the input file.
@ -311,7 +322,7 @@ module ClassyFireAPI
        # res += ","
      rescue Exception => e
        e.message
-      end        
+      end
    end
    res += "]}"
    f_output.print res
@ -328,7 +339,7 @@ module ClassyFireAPI
    h             = Hash.new
    directory     = absolute_path.split('/')[0...-1].join("/")
    f_output      = File.new(output, 'w')
-    puts 
+    puts
    res = String.new

    res += "{"
@ -347,7 +358,7 @@ module ClassyFireAPI
        puts i
        # puts "#{key} :: #{h[key]}"
        begin
-          inchikey = %x(obabel -:"#{h[key]}" -oinchikey).strip.split("\t")[0] 
+          inchikey = %x(obabel -:"#{h[key]}" -oinchikey).strip.split("\t")[0]
          # puts inchikey
          qr = JSON.parse(ClassyFireAPI.get_entity_classification(inchikey,format="json"))
          qr['identifier'] = key
@ -371,7 +382,7 @@ module ClassyFireAPI
        # res += ","
      rescue Exception => e
        e.message
-      end        
+      end
    end
    res += "]}"
    f_output.print res
@ -439,7 +450,7 @@ module ClassyFireAPI
        end
      rescue Exception => e
        e.message
-      end        
+      end
    end
  end
 end