Updated the function submit_query_input_in_chunks by reducing the number of initial jobs and the default slice length, and by adding an if block to control the submission of the first batch of jobs

This commit is contained in:
yandj 2019-03-27 18:45:24 -04:00
parent 0bb37b1178
commit 7cca77380b
2 changed files with 58 additions and 47 deletions

View File

@ -1,5 +1,5 @@
source "https://rubygems.org" source "https://rubygems.org"
#ruby-2.3.0-dev #ruby-2.3.0
gem 'rest-client' gem 'rest-client'
gem 'yard' gem 'yard'

View File

@ -34,7 +34,7 @@ module ClassyFireAPI
# #
# @param query_id [Integer] the ID of the query. # @param query_id [Integer] the ID of the query.
# @param format [String] the format of the query (either JSON, CSV, or SDF) # @param format [String] the format of the query (either JSON, CSV, or SDF)
# @return [Text] A text file displaying the classification results for # @return [Text] A text file displaying the classification results for
# the query's entities in the specified format. # the query's entities in the specified format.
def ClassyFireAPI.get_query(query_id,format="json") def ClassyFireAPI.get_query(query_id,format="json")
begin begin
@ -53,13 +53,13 @@ module ClassyFireAPI
e.response e.response
rescue RestClient::RequestTimeout => e rescue RestClient::RequestTimeout => e
e.response e.response
end end
end end
# Return data for the TaxNode with ID chemontid. # Return data for the TaxNode with ID chemontid.
# #
# @param chemontid [String] the ChemOnt ID of the entity. # @param chemontid [String] the ChemOnt ID of the entity.
# @return [Text] A text displaying the classification results for the entity in the specified format. # @return [Text] A text displaying the classification results for the entity in the specified format.
# Use JSON.parse to get the JSON object. # Use JSON.parse to get the JSON object.
def ClassyFireAPI.get_chemont_node(chemontid) def ClassyFireAPI.get_chemont_node(chemontid)
chemont_id = chemontid.to_s.gsub("CHEMONTID:","C") chemont_id = chemontid.to_s.gsub("CHEMONTID:","C")
@ -86,7 +86,7 @@ module ClassyFireAPI
begin begin
if format == "json" if format == "json"
RestClient.get "#{URL}/entities/#{fingerprint}.#{format}", :accept => :json RestClient.get "#{URL}/entities/#{fingerprint}.#{format}", :accept => :json
end end
rescue RestClient::ResourceNotFound => e rescue RestClient::ResourceNotFound => e
e.response e.response
rescue RestClient::InternalServerError => e rescue RestClient::InternalServerError => e
@ -112,7 +112,7 @@ module ClassyFireAPI
RestClient.get "#{URL}/entities/#{inchikey_id}.#{format}", :accept => :sdf RestClient.get "#{URL}/entities/#{inchikey_id}.#{format}", :accept => :sdf
elsif format == "csv" elsif format == "csv"
RestClient.get "#{URL}/entities/#{inchikey_id}.#{format}", :accept => :csv RestClient.get "#{URL}/entities/#{inchikey_id}.#{format}", :accept => :csv
end end
rescue RestClient::ResourceNotFound => e rescue RestClient::ResourceNotFound => e
e.response e.response
rescue RestClient::InternalServerError => e rescue RestClient::InternalServerError => e
@ -138,82 +138,92 @@ module ClassyFireAPI
end end
# Takes a tab-separated file and submit the contained structures in bulks of a given size # Takes a tab-separated file and submit the contained structures in bulks of a given size
# #
# For 'STRUCTURE' or 'IUPAC_NAME'query types, each line must contain either # For 'STRUCTURE' or 'IUPAC_NAME'query types, each line must contain either
# 1) Only a structural representation: SMILES, InChI for the 'STRUCTURE' query_type or an IUPAC name # 1) Only a structural representation: SMILES, InChI for the 'STRUCTURE' query_type or an IUPAC name
# for the 'IUPAC NAME' query type. # for the 'IUPAC NAME' query type.
# 2) a tab-separated pair of an ID and the corresponding structure representation: SMILES, InChI for the # 2) a tab-separated pair of an ID and the corresponding structure representation: SMILES, InChI for the
# 'STRUCTURE' query_type or a IUPAC name for the 'IUPAC NAME' query type. # 'STRUCTURE' query_type or a IUPAC name for the 'IUPAC NAME' query type.
# #
# For 'FASTA' query type, just submit the query as a standard FASTA text. # For 'FASTA' query type, just submit the query as a standard FASTA text.
# @param input_file [Text] The path to the input file. # @param input_file [Text] The path to the input file.
# @param: slice_length [Integer] The maximum number of entries for each query input (the whole file # @param: slice_length [Integer] The maximum number of entries for each query input (the whole file
# is fragmented into n part of #slice_length entries each). # is fragmented into n part of #slice_length entries each).
# @param: start [Integer] The starting index. Submit fragments from the index 'start'. # @param: start [Integer] The starting index. Submit fragments from the index 'start'.
def ClassyFireAPI.submit_query_input_in_chunks(input_file,slice_length=1000, start=1, type='STRUCTURE') def ClassyFireAPI.submit_query_input_in_chunks(input_file,slice_length=10, start=1, type='STRUCTURE')
@start_time = Time.now @start_time = Time.now
absolute_path = File.expand_path(input_file) absolute_path = File.expand_path(input_file)
f = File.open(absolute_path, 'r') f = File.open(absolute_path, 'r')
input = [] input = []
lines = File.readlines(absolute_path) lines = File.readlines(absolute_path)
puts lines.length, lines[0]
i = 0 i = 0
lines.uniq.each do |line| lines.uniq.each do |line|
i += 1 i += 1
sline = line.strip.split("\t") sline = line.strip.split("\t")
if sline.length == 1 if sline.length == 1
input <<"#{sline[0]}" input <<"#{sline[0]}"
elsif sline.length == 2 elsif sline.length >= 2
#ID\tSMILES (OR INCHI, OR VALID IUPAC NAME) #ID\tSMILES (OR INCHI, OR VALID IUPAC NAME)
input <<"#{sline[0]}\t#{sline[1]}" input <<"#{sline[0]}\t#{sline[1]}"
end end
# input <<"#{sline[0]}" # input <<"#{sline[0]}"
end end
# puts "=============",input.length, input[0]
query_ids = [] query_ids = []
subdivised_groups = input.uniq.each_slice(slice_length).to_a subdivised_groups = input.uniq.each_slice(slice_length).to_a
puts "nr of subdivised_groups: #{subdivised_groups.length}" puts "nr of subdivised_groups: #{subdivised_groups.length}"
# puts subdivised_groups[0] # puts subdivised_groups[0]
sleeping_time = 240 sleeping_time = 60
initial_nr_of_jobs = 30 initial_nr_of_jobs = 2
i = start i = start
while i < initial_nr_of_jobs if i < initial_nr_of_jobs
while i < initial_nr_of_jobs
title = File.basename(absolute_path).split(".")[0] + "_yannick" + "_part_#{i}" title = File.basename(absolute_path).split(".")[0] + "_yannick" + "_part_#{i}"
if i <= subdivised_groups.length if i <= subdivised_groups.length
puts "\n\n\n\n---------------------- -----------" puts "\n\n\n\n---------------------- -----------"
begin begin
puts "submitting #{title}" puts "submitting #{title}"
# puts subdivised_groups[i-1].join("\n") # puts subdivised_groups[i-1].join("\n")
q = submit_query(title,subdivised_groups[i-1].join("\n"),type) q = submit_query(title,subdivised_groups[i-1].join("\n"),type)
# puts q puts JSON.parse(q)['id']
query_ids << JSON.parse(q)['id'] query_ids << JSON.parse(q)['id']
rescue Exception => e sleep(10)
puts e.message rescue Exception => e
puts e.backtrace.inspect puts e.message
puts e.backtrace.inspect
end
i = i + 1
else
break
end end
i = i + 1 query_ids
else
break
end end
puts "Going to sleep at #{Time.now - @start_time} for #{sleeping_time} s."
sleep(sleeping_time)
puts "Waking up at #{Time.now - @start_time}"
end end
puts "Going to sleep at #{Time.now - @start_time} for #{sleeping_time} s."
sleep(sleeping_time)
puts "Waking up at #{Time.now - @start_time}"
while i >= initial_nr_of_jobs && i < subdivised_groups.length while i >= initial_nr_of_jobs && i < subdivised_groups.length
k = 0 k = 0
for k in (i..(i + initial_nr_of_jobs)) for k in (i...(i + initial_nr_of_jobs))
title = File.basename(absolute_path).split(".")[0] + "_yannick" + "_part_#{k}" title = File.basename(absolute_path).split(".")[0] + "_yannick" + "_part_#{k}"
i = i + 1 i = i + 1
begin begin
puts "submitting #{title}" puts "submitting #{title}"
q = submit_query(title,subdivised_groups[k-1].join("\n"),type) q = submit_query(title,subdivised_groups[k-1].join("\n"),type)
rescue Exception => e puts JSON.parse(q)['id']
puts e.message query_ids << JSON.parse(q)['id']
sleep(10)
rescue Exception => e
puts e.message
puts e.backtrace.inspect puts e.backtrace.inspect
end end
end end
@ -222,6 +232,7 @@ module ClassyFireAPI
sleep(sleeping_time) sleep(sleeping_time)
puts "Waking up at #{Time.now - @start_time}" puts "Waking up at #{Time.now - @start_time}"
end end
end end
# Takes each file in a folder, and submits the contained structures in bulks of a given size. # Takes each file in a folder, and submits the contained structures in bulks of a given size.
@ -229,13 +240,13 @@ module ClassyFireAPI
# For 'STRUCTURE' or 'IUPAC_NAME'query types, each line must contain either # For 'STRUCTURE' or 'IUPAC_NAME'query types, each line must contain either
# 1) Only a structural representation: SMILES, InChI for the 'STRUCTURE' query_type or an IUPAC name # 1) Only a structural representation: SMILES, InChI for the 'STRUCTURE' query_type or an IUPAC name
# for the 'IUPAC NAME' query type. # for the 'IUPAC NAME' query type.
# 2) a tab-separated pair of an ID and the corresponding structure representation: SMILES, InChI for the # 2) a tab-separated pair of an ID and the corresponding structure representation: SMILES, InChI for the
# 'STRUCTURE' query_type or a IUPAC name for the 'IUPAC NAME' query type. # 'STRUCTURE' query_type or a IUPAC name for the 'IUPAC NAME' query type.
# #
# For 'FASTA' query type, just submit the query as a standard FASTA text. # For 'FASTA' query type, just submit the query as a standard FASTA text.
# @param: input_file [String] The path to the folder. # @param: input_file [String] The path to the folder.
# @param: slice_length [Integer] The maximum number of entries for each query input (each file # @param: slice_length [Integer] The maximum number of entries for each query input (each file
# is fragmented into n part of #slice_length entries each), 'integer' # is fragmented into n part of #slice_length entries each), 'integer'
# @param type [String] the query_type 'STRUCTURE' (default) or 'IUPAC_NAME' or 'FASTA' # @param type [String] the query_type 'STRUCTURE' (default) or 'IUPAC_NAME' or 'FASTA'
def ClassyFireAPI.submit_queries_from_directory(folder,slice_length,type="STRUCTURE") def ClassyFireAPI.submit_queries_from_directory(folder,slice_length,type="STRUCTURE")
if File.directory?(folder) if File.directory?(folder)
@ -249,7 +260,7 @@ module ClassyFireAPI
end end
# Reads a tab separated file, and use the structure representation # Reads a tab separated file, and use the structure representation
#to retrieve the structure's classification from ClassyFire. #to retrieve the structure's classification from ClassyFire.
# #
# @param input [String] path to the input file. # @param input [String] path to the input file.
@ -311,7 +322,7 @@ module ClassyFireAPI
# res += "," # res += ","
rescue Exception => e rescue Exception => e
e.message e.message
end end
end end
res += "]}" res += "]}"
f_output.print res f_output.print res
@ -328,7 +339,7 @@ module ClassyFireAPI
h = Hash.new h = Hash.new
directory = absolute_path.split('/')[0...-1].join("/") directory = absolute_path.split('/')[0...-1].join("/")
f_output = File.new(output, 'w') f_output = File.new(output, 'w')
puts puts
res = String.new res = String.new
res += "{" res += "{"
@ -347,7 +358,7 @@ module ClassyFireAPI
puts i puts i
# puts "#{key} :: #{h[key]}" # puts "#{key} :: #{h[key]}"
begin begin
inchikey = %x(obabel -:"#{h[key]}" -oinchikey).strip.split("\t")[0] inchikey = %x(obabel -:"#{h[key]}" -oinchikey).strip.split("\t")[0]
# puts inchikey # puts inchikey
qr = JSON.parse(ClassyFireAPI.get_entity_classification(inchikey,format="json")) qr = JSON.parse(ClassyFireAPI.get_entity_classification(inchikey,format="json"))
qr['identifier'] = key qr['identifier'] = key
@ -371,7 +382,7 @@ module ClassyFireAPI
# res += "," # res += ","
rescue Exception => e rescue Exception => e
e.message e.message
end end
end end
res += "]}" res += "]}"
f_output.print res f_output.print res
@ -439,7 +450,7 @@ module ClassyFireAPI
end end
rescue Exception => e rescue Exception => e
e.message e.message
end end
end end
end end
end end