Updated the function submit_query_input_in_chunks by reducing the number of initial jobs and the default slice length, and by adding an if block to control the submission of the first batch of jobs
This commit is contained in:
parent
0bb37b1178
commit
7cca77380b
2
Gemfile
2
Gemfile
|
@ -1,5 +1,5 @@
|
|||
source "https://rubygems.org"
|
||||
#ruby-2.3.0-dev
|
||||
#ruby-2.3.0
|
||||
|
||||
gem 'rest-client'
|
||||
gem 'yard'
|
||||
|
|
|
@ -34,7 +34,7 @@ module ClassyFireAPI
|
|||
#
|
||||
# @param query_id [Integer] the ID of the query.
|
||||
# @param format [String] the format of the query (either JSON, CSV, or SDF)
|
||||
# @return [Text] A text file displaying the classification results for
|
||||
# @return [Text] A text file displaying the classification results for
|
||||
# the query's entities in the specified format.
|
||||
def ClassyFireAPI.get_query(query_id,format="json")
|
||||
begin
|
||||
|
@ -53,13 +53,13 @@ module ClassyFireAPI
|
|||
e.response
|
||||
rescue RestClient::RequestTimeout => e
|
||||
e.response
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Return data for the TaxNode with ID chemontid.
|
||||
#
|
||||
# @param chemontid [String] the ChemOnt ID of the entity.
|
||||
# @return [Text] A text displaying the classification results for the entity in the specified format.
|
||||
# @return [Text] A text displaying the classification results for the entity in the specified format.
|
||||
# Use JSON.parse to get the JSON object.
|
||||
def ClassyFireAPI.get_chemont_node(chemontid)
|
||||
chemont_id = chemontid.to_s.gsub("CHEMONTID:","C")
|
||||
|
@ -86,7 +86,7 @@ module ClassyFireAPI
|
|||
begin
|
||||
if format == "json"
|
||||
RestClient.get "#{URL}/entities/#{fingerprint}.#{format}", :accept => :json
|
||||
end
|
||||
end
|
||||
rescue RestClient::ResourceNotFound => e
|
||||
e.response
|
||||
rescue RestClient::InternalServerError => e
|
||||
|
@ -112,7 +112,7 @@ module ClassyFireAPI
|
|||
RestClient.get "#{URL}/entities/#{inchikey_id}.#{format}", :accept => :sdf
|
||||
elsif format == "csv"
|
||||
RestClient.get "#{URL}/entities/#{inchikey_id}.#{format}", :accept => :csv
|
||||
end
|
||||
end
|
||||
rescue RestClient::ResourceNotFound => e
|
||||
e.response
|
||||
rescue RestClient::InternalServerError => e
|
||||
|
@ -138,82 +138,92 @@ module ClassyFireAPI
|
|||
end
|
||||
|
||||
# Takes a tab-separated file and submit the contained structures in bulks of a given size
|
||||
#
|
||||
#
|
||||
# For 'STRUCTURE' or 'IUPAC_NAME'query types, each line must contain either
|
||||
# 1) Only a structural representation: SMILES, InChI for the 'STRUCTURE' query_type or an IUPAC name
|
||||
# for the 'IUPAC NAME' query type.
|
||||
# 2) a tab-separated pair of an ID and the corresponding sructure representation: SMILES, InChI for the
|
||||
# 2) a tab-separated pair of an ID and the corresponding sructure representation: SMILES, InChI for the
|
||||
# 'STRUCTURE' query_type or a IUPAC name for the 'IUPAC NAME' query type.
|
||||
#
|
||||
#
|
||||
# For 'FASTA' query type, just submit the query as a standard FASTA text.
|
||||
# @param input_file [Text] The path to the input file.
|
||||
# @param: slice_length [Integer] The maximum number of entries for each query input (the whole file
|
||||
# @param: slice_length [Integer] The maximum number of entries for each query input (the whole file
|
||||
# is fragmented into n part of #slice_length entries each).
|
||||
# @param: start [Integer] The starting index. Submit fragments from the index 'start'.
|
||||
def ClassyFireAPI.submit_query_input_in_chunks(input_file,slice_length=1000, start=1, type='STRUCTURE')
|
||||
def ClassyFireAPI.submit_query_input_in_chunks(input_file,slice_length=10, start=1, type='STRUCTURE')
|
||||
@start_time = Time.now
|
||||
absolute_path = File.expand_path(input_file)
|
||||
f = File.open(absolute_path, 'r')
|
||||
input = []
|
||||
|
||||
|
||||
lines = File.readlines(absolute_path)
|
||||
puts lines.length, lines[0]
|
||||
i = 0
|
||||
lines.uniq.each do |line|
|
||||
i += 1
|
||||
sline = line.strip.split("\t")
|
||||
if sline.length == 1
|
||||
input <<"#{sline[0]}"
|
||||
elsif sline.length == 2
|
||||
elsif sline.length >= 2
|
||||
#ID\tSMILES (OR INCHI, OR VALID IUPAC NAME)
|
||||
input <<"#{sline[0]}\t#{sline[1]}"
|
||||
end
|
||||
# input <<"#{sline[0]}"
|
||||
end
|
||||
|
||||
# puts "=============",input.length, input[0]
|
||||
query_ids = []
|
||||
subdivised_groups = input.uniq.each_slice(slice_length).to_a
|
||||
puts "nr of subdivised_groups: #{subdivised_groups.length}"
|
||||
# puts subdivised_groups[0]
|
||||
sleeping_time = 240
|
||||
initial_nr_of_jobs = 30
|
||||
sleeping_time = 60
|
||||
initial_nr_of_jobs = 2
|
||||
i = start
|
||||
|
||||
while i < initial_nr_of_jobs
|
||||
if i < initial_nr_of_jobs
|
||||
while i < initial_nr_of_jobs
|
||||
|
||||
title = File.basename(absolute_path).split(".")[0] + "_yannick" + "_part_#{i}"
|
||||
title = File.basename(absolute_path).split(".")[0] + "_yannick" + "_part_#{i}"
|
||||
|
||||
if i <= subdivised_groups.length
|
||||
puts "\n\n\n\n---------------------- -----------"
|
||||
begin
|
||||
puts "submitting #{title}"
|
||||
# puts subdivised_groups[i-1].join("\n")
|
||||
q = submit_query(title,subdivised_groups[i-1].join("\n"),type)
|
||||
# puts q
|
||||
query_ids << JSON.parse(q)['id']
|
||||
rescue Exception => e
|
||||
puts e.message
|
||||
puts e.backtrace.inspect
|
||||
if i <= subdivised_groups.length
|
||||
puts "\n\n\n\n---------------------- -----------"
|
||||
begin
|
||||
puts "submitting #{title}"
|
||||
# puts subdivised_groups[i-1].join("\n")
|
||||
q = submit_query(title,subdivised_groups[i-1].join("\n"),type)
|
||||
puts JSON.parse(q)['id']
|
||||
query_ids << JSON.parse(q)['id']
|
||||
sleep(10)
|
||||
rescue Exception => e
|
||||
puts e.message
|
||||
puts e.backtrace.inspect
|
||||
end
|
||||
i = i + 1
|
||||
else
|
||||
break
|
||||
end
|
||||
i = i + 1
|
||||
else
|
||||
break
|
||||
query_ids
|
||||
end
|
||||
puts "Going to sleep at #{Time.now - @start_time} for #{sleeping_time} s."
|
||||
sleep(sleeping_time)
|
||||
puts "Waking up at #{Time.now - @start_time}"
|
||||
end
|
||||
|
||||
puts "Going to sleep at #{Time.now - @start_time} for #{sleeping_time} s."
|
||||
sleep(sleeping_time)
|
||||
puts "Waking up at #{Time.now - @start_time}"
|
||||
|
||||
|
||||
while i >= initial_nr_of_jobs && i < subdivised_groups.length
|
||||
k = 0
|
||||
for k in (i..(i + initial_nr_of_jobs))
|
||||
for k in (i...(i + initial_nr_of_jobs))
|
||||
title = File.basename(absolute_path).split(".")[0] + "_yannick" + "_part_#{k}"
|
||||
i = i + 1
|
||||
begin
|
||||
puts "submitting #{title}"
|
||||
q = submit_query(title,subdivised_groups[k-1].join("\n"),type)
|
||||
rescue Exception => e
|
||||
puts e.message
|
||||
puts JSON.parse(q)['id']
|
||||
query_ids << JSON.parse(q)['id']
|
||||
sleep(10)
|
||||
rescue Exception => e
|
||||
puts e.message
|
||||
puts e.backtrace.inspect
|
||||
end
|
||||
end
|
||||
|
@ -222,6 +232,7 @@ module ClassyFireAPI
|
|||
sleep(sleeping_time)
|
||||
puts "Waking up at #{Time.now - @start_time}"
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
# Takes each file in a folder, and submits the contained structures in batches of a given size.
|
||||
|
@ -229,13 +240,13 @@ module ClassyFireAPI
|
|||
# For 'STRUCTURE' or 'IUPAC_NAME'query types, each line must contain either
|
||||
# 1) Only a structural representation: SMILES, InChI for the 'STRUCTURE' query_type or an IUPAC name
|
||||
# for the 'IUPAC NAME' query type.
|
||||
# 2) a tab-separated pair of an ID and the corresponding sructure representation: SMILES, InChI for the
|
||||
# 2) a tab-separated pair of an ID and the corresponding sructure representation: SMILES, InChI for the
|
||||
# 'STRUCTURE' query_type or a IUPAC name for the 'IUPAC NAME' query type.
|
||||
#
|
||||
#
|
||||
# For 'FASTA' query type, just submit the query as a standard FASTA text.
|
||||
# @param: input_file [String] The path to the folder.
|
||||
# @param: slice_length [Integer] The maximum number of entries for each query input (each file
|
||||
# is fragmented into n part of #slice_length entries each), 'integer'
|
||||
# @param: slice_length [Integer] The maximum number of entries for each query input (each file
|
||||
# is fragmented into n part of #slice_length entries each), 'integer'
|
||||
# @param type [String] the query_type 'STRUCTURE' (default) or 'IUPAC_NAME' or 'FASTA'
|
||||
def ClassyFireAPI.submit_queries_from_directory(folder,slice_length,type="STRUCTURE")
|
||||
if File.directory?(folder)
|
||||
|
@ -249,7 +260,7 @@ module ClassyFireAPI
|
|||
end
|
||||
|
||||
|
||||
# Reads a tab separated file, and use the structure representation
|
||||
# Reads a tab separated file, and use the structure representation
|
||||
# to retrieve the structure's classification from ClassyFire.
|
||||
#
|
||||
# @param input [String] path to the input file.
|
||||
|
@ -311,7 +322,7 @@ module ClassyFireAPI
|
|||
# res += ","
|
||||
rescue Exception => e
|
||||
e.message
|
||||
end
|
||||
end
|
||||
end
|
||||
res += "]}"
|
||||
f_output.print res
|
||||
|
@ -328,7 +339,7 @@ module ClassyFireAPI
|
|||
h = Hash.new
|
||||
directory = absolute_path.split('/')[0...-1].join("/")
|
||||
f_output = File.new(output, 'w')
|
||||
puts
|
||||
puts
|
||||
res = String.new
|
||||
|
||||
res += "{"
|
||||
|
@ -347,7 +358,7 @@ module ClassyFireAPI
|
|||
puts i
|
||||
# puts "#{key} :: #{h[key]}"
|
||||
begin
|
||||
inchikey = %x(obabel -:"#{h[key]}" -oinchikey).strip.split("\t")[0]
|
||||
inchikey = %x(obabel -:"#{h[key]}" -oinchikey).strip.split("\t")[0]
|
||||
# puts inchikey
|
||||
qr = JSON.parse(ClassyFireAPI.get_entity_classification(inchikey,format="json"))
|
||||
qr['identifier'] = key
|
||||
|
@ -371,7 +382,7 @@ module ClassyFireAPI
|
|||
# res += ","
|
||||
rescue Exception => e
|
||||
e.message
|
||||
end
|
||||
end
|
||||
end
|
||||
res += "]}"
|
||||
f_output.print res
|
||||
|
@ -439,7 +450,7 @@ module ClassyFireAPI
|
|||
end
|
||||
rescue Exception => e
|
||||
e.message
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue