-
Notifications
You must be signed in to change notification settings - Fork 1
c05 apoio, outubro
Peter edited this page Oct 5, 2017
·
2 revisions
Ao usar API do PMC retornou os seguintes casos classificados como "Comment" que eram artigos de fato,
pmcid | ISSN | doctype | date2 |
---|---|---|---|
27733281 | 0140-6736 | Comment | 2016-10-08 |
27733282 | 0140-6736 | Comment | 2016-10-08 |
27733283 | 0140-6736 | Comment | 2016-10-08 |
CREATE TABLE amostra1 (
pmcid int,
issn text,
doctype text,
date2 text
);
COPY amostra1 FROM '/tmp/c05_articles_pmid-EXPANDED.csv' WITH (FORMAT csv) HEADER;
DELETE FROM amostra1 WHERE pmcid is null OR pmcid=0; --2
SELECT count(issn) as n_issn, count(*) as n, count(distinct pmcid) from amostra1; -- 531 | 1458 | 1442
CREATE TABLE amostra1_limpo AS
SELECT DISTINCT pmcid, issn, doctype, date2::date as date2
FROM amostra1
WHERE doctype NOT IN ('Editorial');
select * from amostra1_limpo where doctype!='Journal Article' order by doctype, pmcid;
Casos que não eram Journal Article:
pmcid | doctype | date2 |
---|---|---|
27028323 | Clinical Trial | 2016-03-30 |
27242166 | Clinical Trial | 2016-06-27 |
27509020 | Clinical Trial | 2016-08-10 |
27820821 | Clinical Trial | 2016-11-07 |
27887591 | Clinical Trial | 2016-11-25 |
27733281 | Comment | 2016-10-08 |
27733282 | Comment | 2016-10-08 |
27733283 | Comment | 2016-10-08 |
28090535 | Editorial | 2016-01-01 |
23410000 | Evaluation Studies | 2013-02-14 |
23809208 | Evaluation Studies | 2013-06-28 |
23967358 | Evaluation Studies | 2013-08-15 |
24148652 | Evaluation Studies | 2013-10-22 |
24603874 | Evaluation Studies | 2014-03-06 |
26641858 | Evaluation Studies | 2015-12-07 |
26910315 | Evaluation Studies | 2016-02-24 |
26962084 | Evaluation Studies | 2016-05-01 |
26986061 | Evaluation Studies | 2016-03-17 |
27102015 | Evaluation Studies | 2016-04-21 |
27314656 | Evaluation Studies | 2016-05-01 |
27532497 | Evaluation Studies | 2016-08-17 |
27549586 | Evaluation Studies | 2016-10-15 |
27618941 | Evaluation Studies | 2016-09-13 |
27768689 | Evaluation Studies | 2016-10-21 |
27768698 | Evaluation Studies | 2016-10-21 |
28116530 | Evaluation Studies | 2017-03-01 |
28153043 | Evaluation Studies | 2017-02-02 |
28183291 | Evaluation Studies | 2017-02-10 |
28183328 | Evaluation Studies | 2017-02-10 |
28339458 | Evaluation Studies | 2017-03-24 |
28542603 | Evaluation Studies | 2017-05-25 |
28651016 | Evaluation Studies | 2017-06-26 |
28719659 | Evaluation Studies | 2017-07-18 |
27564984 | Historical Article | 2016-09-01 |
27708473 | Historical Article | 2016-09-01 |
28285851 | Historical Article | 2017-06-01 |
28727821 | Historical Article | 2017-07-20 |
..37 rows | ... | ... |
CREATE TABLE doaj_info AS
-- conferir porque não está completo, ver http://miar.ub.edu/issn/0021-9584 ou http://www.issn.cc/1932-6203
-- ambos tem URLs da revista
SELECT CASE when is1 is not null THEN issn.n2c(is1) ELSE issn.n2c(is2) END as issnl,
pup_country, license, lic_attribs, license_ismachine
FROM (
SELECT "Journal ISSN (print version)" as is1, "Journal EISSN (online version)" as is2,
"Country of publisher" as pup_country, "Machine-readable CC licensing information embedded or displayed" as license_ismachine,
"Journal license" as license, "License attributes" as lic_attribs
FROM tmpcsv_doaj
) t
;
Ver https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=28727821
- Richness R = 310 ISSNs
- Richness R = 123 ISSNs filtering with
select issn, count(*) from amostra1_limpo group by 1 having count(*)>1 order by 2 desc
Simpson Diversity-dominance Index
Simpson's Index (D) measures the probability that two individuals randomly selected from a sample will belong to the same species
select sum( (count(issn)::float/(select count(*) from amostra1_limpo)::float)^2 ) as "D"
from amostra1_limpo group by issn
;
select count(*) from amostra1_limpo group by 1 ;
-- P=N::float/count(*)::float
VACUUM FULL;
-- 1.3*N/TOT ... limit N;
-- precisa retornar mesmo tipo
CREATE or replace FUNCTION amostra1_limpo_get(bigint) RETURNS SETOF amostra1_limpo AS $$
SELECT *
FROM amostra1_limpo
TABLESAMPLE BERNOULLI( ROUND(120.0*$1::float/(select count(*) from amostra1_limpo)::float) ) LIMIT $1;
$$ LANGUAGE SQL;
-- opcao dinamica. RETURN QUERY EXECUTE format(
-- 'SELECT * FROM %I ... TABLESAMPLE BERNOULLI((select count(*) from %I)) ',
-- tname, 120, tname,
-- );
-- simpson2
WITH t AS (
SELECT count(issn) as n, count(issn)-1 as n1
FROM amostra1_limpo
GROUP BY issn --HAVING count(*)>1
) SELECT (SELECT sum(n*n1) from t)::float/(SELECT sum(n)*sum(n1) from t)::float as "D";
D = 0.053, ou seja 5,3% de chance de dois artigos aleatórios terem mesmo ISSN. (com count>10 sobre para 9%) considerando tudo temos 4,6% de chance.
Conceito em https://en.wikipedia.org/wiki/Species_evenness
WITH qt AS (
SELECT count(issn) as n
FROM amostra1_limpo
GROUP BY issn -- HAVING count(*)>1
) SELECT -sum( p*ln(p) ) / ln((SELECT sum(n) FROM qt)::float) as "E"
FROM (
SELECT n::float/(SELECT sum(n) FROM qt)::float as p
FROM qt
) t
; -- 0.54 com count, 0.6 = 60%
-- distribuição por ano:
select count(*) as n, count(distinct issn) as is, substr(date2::text,0,5)::int as ano from amostra1_limpo group by 3 order by 3;